From 1b298ba0bc4c37f09e62ddc646560e1a28aafd94 Mon Sep 17 00:00:00 2001 From: rickyota <22293266+rickyota@users.noreply.github.com> Date: Thu, 20 Jul 2023 13:30:17 +0900 Subject: [PATCH 1/6] add: docker, singularity --- README.md | 30 +++++++++-------- docker/Dockerfile | 30 +++++++++-------- docker/genoboost.def | 6 ++++ genoboost.docker.cv.sh | 73 ------------------------------------------ genoboost.docker.sh | 41 +++++++++++------------- genoboost.sh | 6 ++-- 6 files changed, 59 insertions(+), 127 deletions(-) create mode 100644 docker/genoboost.def delete mode 100644 genoboost.docker.cv.sh diff --git a/README.md b/README.md index 7749fd1..5824507 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# GenoBoost v0.4.0 +# GenoBoost v0.4.1 [![GenoBoost](https://github.com/rickyota/genoboost/actions/workflows/genoboost.yml/badge.svg)](https://github.com/rickyota/genoboost/actions/workflows/genoboost.yml) [![Release](https://github.com/rickyota/genoboost/actions/workflows/publish.yml/badge.svg)](https://github.com/rickyota/genoboost/actions/workflows/publish.yml) @@ -288,30 +288,32 @@ $ genoboost score \ ## Advanced Guide -### Docker - Using docker or singularity is recommended. -Run GenoBoost on an example dataset in `./test/data/1kg_n10000` (1000 samples x 10000 SNVs). 
+### Docker ```bash -$ docker run -td \ - -v "$(pwd)/test/data/1kg_n10000":/work/data:ro -v "$(pwd)/result":/work/result \ - rickyota/genoboost:latest \ - bash ./genoboost.docker.cv.sh +$ docker pull rickyota/genoboost:latest \ +$ docker run -it rickyota/genoboost:latest \ + train \ + --dir ./result \ + --file-genot ./example/genot \ + --file-phe ./example/genot.cov \ + --cov age,sex ``` ### Singularity ```bash -$ singularity build geno.sif docker://rickyota/genoboost:latest -$ singularity exec \ - --bind "$(pwd)/test/data/1kg_n10000":/work/data,"$(pwd)/result":/work/result \ - --no-home --pwd /opt/genoboost geno.sif \ - bash ./genoboost.docker.cv.sh +$ singularity build genoboost.sif ./docker/genoboost.def +$ singularity run genoboost.sif \ + train \ + --dir ./result \ + --file-genot ./example/genot \ + --file-phe ./example/genot.cov \ + --cov age,sex ``` -Result files are now in `./result/` . [release]: https://github.com/rickyota/genoboost/releases diff --git a/docker/Dockerfile b/docker/Dockerfile index e6ad4de..dc893b9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,19 +1,23 @@ -FROM --platform=linux/amd64 rust:1.63 AS builder - -WORKDIR /opt/genoboost -# TODO: install from github url -COPY ../ . -# export RUST_BACKTRACE=full -RUN cargo build --release -p boosting_rust &&\ - cp ./target/release/boosting_rust ./genoboost +FROM --platform=linux/amd64 rust:1.68 AS builder - -FROM --platform=linux/amd64 continuumio/miniconda3 AS runner +RUN apt-get update && + apt-get install -y --no-install-recommends \ + clang WORKDIR /opt/genoboost -# TODO: copy only necessary files -COPY --from=builder /opt/genoboost ./ -RUN conda env create --force -n genoboost -f ./etc/env.yml +COPY ../ . 
+#export RUSTFLAGS='-C target-cpu=native' +RUN cargo build \ + --release \ + --manifest-path ./projects_rust/Cargo.toml \ + --bin genoboost +ENTRYPOINT ["./projects_rust/target/genoboost"] +#FROM --platform=linux/amd64 debian:buster-slim AS runner +# +#WORKDIR /opt/genoboost +## TODO: copy only necessary files +#COPY --from=builder /opt/genoboost/projects_rust/target/release/genoboost ./ +#CMD ["./genoboost"] diff --git a/docker/genoboost.def b/docker/genoboost.def new file mode 100644 index 0000000..42369bf --- /dev/null +++ b/docker/genoboost.def @@ -0,0 +1,6 @@ +Bootstrap: docker +From: rickyota/genoboost:latest + +%runscript + /opt/genoboost/genoboost "$@" + diff --git a/genoboost.docker.cv.sh b/genoboost.docker.cv.sh deleted file mode 100644 index 6717045..0000000 --- a/genoboost.docker.cv.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash -# -# GenoBoost 5-fold cross-validation - -set -eux - -# mount to this path -dir_data="/work/data/" -dir_result="/work/result/" -# output directory -dir="${dir_result}" -# prefix of plink1 file -file_plink="${dir_data}genot" -# covariate file -file_cov="${dir_data}genot.cov" -# learning rate parameters -learning_rates="0.1 0.5" - -# output directory of samples -dir_sample="${dir}samples/" -# output directory of cross-validation -dir_cv="${dir}cross_validation/" - -# create cv dataset -mkdir -p "$dir_sample" -eval "$(conda shell.bash hook)" -conda activate genoboost -python -m projects.genetics_py.src.dataset \ - --cross_validation \ - --cross_validation_n 5 \ - --dout "${dir_sample}" \ - --fplink "$file_plink" - -# train -for cvi in {0..4}; do - dir_wgt_cv="${dir_cv}tr.cv${cvi}/" - fin_sample="${dir_sample}tr.cv${cvi}.samples" - ./genoboost train \ - --dir "$dir_wgt_cv" \ - --file_plink "$file_plink" \ - --file_cov "$file_cov" \ - --file_sample "$fin_sample" \ - --learning_rates $learning_rates \ - --iter 100 \ - --clip_sample_weight "top0.1" \ - --prune_snv 0.1 -done - -# score -for cvi in {0..4}; do - 
dir_wgt_cv="${dir_cv}tr.cv${cvi}/" - dir_score_cv="${dir_cv}va.cv${cvi}/" - fin_sample="${dir_sample}va.cv${cvi}.samples" - ./genoboost score \ - --dir_score "$dir_score_cv" \ - --iters 10 30 50 100 \ - --file_plink "$file_plink" \ - --file_cov "$file_cov" \ - --file_sample "$fin_sample" \ - --dir_wgt "$dir_wgt_cv" \ - --learning_rates $learning_rates - - dir_score_cv="${dir_cv}ts.cv${cvi}/" - fin_sample="${dir_sample}test.samples" - ./genoboost score \ - --dir_score "$dir_score_cv" \ - --iters 10 30 50 100 \ - --file_plink "$file_plink" \ - --file_cov "$file_cov" \ - --file_sample "$fin_sample" \ - --dir_wgt "$dir_wgt_cv" \ - --learning_rates $learning_rates -done diff --git a/genoboost.docker.sh b/genoboost.docker.sh index 97e18a5..05c4921 100644 --- a/genoboost.docker.sh +++ b/genoboost.docker.sh @@ -4,35 +4,30 @@ set -eux -# mount to this path -dir_data="/work/data/" -dir_result="/work/result/" # output directory of training -dir_wgt="${dir_result}train/" +dir_wgt="./result/train/" # output directory of score -dir_score="${dir_result}score/" +dir_score="./result/score/" # prefix of plink1 file -file_plink="${dir_data}genot" +file_plink="./test/data/1kg_maf0.1_m1k/genot" # covariate file -file_cov="${dir_data}genot.cov" -# learning rate parameters -learning_rates="0.1 0.5" +file_cov="./test/data/1kg_maf0.1_m1k/genot.cov" + +function genoboost-docker() { + docker run -it rickyota/genoboost:latest "$@" +} # train -./genoboost train \ +./genoboost-docker train \ --dir "$dir_wgt" \ - --file_plink "$file_plink" \ - --file_cov "$file_cov" \ - --learning_rates $learning_rates \ - --iter 100 \ - --clip_sample_weight "top0.1" \ - --prune_snv 0.1 + --file-genot "$file_plink" \ + --file-phe "$file_cov" \ + --cov age,sex # score -./genoboost score \ - --dir_score "$dir_score" \ - --iters 10 30 50 100 \ - --file_plink "$file_plink" \ - --file_cov "$file_cov" \ - --dir_wgt "$dir_wgt" \ - --learning_rates $learning_rates +./genoboost-docker score \ + --dir-score "$dir_score" \ 
+ --dir-wgt "$dir_wgt" \ + --file-genot "$file_plink" \ + --file-phe "$file_cov" \ + --cov age,sex diff --git a/genoboost.sh b/genoboost.sh index 6c978f6..f3c95c2 100644 --- a/genoboost.sh +++ b/genoboost.sh @@ -25,8 +25,7 @@ cp ./projects_rust/target/release/genoboost ./genoboost --dir "$dir_wgt" \ --file-genot "$file_plink" \ --file-phe "$file_cov" \ - --cov age,sex \ - --cross-validation 1 + --cov age,sex # score ./genoboost score \ @@ -34,5 +33,4 @@ cp ./projects_rust/target/release/genoboost ./genoboost --dir-wgt "$dir_wgt" \ --file-genot "$file_plink" \ --file-phe "$file_cov" \ - --cov age,sex \ - --cross-validation 1 + --cov age,sex From 21296158e1ac0b3a2a99b1ebe6808e65ab38e095 Mon Sep 17 00:00:00 2001 From: rickyota <22293266+rickyota@users.noreply.github.com> Date: Sun, 8 Oct 2023 13:03:28 +0900 Subject: [PATCH 2/6] add: README --- README.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 5824507..bcf6ff2 100644 --- a/README.md +++ b/README.md @@ -38,8 +38,9 @@ $ genoboost train \ - [Cross-validation](#score-cv) - [Options for Score](#score-option) - [Advanced Guide](#advanced-guide) - - [Docker](#docker) - - [Singularity](#singularity) + - [Installation](#advanced-install) + - [Docker](#docker) + - [Singularity](#singularity) ## Introduction @@ -79,7 +80,7 @@ cargo build --manifest-path ./projects_rust/Cargo.toml --release --bin genoboost cp ./projects_rust/target/release/genoboost ./genoboost ``` -and you can use `genoboost` program. +and you can use `genoboost` program. This should take less than 5 minutes. #### Advanced Install @@ -288,9 +289,10 @@ $ genoboost score \ ## Advanced Guide +### Advanced Installation Using docker or singularity is recommended. 
-### Docker +#### Docker ```bash $ docker pull rickyota/genoboost:latest \ @@ -302,7 +304,7 @@ $ docker run -it rickyota/genoboost:latest \ --cov age,sex ``` -### Singularity +#### Singularity ```bash $ singularity build genoboost.sif ./docker/genoboost.def @@ -314,6 +316,9 @@ $ singularity run genoboost.sif \ --cov age,sex ``` +### Computational Time + +For ~216 thousand training samples and ~1.1 million SNVs, GenoBoost would take 10 hours for 10,000 unique SNVs. [release]: https://github.com/rickyota/genoboost/releases From ff6714d7cda82a07a2775349d8a1fde599ffdfbd Mon Sep 17 00:00:00 2001 From: rickyota <22293266+rickyota@users.noreply.github.com> Date: Sun, 8 Oct 2023 13:27:31 +0900 Subject: [PATCH 3/6] add: major-a2-train --- README.md | 2 +- create.publish.sh | 8 +- genoboost.cv.sh | 9 +- genoboost.docker.sh | 4 +- genoboost.sh | 5 +- projects_rust/Cargo.toml | 10 - projects_rust/boosting/benches/common.rs | 4 +- .../boosting/benches/loss_criterion.rs | 8 +- .../boosting/src/bin/boosting_res.rs | 35 +- projects_rust/boosting/src/bin/genoboost.rs | 27 +- .../boosting/src/bin_old/boosting_research.rs | 16 +- .../boosting/src/bin_old/genoboost.rs | 5 +- projects_rust/boosting/src/boosting_param.rs | 36 +- projects_rust/boosting/src/boosting_score.rs | 49 ++- .../boosting/src/boosting_score/io.rs | 219 ++++++---- .../boosting/src/boosting_score/run_scores.rs | 23 +- .../boosting/src/boosting_score/score.rs | 121 +++++- projects_rust/boosting/src/boosting_train.rs | 83 +++- .../src/boosting_train/coefficient.rs | 398 +++++++++++++----- .../boosting_train/coefficient/adjust_coef.rs | 37 +- .../src/boosting_train/coefficient/calc.rs | 31 ++ .../boosting/src/boosting_train/loss.rs | 150 +++---- .../boosting/src/boosting_train/loss/calc.rs | 364 ++++++++-------- .../src/boosting_train/regression_cov.rs | 3 - .../boosting/src/boosting_train/table.rs | 38 +- projects_rust/boosting/src/lib.rs | 109 +++-- projects_rust/boosting/src/wgt_boost/io.rs | 4 +- 
projects_rust/boosting/src/wgt_boosts.rs | 14 +- projects_rust/genetics/Cargo.toml | 1 - projects_rust/genetics/benches/common.rs | 8 +- projects_rust/genetics/src/alloc.rs | 5 + projects_rust/genetics/src/bin/genetics.rs | 28 +- .../genetics/src/bin/genetics_res.rs | 50 ++- .../genetics/src/bin/test_pgenlib.rs | 137 ++++++ projects_rust/genetics/src/cov.rs | 24 +- projects_rust/genetics/src/dataset.rs | 284 +++++++++---- .../genetics/src/dataset/io_genot.rs | 152 ++++++- .../genetics/src/dataset/io_genot/load.rs | 249 ++++++----- .../src/dataset/io_genot/load/plink.rs | 78 ++-- .../src/dataset/io_genot/load/plink2.rs | 379 ++++++++++++++--- .../src/dataset/io_genot/load_score.rs | 120 ++++-- projects_rust/genetics/src/dataset/samples.rs | 54 ++- .../genetics/src/dataset/samples/covs.rs | 5 +- projects_rust/genetics/src/dataset/snvs.rs | 6 +- .../genetics/src/genot/base_genot.rs | 72 +++- .../genetics/src/genot/genot_struct.rs | 64 ++- projects_rust/genetics/src/lib.rs | 308 ++++++++++++-- projects_rust/genetics/src/regression.rs | 205 +++++---- projects_rust/genetics/src/sample/io.rs | 36 +- projects_rust/genetics/src/score.rs | 264 +++++++++--- projects_rust/genetics/src/snv/snv_index.rs | 73 +++- projects_rust/genetics/src/textfile/text.rs | 1 + projects_rust/genetics/src/wgt/coef.rs | 9 + projects_rust/genetics/src/wgt/io.rs | 69 ++- projects_rust/genetics/src/wgts.rs | 15 +- 55 files changed, 3224 insertions(+), 1284 deletions(-) create mode 100644 projects_rust/genetics/src/bin/test_pgenlib.rs diff --git a/README.md b/README.md index c242669..16fd7c9 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# GenoBoost v0.4.1 +# GenoBoost v1.0.0 [![GenoBoost](https://github.com/rickyota/genoboost/actions/workflows/genoboost.yml/badge.svg)](https://github.com/rickyota/genoboost/actions/workflows/genoboost.yml) 
[![Release](https://github.com/rickyota/genoboost/actions/workflows/publish.yml/badge.svg)](https://github.com/rickyota/genoboost/actions/workflows/publish.yml) diff --git a/create.publish.sh b/create.publish.sh index ef622ea..270d964 100644 --- a/create.publish.sh +++ b/create.publish.sh @@ -16,7 +16,7 @@ d_publish="./${artifact_name}/" cargo build \ --release \ - --target=${target} \ + --target=${target} \ --manifest-path ./projects_rust/Cargo.toml \ --no-default-features \ --bin genoboost @@ -27,15 +27,14 @@ cargo build \ mkdir -p ${d_publish} if [[ ${target} == *"windows"* ]]; then - cp ./projects_rust/target/${target}/release/genoboost.exe ${d_publish}/ + cp ./projects_rust/target/${target}/release/genoboost.exe ${d_publish}/ else - cp ./projects_rust/target/${target}/release/genoboost ${d_publish}/ + cp ./projects_rust/target/${target}/release/genoboost ${d_publish}/ fi mkdir -p ${d_publish}/example/ cp ./example/* ${d_publish}/example/ - zip -r ./${artifact_name}.zip ${d_publish} #if [[ ${target} == *"windows"* ]]; then @@ -43,4 +42,3 @@ zip -r ./${artifact_name}.zip ${d_publish} #else # zip -r ./${artifact_name}.zip ${d_publish} #fi - diff --git a/genoboost.cv.sh b/genoboost.cv.sh index 0ba94fe..d307636 100644 --- a/genoboost.cv.sh +++ b/genoboost.cv.sh @@ -12,7 +12,6 @@ file_plink="./test/data/1kg_maf0.1_m1k/genot" # covariate file file_cov="./test/data/1kg_maf0.1_m1k/genot.cov" - # compile export RUST_BACKTRACE=full cargo build --manifest-path ./projects_rust/Cargo.toml --release --bin genoboost @@ -24,14 +23,14 @@ cp ./projects_rust/target/release/genoboost ./genoboost --file-genot "$file_plink" \ --file-phe "$file_cov" \ --cov age,sex \ - --cross-validation 5 + --cross-validation 5 \ + --major_a2_train # score ./genoboost score \ --dir-score "${dir}/score" \ - --dir-wgt "${dir}/train" \ + --dir-wgt "${dir}/train" \ --file-genot "$file_plink" \ --file-phe "$file_cov" \ - --cov age,sex \ + --cov age,sex \ --cross-validation 5 - diff --git 
a/genoboost.docker.sh b/genoboost.docker.sh index 05c4921..b236f51 100644 --- a/genoboost.docker.sh +++ b/genoboost.docker.sh @@ -22,7 +22,7 @@ function genoboost-docker() { --dir "$dir_wgt" \ --file-genot "$file_plink" \ --file-phe "$file_cov" \ - --cov age,sex + --cov age,sex # score ./genoboost-docker score \ @@ -30,4 +30,4 @@ function genoboost-docker() { --dir-wgt "$dir_wgt" \ --file-genot "$file_plink" \ --file-phe "$file_cov" \ - --cov age,sex + --cov age,sex diff --git a/genoboost.sh b/genoboost.sh index f3c95c2..84fa5f9 100644 --- a/genoboost.sh +++ b/genoboost.sh @@ -25,7 +25,8 @@ cp ./projects_rust/target/release/genoboost ./genoboost --dir "$dir_wgt" \ --file-genot "$file_plink" \ --file-phe "$file_cov" \ - --cov age,sex + --cov age,sex \ + --major_a2_train # score ./genoboost score \ @@ -33,4 +34,4 @@ cp ./projects_rust/target/release/genoboost ./genoboost --dir-wgt "$dir_wgt" \ --file-genot "$file_plink" \ --file-phe "$file_cov" \ - --cov age,sex + --cov age,sex diff --git a/projects_rust/Cargo.toml b/projects_rust/Cargo.toml index 067eec2..b9e3ae4 100644 --- a/projects_rust/Cargo.toml +++ b/projects_rust/Cargo.toml @@ -1,16 +1,6 @@ [workspace] - - members=[ "boosting", "genetics", "cmatrix", - #"playground", - #"test_pgenlib", - #"test_rust", - #"test_pyo3", ] - - - - diff --git a/projects_rust/boosting/benches/common.rs b/projects_rust/boosting/benches/common.rs index 426731c..4621a5a 100644 --- a/projects_rust/boosting/benches/common.rs +++ b/projects_rust/boosting/benches/common.rs @@ -13,12 +13,12 @@ fn setup_vars( let sample_buf = fin_sample.map(|x| genetics::textfile::read_file_to_end(x, None).unwrap()); //) -> (Dataset, Vec, LossStruct) { - let dataset: Dataset = Dataset::new( + let dataset: Dataset = Dataset::new_boost_training( fin, GenotFormat::Plink1, None, None, - "", + None, snv_buf.as_deref(), sample_buf.as_deref(), //fin_snv, diff --git a/projects_rust/boosting/benches/loss_criterion.rs 
b/projects_rust/boosting/benches/loss_criterion.rs index 3d15e70..6e15b02 100644 --- a/projects_rust/boosting/benches/loss_criterion.rs +++ b/projects_rust/boosting/benches/loss_criterion.rs @@ -95,7 +95,7 @@ unsafe fn bench_calculate_loss_gt_comp(c: &mut Criterion) { &mut losss, &dataset.genot(), &sample_weight, - &dataset.samples().phe(), + &dataset.samples().phe_unwrap(), BoostParam::new_type1(), &HashSet::new(), ) @@ -108,7 +108,7 @@ unsafe fn bench_calculate_loss_gt_comp(c: &mut Criterion) { &mut losss, &dataset.genot(), &sample_weight, - &dataset.samples().phe(), + &dataset.samples().phe_unwrap(), BoostParam::new_type1(), ) }) @@ -124,7 +124,7 @@ unsafe fn bench_calculate_loss_gt_comp(c: &mut Criterion) { &mut losss, &dataset.genot(), &sample_weight, - &dataset.samples().phe(), + &dataset.samples().phe_unwrap(), BoostParam::new_type1(), &HashSet::new(), ) @@ -137,7 +137,7 @@ unsafe fn bench_calculate_loss_gt_comp(c: &mut Criterion) { &mut losss, &dataset.genot(), &sample_weight, - &dataset.samples().phe(), + &dataset.samples().phe_unwrap(), BoostParam::new_type1(), ) }) diff --git a/projects_rust/boosting/src/bin/boosting_res.rs b/projects_rust/boosting/src/bin/boosting_res.rs index ab05edc..da3497f 100644 --- a/projects_rust/boosting/src/bin/boosting_res.rs +++ b/projects_rust/boosting/src/bin/boosting_res.rs @@ -49,8 +49,6 @@ struct Cli { threads: Option, #[arg(long, global = true, help = "Verbose")] verbose: bool, - #[arg(long, global = true, help = "ccccccc")] - cdef: bool, } #[derive(Debug, Subcommand)] @@ -86,7 +84,7 @@ struct TrainArgs { phe: Option, // parse later #[arg(long)] - cov: String, + cov: Option, //#[arg(long)] //file_cov: Option, #[arg(long)] @@ -113,6 +111,11 @@ struct TrainArgs { //use_adjloss: bool, //#[arg(long)] //use_const_for_loss: bool, + #[arg( + long, + help = "Set major allele in training dataset as a2 allele. Otherwise, set ref allele as a2 allele." 
+ )] + major_a2_train: bool, #[arg(long)] resume: bool, #[arg(long)] @@ -136,19 +139,22 @@ struct ScoreArgs { file_sample: Option, #[arg(long)] file_phe: Option, + //#[arg(long)] + //phe: Option, #[arg(long)] - phe: Option, - #[arg(long)] - cov: String, + cov: Option, //#[arg(long)] //file_cov: Option, // if indicated, do not use para_best and calc score of all paras - #[arg(long)] + #[arg(long, value_parser, num_args = 1.., value_delimiter = ' ')] iters: Option>, - #[arg(long)] + #[arg(long, value_parser, num_args = 1.., value_delimiter = ' ')] learning_rates: Option>, #[arg(long)] use_iter: bool, + // TMP: to remove + #[arg(long)] + use_snv_pos: bool, } #[derive(Copy, Clone, PartialEq, Eq, Debug, ValueEnum)] @@ -181,6 +187,7 @@ impl GenotFormatArg { fn main() { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + // or use _enabled! if is_x86_feature_detected!("avx2") { log::info!("Able to use SIMD.") } else { @@ -275,6 +282,8 @@ fn main() { log::info!("file_sample {:?}", fin_sample); log::info!("boost_params {:?}", boost_params); + let make_major_a2_train = args.major_a2_train; + let use_adjloss = true; //let use_adjloss = args.use_adjloss; let use_const_for_loss = false; @@ -287,7 +296,7 @@ fn main() { genot_format, fin_phe.as_deref(), phe_name.as_deref(), - &cov_name, + cov_name.as_deref(), boost_method, &boost_params, fin_snv.as_deref(), @@ -301,6 +310,7 @@ fn main() { None, //prune_snv, //&learning_rates, is_monitor, + make_major_a2_train, ); } Commands::Score(args) => { @@ -308,7 +318,7 @@ fn main() { let fin = PathBuf::from(args.file_genot); let genot_format = args.genot_format.to_naive(); let fin_phe = args.file_phe.map(|x| PathBuf::from(x)); - let phe_name = args.phe; + //let phe_name = args.phe; let cov_name = args.cov; let fin_sample = args.file_sample.map(|x| PathBuf::from(x)); //let fin_cov = args.file_cov.map(|x| PathBuf::from(x)); @@ -345,8 +355,8 @@ fn main() { genot_format, phe_buf.as_deref(), //fin_phe.as_deref(), - phe_name.as_deref(), 
- Some(&cov_name), + //phe_name.as_deref(), + cov_name.as_deref(), is_every_para, iterations.as_deref(), dout_wgt.as_deref(), // use enum? @@ -357,6 +367,7 @@ fn main() { //boost_param, &learning_rates, use_iter, + args.use_snv_pos, ); } } diff --git a/projects_rust/boosting/src/bin/genoboost.rs b/projects_rust/boosting/src/bin/genoboost.rs index 85c0b6e..99c4de0 100644 --- a/projects_rust/boosting/src/bin/genoboost.rs +++ b/projects_rust/boosting/src/bin/genoboost.rs @@ -87,7 +87,7 @@ struct TrainArgs { phe: Option, // parse later #[arg(long)] - cov: String, + cov: Option, //#[arg(long)] //file_cov: Option, #[arg(long)] @@ -115,6 +115,11 @@ struct TrainArgs { resume: bool, #[arg(long)] write_loss: bool, + #[arg( + long, + help = "Set major allele in training dataset as a2 allele. Otherwise, set ref allele as a2 allele." + )] + major_a2_train: bool, // --integrate-only //#[arg(long, default_value_t = true)] //integrate: bool, @@ -147,11 +152,11 @@ struct ScoreArgs { file_sample: Option, #[arg(long)] file_phe: Option, - #[arg(long)] - phe: Option, + //#[arg(long)] + //phe: Option, // TODO: remove --cov and read from wgt? 
#[arg(long)] - cov: String, + cov: Option, //#[arg(long)] //file_cov: Option, // if indicated, do not use para_best and calc score of all paras @@ -309,7 +314,7 @@ fn main() { } else if args.iter_snv.is_some() { boost_params.set_iteration_snv(args.iter_snv.unwrap()) } else { - if args.train_only{ + if args.train_only { panic!("You have to use --iter-snv or --iter with --train-only"); } // else: integrate @@ -328,6 +333,8 @@ fn main() { log::info!("file_sample {:?}", fin_sample); log::info!("boost_params {:?}", boost_params); + let make_major_a2_train = args.major_a2_train; + let use_adjloss = true; //let use_adjloss = args.use_adjloss; let use_const_for_loss = false; @@ -347,7 +354,7 @@ fn main() { genot_format, fin_phe.as_deref(), phe_name.as_deref(), - &cov_name, + cov_name.as_deref(), boost_method, &boost_params, fin_snv.as_deref(), @@ -361,6 +368,7 @@ fn main() { None, //prune_snv, //&learning_rates, is_monitor, + make_major_a2_train, cross_vali, seed, ); @@ -370,7 +378,7 @@ fn main() { let fin = PathBuf::from(args.file_genot); let genot_format = args.genot_format.to_naive(); let fin_phe = args.file_phe.map(|x| PathBuf::from(x)); - let phe_name = args.phe; + //let phe_name = args.phe; let cov_name = args.cov; let fin_sample = args.file_sample.map(|x| PathBuf::from(x)); //let fin_cov = args.file_cov.map(|x| PathBuf::from(x)); @@ -407,8 +415,8 @@ fn main() { &fin, genot_format, fin_phe.as_deref(), - phe_name.as_deref(), - Some(&cov_name), + //phe_name.as_deref(), + cov_name.as_deref(), is_every_para, iterations.as_deref(), dout_wgt.as_deref(), // use enum? @@ -419,6 +427,7 @@ fn main() { &learning_rates, use_iter, cross_vali, + false, ); } } diff --git a/projects_rust/boosting/src/bin_old/boosting_research.rs b/projects_rust/boosting/src/bin_old/boosting_research.rs index 9780e64..aefc802 100644 --- a/projects_rust/boosting/src/bin_old/boosting_research.rs +++ b/projects_rust/boosting/src/bin_old/boosting_research.rs @@ -1,11 +1,11 @@ //! 
Application of **Genoboost**. //! Input plink file to run Genoboost. -//! +//! //! Logitnomissing //! When the denominator of s2 is 0.0, (no eps and no samples for minor homozygotes), s2 is set the same as s1 (=dominant model). //! When the denominator of s1, s0 is 0.0, (no samples for major homozygotes or heterozygotes), s0, s1 is set to 0.0 -//! -//! +//! +//! // TODO: ensure the same para when resuming // TODO: (optional) write down extract snvs from top // TODO: how to get memory? @@ -189,7 +189,7 @@ fn main() { let fin_sample_val = matches .value_of("file_sample_val") .map(|x| PathBuf::from(x)); - let is_monitor=fin_sample_val.is_some(); + let is_monitor = fin_sample_val.is_some(); //let boost_type = matches.value_of("boost_type").unwrap(); let learning_rates: Vec> = matches @@ -202,7 +202,6 @@ fn main() { let is_write_loss = matches.is_present("write_loss"); //let is_write_loss = true; - // make this Option<> here? or in Boostingparam? // should be here, or there should be option to make Option<> as default let clip_sample_weight = match matches.value_of("clip_sample_weight").unwrap() { @@ -288,7 +287,7 @@ fn main() { boost_param.check(); - log::debug!("boost_param {:?}",boost_param); + log::debug!("boost_param {:?}", boost_param); /* // set learning rate later let boost_param = BoostParam::new_str( @@ -387,8 +386,9 @@ fn main() { .values_of("iters") .unwrap() .map(|s| { - s.parse::() - .expect("Iters should be able to be parsed to non-negative integer") + s.parse::().unwrap_or_else(|| { + panic!("Iters should be able to be parsed to non-negative integer") + }) }) .collect(); // sort and deduplication diff --git a/projects_rust/boosting/src/bin_old/genoboost.rs b/projects_rust/boosting/src/bin_old/genoboost.rs index 7bcd6b8..c405962 100644 --- a/projects_rust/boosting/src/bin_old/genoboost.rs +++ b/projects_rust/boosting/src/bin_old/genoboost.rs @@ -355,8 +355,9 @@ fn main() { .values_of("iters") .unwrap() .map(|s| { - s.parse::() - .expect("Iters should be 
able to be parsed to non-negative integer") + s.parse::().unwrap_or_else(|| { + panic!("Iters should be able to be parsed to non-negative integer") + }) }) .collect(); // sort and deduplication diff --git a/projects_rust/boosting/src/boosting_param.rs b/projects_rust/boosting/src/boosting_param.rs index 96c93a9..4d084ac 100644 --- a/projects_rust/boosting/src/boosting_param.rs +++ b/projects_rust/boosting/src/boosting_param.rs @@ -260,14 +260,15 @@ impl BoostParams { panic!("Cannot assign is_dom_and_rec in freemodelmissing") } - if self.boost_type().is_type_ada() & self.eps().is_none() { - panic!("Cannot assign Eps::None in freemodelmissing") - } + // use effeps instead + //if self.boost_type().is_type_ada() & self.eps().is_none() { + // panic!("Cannot assign Eps::None in freemodelmissing") + //} - if self.boost_type().is_logit() & self.sample_weight_clip().is_none() { - log::info!("WARNING: Cannot assign SampleWeightClip::None in Logit") - //panic!("Cannot assign SampleWeightClip::None in Logit") - } + //if self.boost_type().is_logit() & self.sample_weight_clip().is_none() { + // log::info!("WARNING: Cannot assign SampleWeightClip::None in Logit") + // //panic!("Cannot assign SampleWeightClip::None in Logit") + //} if self.boost_type().is_type_logit() & !self.cov_way().unwrap().is_first() { // since now ps is not renewed for iteration @@ -284,7 +285,6 @@ impl BoostParams { //} } - // or itegrate with set_iteration_snv? 
pub fn set_iteration(self, iteration: usize) -> Self { Self { @@ -665,14 +665,26 @@ impl BoostType { } } + // FIXME: remove // allow missing value in dataset - pub fn use_missing(self) -> bool { + //pub fn use_missing(self) -> bool { + // match self { + // BoostType::Ada + // | BoostType::ConstAda + // | BoostType::LogitAdd + // | BoostType::LogitNoMissing => false, + // BoostType::FreeModelMissing | BoostType::Logit => true, + // } + //} + + // fill missing value in dataset + pub fn fill_missing(self) -> bool { match self { BoostType::Ada | BoostType::ConstAda | BoostType::LogitAdd - | BoostType::LogitNoMissing => false, - BoostType::FreeModelMissing | BoostType::Logit => true, + | BoostType::LogitNoMissing => true, + BoostType::FreeModelMissing | BoostType::Logit => false, } } @@ -1139,6 +1151,8 @@ impl FromStr for EffEps { impl EffEps { pub fn is_on_update(self) -> bool { + // true: only apply adjusting in PGS not selecting on loss + // false: apply adjusting in PGS and loss match self { Self::LimS12GmodelPropOnUpdate(..) | Self::LimS2GmodelOverPropOnUpdate(..) => true, Self::LimScore(..) 
diff --git a/projects_rust/boosting/src/boosting_score.rs b/projects_rust/boosting/src/boosting_score.rs index e2a21d5..0f6e00f 100644 --- a/projects_rust/boosting/src/boosting_score.rs +++ b/projects_rust/boosting/src/boosting_score.rs @@ -8,9 +8,10 @@ pub mod score; pub use crate::boosting_param::{BoostMethod, BoostParam, BoostType, EffEps, Eps, IterationNumber}; use crate::wgt_boost; use crate::wgt_boosts::WgtBoosts; -use genetics::sample; +//use genetics::sample; +use genetics::Dataset; use genetics::GenotFormat; -use genetics::{io_genot, Dataset}; +//use genetics::{io_genot, Dataset}; pub use run_scores::*; use std::path::Path; @@ -20,14 +21,14 @@ pub fn run_boosting_score_para_best( gfmt: GenotFormat, phe_buf: Option<&[u8]>, //fin_phe: Option<&Path>, - phe_name: Option<&str>, + //phe_name: Option<&str>, cov_name: Option<&str>, file_wgt: &Path, extract_sample_buf: Option<&[u8]>, //fin_sample: Option<&Path>, //boost_param: BoostParam, + use_snv_pos: bool, ) { - // check fwgt exist. wgt_boost::io::check_file_wgt_exist(&file_wgt); @@ -43,36 +44,34 @@ pub fn run_boosting_score_para_best( //let has_cov = true; //let n_in: usize = io_genot::compute_num_sample(fin, gfmt).unwrap(); - let (_, use_samples) = sample::make_use_samples_buf(extract_sample_buf, fin, gfmt); - let samples_id = io_genot::load_samples_id(fin, gfmt, Some(&use_samples)); + //let (_, use_samples) = sample::make_use_samples_buf(extract_sample_buf, fin, gfmt); + //let samples_id = io_genot::load_samples_id(fin, gfmt, Some(&use_samples)); let mut wgts = WgtBoosts::new_from_file(&file_wgt); //let mut wgts = WgtBoosts::new_from_file(&file_wgt, boost_param.boost_type()); //let mut wgts = WgtBoosts::new_from_file_dir(&dout_wgt_para, boost_param.boost_type()); - let use_missing = wgts.use_missing(); + //let use_missing = wgts.use_missing(); + let fill_missing = wgts.fill_missing(); let dataset = Dataset::new_score( fin, gfmt, phe_buf, - phe_name, + //phe_name, cov_name, extract_sample_buf.as_deref(), 
wgts.wgts_mut(), - use_missing, + fill_missing, + use_snv_pos, ); boosting_score_para_best( - dout_score, - &wgts, - &dataset, + dout_score, &wgts, &dataset, //&genotypes, //&ys_bool, - &samples_id, + //&samples_id, //n, has_cov, - ); - - + ); } pub fn run_boosting_score_para( @@ -81,7 +80,7 @@ pub fn run_boosting_score_para( gfmt: GenotFormat, phe_buf: Option<&[u8]>, //fin_phe: Option<&Path>, - phe_name: Option<&str>, + //phe_name: Option<&str>, // Option<> for nocov only cov_name: Option<&str>, iterations_in: &[usize], @@ -90,6 +89,7 @@ pub fn run_boosting_score_para( //fin_sample: Option<&Path>, //boost_param: BoostParam, use_iter: bool, + use_snv_pos: bool, ) { // check fwgt exist. wgt_boost::io::check_file_wgt_exist(&file_wgt); @@ -107,25 +107,24 @@ pub fn run_boosting_score_para( //let has_cov = true; //let n_in: usize = io_genot::compute_num_sample(fin, gfmt).unwrap(); - let (_, use_samples) = sample::make_use_samples_buf(extract_sample_buf, fin, gfmt); + //let (_, use_samples) = sample::make_use_samples_buf(extract_sample_buf, fin, gfmt); //let (_, use_samples) = sample::make_use_samples(fin_sample, fin, gfmt); - let samples_id = io_genot::load_samples_id(fin, gfmt, Some(&use_samples)); + //let samples_id = io_genot::load_samples_id(fin, gfmt, Some(&use_samples)); let mut wgts = WgtBoosts::new_from_file(&file_wgt); //let mut wgts = WgtBoosts::new_from_file(&file_wgt, boost_param.boost_type()); //let mut wgts = WgtBoosts::new_from_file_dir(&dout_wgt_para, boost_param.boost_type()); - let use_missing = wgts.use_missing(); + //let use_missing = wgts.use_missing(); + let fill_missing = wgts.fill_missing(); let dataset = Dataset::new_score( fin, gfmt, phe_buf.as_deref(), - //fin_phe, - phe_name, cov_name, extract_sample_buf.as_deref(), - //fin_sample, wgts.wgts_mut(), - use_missing, + fill_missing, + use_snv_pos, ); boosting_score( @@ -135,7 +134,7 @@ pub fn run_boosting_score_para( &dataset, //&genotypes, //&ys_bool, - &samples_id, + //&samples_id, //n, 
has_cov, use_iter, diff --git a/projects_rust/boosting/src/boosting_score/io.rs b/projects_rust/boosting/src/boosting_score/io.rs index cc10c42..72e031e 100644 --- a/projects_rust/boosting/src/boosting_score/io.rs +++ b/projects_rust/boosting/src/boosting_score/io.rs @@ -1,4 +1,5 @@ use genetics::samples::prelude::*; +use genetics::score as gscore; use std::fs; use std::fs::File; use std::io::{BufWriter, Write}; @@ -11,7 +12,7 @@ pub fn get_dir_score_cv(dout: &Path, cvi: usize) -> PathBuf { d } -pub fn create_dir(dout: &Path) { +fn create_dir(dout: &Path) { fs::create_dir_all(&dout).unwrap(); } //pub fn create_dir(dout: &Path) { @@ -20,9 +21,9 @@ pub fn create_dir(dout: &Path) { // TODO: make nocov, is_nsnvs into enum // call with iter and score -pub fn fname_score_createdir( +pub fn fname_score_concat_createdir( dout: &Path, - iter: usize, + //iter: usize, nocov: bool, is_nsnv: bool, integrate: bool, @@ -30,121 +31,157 @@ pub fn fname_score_createdir( let f = if integrate { fname_score_integrate(dout, nocov) } else { - fname_score(dout, iter, nocov, is_nsnv) + fname_score_concat(dout, nocov, is_nsnv) + //fname_score(dout, iter, nocov, is_nsnv) }; let d = f.parent().unwrap().to_path_buf(); create_dir(&d); f } -pub fn fname_score(dout: &Path, iter: usize, nocov: bool, is_nsnv: bool) -> PathBuf { - if is_nsnv { - fname_score_nsnv(dout, iter, nocov) +// // TODO: make nocov, is_nsnvs into enum +// // call with iter and score +// pub fn fname_score_createdir( +// dout: &Path, +// iter: usize, +// nocov: bool, +// is_nsnv: bool, +// integrate: bool, +// ) -> PathBuf { +// let f = if integrate { +// fname_score_integrate(dout, nocov) +// } else { +// fname_score(dout, iter, nocov, is_nsnv) +// }; +// let d = f.parent().unwrap().to_path_buf(); +// create_dir(&d); +// f +// } + +fn fname_score_concat(dout: &Path, nocov: bool, is_nsnv: bool) -> PathBuf { + let para = if is_nsnv { + "n".to_string() } else { - fname_score_iter(dout, iter, nocov) - } -} + "iter".to_string() + 
}; -pub fn fname_score_integrate(dout: &Path, nocov: bool) -> PathBuf { let mut d = dout.to_owned(); let fname = if nocov { - "score.tsv" + "boosting_".to_string() + &para + ".score" } else { - "score.withcov.tsv" + "boosting_".to_string() + &para + ".scorecov" }; - //d.push(dscore); d.push(fname); d } -pub fn fname_score_iter(dout: &Path, iter: usize, nocov: bool) -> PathBuf { - let mut d = dout.to_owned(); - let dscore = if nocov { - "score.iter" - //dscore = "score.nocov.iter"; - //fout.to_owned() + ".nocov.iter" + &iter.to_string() + ".score" - //dout.to_owned() + ".score.nocov.iter" + &iter.to_string() - } else { - "score.withcov.iter" - //dscore = "score.iter"; - //dout.to_owned() + ".score.iter" + &iter.to_string() - }; - - d.push(dscore); - let fname = "iter".to_string() + &iter.to_string() + ".score"; - d.push(fname); - d -} +// fn fname_score(dout: &Path, iter: usize, nocov: bool, is_nsnv: bool) -> PathBuf { +// if is_nsnv { +// fname_score_nsnv(dout, iter, nocov) +// } else { +// fname_score_iter(dout, iter, nocov) +// } +// } -pub fn fname_score_nsnv(dout: &Path, nsnv: usize, nocov: bool) -> PathBuf { +fn fname_score_integrate(dout: &Path, nocov: bool) -> PathBuf { let mut d = dout.to_owned(); - let dscore = if nocov { - "score.nsnv" - //"score.nocov.nsnv" + let fname = if nocov { + "boosting.score" + //"score.tsv" } else { - "score.withcov.nsnv" - //"score.nsnv" + "boosting.scorecov" + //"score.withcov.tsv" }; - - //let dscore; - //if nocov { - // dscore = "score.nocov.nsnv"; - //} else { - // dscore = "score.nsnv"; - //} - - d.push(dscore); - let fname = "nsnv".to_string() + &nsnv.to_string() + ".score"; d.push(fname); d - /* if nocov { - dout.to_owned() + ".score.nocov.nsnv" + &nsnv.to_string() - //fout.to_owned() + ".nocov.n" + &nsnv.to_string() + ".score" - } else { - dout.to_owned() + ".score.nsnv" + &nsnv.to_string() - //fout.to_owned() + ".n" + &nsnv.to_string() + ".score" - } */ } -// moved to genetics::score -//pub fn write_scores(fout: &Path, scores: 
&[f64], phe: &Phe, samples_id: &[(String, String)]) { -//pub fn write_scores(fout: &Path, scores: &[f64], phe: &Phe, samples_id: &[String]) { -pub fn write_scores(fout: &Path, scores: &[f64], _: &Phe, samples_id: &[String]) { - let file = match File::create(&fout) { - Ok(file) => file, - Err(_) => panic!( - "Cannot create file, possibly directory does not exist: {:?}", - &fout - ), - }; - - let mut writer = BufWriter::new(file); - // assume word count of one line is 30 - // no problem when it exceeds - let capacity = 30 * scores.len(); - let mut score_string = String::with_capacity(capacity); - score_string.push_str("iid\tscore\n"); - //score_string.push_str("fid\tiid\tphe\trs\n"); - - for ni in 0..scores.len() { - score_string.push_str(&samples_id[ni]); - score_string.push_str("\t"); - //score_string.push_str(&samples_id[ni]); - //score_string.push_str("\t"); - //score_string.push_str(&(phe.get_unchecked(ni) as u8).to_string()); - //score_string.push_str("\t"); - score_string.push_str(&format!("{:.5}\n", scores[ni])); +// fn fname_score_iter(dout: &Path, iter: usize, nocov: bool) -> PathBuf { +// let mut d = dout.to_owned(); +// let dscore = if nocov { +// "score.iter" +// } else { +// "score.withcov.iter" +// }; +// d.push(dscore); +// let fname = "iter-".to_string() + &iter.to_string() + ".score"; +// d.push(fname); +// d +// } + +// fn fname_score_nsnv(dout: &Path, nsnv: usize, nocov: bool) -> PathBuf { +// let mut d = dout.to_owned(); +// let dscore = if nocov { +// "score.nsnv" +// } else { +// "score.withcov.nsnv" +// }; +// d.push(dscore); +// let fname = "nsnv-".to_string() + &nsnv.to_string() + ".score"; +// d.push(fname); +// d +// } + +pub fn write_scores_concat( + fout: &Path, + score_paras: &[Vec<f64>], + paras: &[String], + //scores: &[f64], + samples_id: &[String], + is_nsnv: bool, + integrate: bool, +) { + if integrate { + assert_eq!(score_paras.len(), 1); + gscore::write_scores_nopheno(fout, &score_paras[0], samples_id) + } else { + if is_nsnv { + 
gscore::write_scores_paras_nopheno(fout, score_paras, "n", paras, samples_id) + } else { + gscore::write_scores_paras_nopheno(fout, score_paras, "iter", paras, samples_id) + } } - - writer.write(score_string.as_bytes()).unwrap(); - - //for ni in - // use .concat(), .join()? - // https://users.rust-lang.org/t/fast-string-concatenation/4425/5 - // -> .push_str seems fastest - // -> could be because use with_capacity beforehand } +// // moved to genetics::score +// //pub fn write_scores(fout: &Path, scores: &[f64], phe: &Phe, samples_id: &[(String, String)]) { +// //pub fn write_scores(fout: &Path, scores: &[f64], phe: &Phe, samples_id: &[String]) { +// pub fn write_scores(fout: &Path, scores: &[f64], samples_id: &[String]) { +// let file = match File::create(&fout) { +// Ok(file) => file, +// Err(_) => panic!( +// "Cannot create file, possibly directory does not exist: {:?}", +// &fout +// ), +// }; + +// let mut writer = BufWriter::new(file); +// // assume word count of one line is 30 +// // no problem when it exceeds +// let capacity = 30 * scores.len(); +// let mut score_string = String::with_capacity(capacity); +// score_string.push_str("iid\tscore\n"); +// //score_string.push_str("fid\tiid\tphe\trs\n"); + +// for ni in 0..scores.len() { +// score_string.push_str(&samples_id[ni]); +// score_string.push_str("\t"); +// //score_string.push_str(&samples_id[ni]); +// //score_string.push_str("\t"); +// //score_string.push_str(&(phe.get_unchecked(ni) as u8).to_string()); +// //score_string.push_str("\t"); +// score_string.push_str(&format!("{:.5}\n", scores[ni])); +// } + +// writer.write(score_string.as_bytes()).unwrap(); + +// //for ni in +// // use .concat(), .join()? 
+// // https://users.rust-lang.org/t/fast-string-concatenation/4425/5 +// // -> .push_str seems fastest +// // -> could be because use with_capacity beforehand +// } + /* // moved to genetics::score //pub fn write_scores(fout: &Path, scores: &[f64], phe: &Phe, samples_id: &[(String, String)]) { pub fn write_scores(fout: &Path, scores: &[f64], phe: &Phe, samples_id: &[String]) { diff --git a/projects_rust/boosting/src/boosting_score/run_scores.rs b/projects_rust/boosting/src/boosting_score/run_scores.rs index 7ede0f3..8ad9e0f 100644 --- a/projects_rust/boosting/src/boosting_score/run_scores.rs +++ b/projects_rust/boosting/src/boosting_score/run_scores.rs @@ -58,7 +58,7 @@ pub fn boosting_score( dataset: &Dataset, // why not samples_id in dataset.samples? //samples_id: &[(String, String)], - samples_id: &[String], + //samples_id: &[String], has_cov: bool, use_iter: bool, ) { @@ -70,7 +70,9 @@ pub fn boosting_score( if use_iter & has_cov { log::debug!("Start iters with covariates."); score::calculate_write_score_iterations( - dout, iterations, wgts, dataset, samples_id, false, + dout, iterations, wgts, dataset, + false, + //dout, iterations, wgts, dataset, samples_id, false, ); } }); @@ -79,7 +81,9 @@ pub fn boosting_score( if has_cov { log::debug!("Start nsnvs with covariates."); score::calculate_write_score_nsnvs( - dout, iterations, wgts, dataset, samples_id, false, + dout, iterations, wgts, dataset, + false, + //dout, iterations, wgts, dataset, samples_id, false, ); } }); @@ -87,15 +91,14 @@ pub fn boosting_score( let thread2 = s.spawn(|_| { if use_iter { log::debug!("Start iters without covariates."); - score::calculate_write_score_iterations( - dout, iterations, wgts, dataset, samples_id, true, - ); + score::calculate_write_score_iterations(dout, iterations, wgts, dataset, true); } }); let thread3 = s.spawn(|_| { log::debug!("Start nsnvs without covariates."); - score::calculate_write_score_nsnvs(dout, iterations, wgts, dataset, samples_id, true); + 
score::calculate_write_score_nsnvs(dout, iterations, wgts, dataset, true); + //score::calculate_write_score_nsnvs(dout, iterations, wgts, dataset, samples_id, true); }); thread0.join().unwrap(); @@ -112,7 +115,7 @@ pub fn boosting_score_para_best( dataset: &Dataset, // why not samples_id in dataset.samples? //samples_id: &[(String, String)], - samples_id: &[String], + //samples_id: &[String], has_cov: bool, ) { log::debug!("score for iterations"); @@ -122,13 +125,13 @@ pub fn boosting_score_para_best( let thread1 = s.spawn(|_| { if has_cov { log::debug!("Start nsnvs with covariates."); - score::calculate_write_score_para_best(dout, wgts, dataset, samples_id, false); + score::calculate_write_score_para_best(dout, wgts, dataset, false); } }); let thread3 = s.spawn(|_| { log::debug!("Start nsnvs without covariates."); - score::calculate_write_score_para_best(dout, wgts, dataset, samples_id, true); + score::calculate_write_score_para_best(dout, wgts, dataset, true); }); thread1.join().unwrap(); diff --git a/projects_rust/boosting/src/boosting_score/score.rs b/projects_rust/boosting/src/boosting_score/score.rs index 8992d94..24a1a0b 100644 --- a/projects_rust/boosting/src/boosting_score/score.rs +++ b/projects_rust/boosting/src/boosting_score/score.rs @@ -238,12 +238,12 @@ fn calculate_write_score( item_last_indexs_write: &[usize], // this should indicate the last index, not number of items wgts: &WgtBoosts, dataset: &Dataset, - samples_id: &[String], + //samples_id: &[String], //samples_id: &[(String, String)], nocov: bool, is_nsnv: bool, // for fout name item_ns_fname: Option<&[usize]>, // when fname = items_write - integrate:bool + integrate: bool, ) { let item_ns_fname = match item_ns_fname { Some(v) => v, @@ -252,8 +252,13 @@ fn calculate_write_score( let n = dataset.genot().n(); let genot = dataset.genot(); - let phe = dataset.samples().phe(); - let covs = dataset.samples().covs().unwrap(); + //let phe = dataset.samples().phe_unwrap(); + let covs = 
dataset.samples().covs(); + //let covs = dataset.samples().covs().unwrap(); + let samples_id = dataset.samples().names(); + + let mut score_paras: Vec<Vec<f64>> = vec![]; + let mut paras: Vec<String> = vec![]; let mut scores = vec![0.0f64; n]; @@ -267,19 +272,18 @@ fn calculate_write_score( //log::debug!("wgt {:?}", wgt); if !(nocov && wgt.wgt().is_cov()) { - add_score(&mut scores, wgt, genot, Some(covs)); + add_score(&mut scores, wgt, genot, covs); } //log::debug!("iter_next,use{},{}", iter_next, iter_use); if item_i == *item_next { - //log::debug!("Write iter {}", iter_next); //write - let fout_iteration = - super::io::fname_score_createdir(dout, *item_fname, nocov, is_nsnv,integrate); - log::debug!("Write iteration {}: {:?}", item_fname, fout_iteration); - - super::io::write_scores(&fout_iteration, &scores, phe, samples_id); + //let fout_iteration = + //super::io::fname_score_createdir(dout, *item_fname, nocov, is_nsnv, integrate); + log::debug!("Save iteration: {}", item_fname); + paras.push(item_fname.to_string()); - //log::debug!("Done write iteration {}: {:?}", item_fname, fout_iteration); + score_paras.push(scores.clone()); + //super::io::write_scores(&fout_iteration, &scores, samples_id); // raise error... 
//(iter_next, iter_fname) = match iters_write_pair.next() { @@ -293,8 +297,87 @@ fn calculate_write_score( item_fname = v.1; } } + + // write + let fout_concat = super::io::fname_score_concat_createdir(dout, nocov, is_nsnv, integrate); + log::debug!("Write score: {:?}", fout_concat); + super::io::write_scores_concat( + &fout_concat, + &score_paras, + &paras, + samples_id, + is_nsnv, + integrate, + ); } +// /// a item refer to a wgt, which is different from iter +// fn calculate_write_score( +// dout: &Path, +// item_last_indexs_write: &[usize], // this should indicate the last index, not number of items +// wgts: &WgtBoosts, +// dataset: &Dataset, +// //samples_id: &[String], +// //samples_id: &[(String, String)], +// nocov: bool, +// is_nsnv: bool, // for fout name +// item_ns_fname: Option<&[usize]>, // when fname = items_write +// integrate: bool, +// ) { +// let item_ns_fname = match item_ns_fname { +// Some(v) => v, +// None => item_last_indexs_write, +// }; + +// let n = dataset.genot().n(); +// let genot = dataset.genot(); +// //let phe = dataset.samples().phe_unwrap(); +// let covs = dataset.samples().covs(); +// //let covs = dataset.samples().covs().unwrap(); +// let samples_id = dataset.samples().names(); + +// let mut scores = vec![0.0f64; n]; + +// let mut items_write_pair = item_last_indexs_write.iter().zip(item_ns_fname.iter()); +// //let mut iters_write = iters_write.clone().iter(); +// let (mut item_next, mut item_fname) = items_write_pair.next().unwrap(); +// //let (mut iter_i, mut iter_next) = iters_write.enumerate().next().unwrap(); +// //let mut iter_next = *iters_write.next().unwrap(); + +// for (item_i, wgt) in wgts.wgts().iter().enumerate() { +// //log::debug!("wgt {:?}", wgt); + +// if !(nocov && wgt.wgt().is_cov()) { +// add_score(&mut scores, wgt, genot, covs); +// //add_score(&mut scores, wgt, genot, Some(covs)); +// } +// //log::debug!("iter_next,use{},{}", iter_next, iter_use); +// if item_i == *item_next { +// //log::debug!("Write iter 
{}", iter_next); +// //write +// let fout_iteration = +// super::io::fname_score_createdir(dout, *item_fname, nocov, is_nsnv, integrate); +// log::debug!("Write iteration {}: {:?}", item_fname, fout_iteration); + +// super::io::write_scores(&fout_iteration, &scores, samples_id); +// //super::io::write_scores(&fout_iteration, &scores, phe, samples_id); + +// //log::debug!("Done write iteration {}: {:?}", item_fname, fout_iteration); + +// // raise error... +// //(iter_next, iter_fname) = match iters_write_pair.next() { +// let v = match items_write_pair.next() { +// Some(v) => v, +// None => break, +// }; +// // raise error... +// //(iter_next, iter_fname) = v; +// item_next = v.0; +// item_fname = v.1; +// } +// } +// } + /// Iteration and item number are different. /// return corresponding items index to given iterations. /// assume iter_until_item is monotonically increased @@ -336,7 +419,7 @@ pub fn calculate_write_score_iterations( iterations_write: &[usize], wgts: &WgtBoosts, dataset: &Dataset, - samples_id: &[String], + //samples_id: &[String], //samples_id: &[(String, String)], nocov: bool, ) { @@ -369,11 +452,11 @@ pub fn calculate_write_score_iterations( &item_last_indexs_write, wgts, dataset, - samples_id, + //samples_id, nocov, false, Some(iterations_write), - false + false, ); } @@ -383,7 +466,7 @@ pub fn calculate_write_score_nsnvs( wgts: &WgtBoosts, dataset: &Dataset, //samples_id: &[(String, String)], - samples_id: &[String], + //samples_id: &[String], nocov: bool, ) { // first, count when (at which iteration) to write score @@ -424,7 +507,7 @@ pub fn calculate_write_score_nsnvs( &item_last_indexs_write, wgts, dataset, - samples_id, + //samples_id, nocov, true, Some(nsnvs_write), @@ -436,7 +519,7 @@ pub fn calculate_write_score_para_best( dout: &Path, wgts: &WgtBoosts, dataset: &Dataset, - samples_id: &[String], + //samples_id: &[String], nocov: bool, ) { let iterations_write = [wgts.wgts().len()]; @@ -470,7 +553,7 @@ pub fn 
calculate_write_score_para_best( &item_last_indexs_write, wgts, dataset, - samples_id, + //samples_id, nocov, false, Some(&iterations_write), diff --git a/projects_rust/boosting/src/boosting_train.rs b/projects_rust/boosting/src/boosting_train.rs index ad04211..68bde57 100644 --- a/projects_rust/boosting/src/boosting_train.rs +++ b/projects_rust/boosting/src/boosting_train.rs @@ -39,16 +39,23 @@ use genetics::{samples::CovsTrait, Covs, Dataset, Genot, Snvs}; use loss::LossStruct; pub use table::ContingencyTable; -fn boosting_iter_cov(score_ti: &mut [f64], wgtcov: &WgtBoost, covs: &Covs) { +fn boosting_iter_cov( + score_ti: &mut [f64], + wgtcov: &WgtBoost, + covs: Option<&Covs>, + sample_n: usize, +) { + // covs: Option<> for no cov input let cov_name = wgtcov.wgt().kind().cov().name(); let cov_vals_v; let cov_vals: &[f64]; // TODO: cleaner; put these in Covs or here? if cov_name == "const" { - cov_vals_v = vec![1.0; covs.vals().unwrap()[0].len()]; + //cov_vals_v = vec![1.0; covs.vals().unwrap()[0].len()]; + cov_vals_v = vec![1.0; sample_n]; cov_vals = &cov_vals_v; } else { - cov_vals = covs.vals_id(cov_name); + cov_vals = covs.unwrap().vals_id(cov_name); } //match wgt.wgt().model().coef() { @@ -57,7 +64,7 @@ fn boosting_iter_cov(score_ti: &mut [f64], wgtcov: &WgtBoost, covs: &Covs) { } fn boosting_iter_snv( - ti: usize, + iteration: usize, boost_param: BoostParam, loss: &mut LossStruct, genot: &Genot, @@ -76,7 +83,7 @@ fn boosting_iter_snv( let mut wgt = loss::search_min_loss_gt( loss, - ti, + iteration, genot, &sample_weight, phe, @@ -112,20 +119,28 @@ fn boosting_iter_snv( log::debug!("coef {:?}", coef_ti); */ + let mi: usize = match wgt.wgt().kind() { + WgtKind::Snv(_, _, mi) => mi.unwrap(), + _ => panic!(), + }; + // not set cont table //wgt.set_contingency_table(table_sum, is_eps); - let (coef_ti, is_eps) = coefficient::calculate_coef_root_ada( + let (coef_ti, is_eps, is_eff_eps) = coefficient::calculate_coef_ada_update( &pred, + &genot.to_genot_snv(mi), 
sample_weight, //ps_pad, phe, boost_param.learning_rate(), boost_param.eps(), + boost_param.eff_eps(), boost_param.boost_type(), ); log::debug!("coef {:?}", coef_ti); wgt.set_coef(coef_ti); wgt.set_is_eps(is_eps); + wgt.set_is_eff_eps(is_eff_eps); log::debug!("wgt {:?}", wgt); } else if boost_param.boost_type().is_type_logit() { let mi: usize = match wgt.wgt().kind() { @@ -135,7 +150,7 @@ fn boosting_iter_snv( //let coef_ti: Coef; // TODO: cleaner - let (coef_ti, is_eps, is_eff_eps) = coefficient::calculate_coef_root_logit( + let (coef_ti, is_eps, is_eff_eps) = coefficient::calculate_coef_logit_update( &genot.to_genot_snv(mi), sample_weight, //&pred, @@ -265,28 +280,32 @@ pub fn boosting_covs( log::debug!("wgtcovs_logreg: {:?}", &wgtcovs_logreg); - let covs = dataset.samples().covs().unwrap(); + //let covs = dataset.samples().covs().unwrap(); + let covs = dataset.samples().covs(); + + let sample_n = dataset.samples().samples_n(); let p = wgtcovs_logreg.len(); // TODO: why splitting...? for pi in 0..p { let wgtcov = wgtcovs_logreg[pi].clone(); - boosting_iter_cov(scores, &wgtcov, covs); + boosting_iter_cov(scores, &wgtcov, covs, sample_n); wgts.add_wgt(wgtcov); } if let Some(dataset_val) = dataset_val { - let covs = dataset_val.samples().covs().unwrap(); + let covs = dataset_val.samples().covs(); + // let covs = dataset_val.samples().covs().unwrap(); let p = wgtcovs_logreg.len(); for pi in 0..p { let wgtcov = wgtcovs_logreg[pi].clone(); - boosting_iter_cov(scores_val, &wgtcov, covs); + boosting_iter_cov(scores_val, &wgtcov, covs, sample_n); } } log::debug!("after cov"); - let phe = dataset.samples().phe(); + let phe = dataset.samples().phe_unwrap(); sample_weight.renew_sample_weight(scores, phe); p @@ -319,7 +338,7 @@ pub fn boosting_logit_const( sample_weight::renew_score(scores_val, &[0u8], &wgt_boost); } - let phe = dataset.samples().phe(); + let phe = dataset.samples().phe_unwrap(); sample_weight.renew_sample_weight(&scores, phe); @@ -353,6 +372,7 @@ fn 
run_next_iteration_monitor( is_monitor: bool, ) -> bool { if is_monitor { + // FIXME: should use IterationNumber, too if ti > ITERATION_NUMBER_SNV_LIMIT { log::info!( "Iteration number exceeded the limit, so stop the iteration: {}", @@ -490,7 +510,7 @@ pub fn boosting( ) -> Option<(usize, f64)> { let start_time = Instant::now(); - let phe = dataset.samples().phe(); + let phe = dataset.samples().phe_unwrap(); let genot = dataset.genot(); let snvs = dataset.snvs(); @@ -711,7 +731,7 @@ pub fn boosting_batch( let start_time = Instant::now(); - let phe = dataset.samples().phe(); + let phe = dataset.samples().phe_unwrap(); let genot = dataset.genot(); let snvs = dataset.snvs(); @@ -745,10 +765,19 @@ pub fn boosting_batch( Some(x) => x, None => { // TODO: make interval even + // TODO: make these argument + // 5usize output intercept=nan in likelihood-> no effect + //let nsnvs_monitor = [ + // 20usize, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 200, 300, + // 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, + // 10000, + //]; + // make this default and use this if not in argument + // if exceed 20k, run every 1k let nsnvs_monitor = [ 5usize, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, - 10000, + 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000, ]; nsnvs_monitor.to_vec() } @@ -1025,16 +1054,25 @@ fn update_acc_monitor( ) { //let n = dataset_val.samples().samples_n(); let acc = pgs::nagelkerke_r2( - &dataset_val.samples().phe().inner_i32(), - //&dataset_val.samples().phe().inner_f64(), + &dataset_val.samples().phe_unwrap().inner_i32(), + //&dataset_val.samples().phe_unwrap().inner_f64(), scores_val, //&scores_val[..n], scores_val_cov, //&scores_val_cov[..n], ); + if acc.is_nan() { + panic!("Accuracy is nan."); + } + + log::debug!("nsnvs_monitor {:?}", nsnvs_monitor); + 
log::debug!("nsnv {:?}", nsnv); let nsnv_index = nsnvs_monitor.iter().position(|x| *x == nsnv).unwrap(); - acc_monitor[nsnv_index] = acc + log::debug!("nsnv_index {:?}", nsnv_index); + log::debug!("acc {:?}", acc); + acc_monitor[nsnv_index] = acc; + log::debug!("acc_monitor {:?}", acc_monitor); } fn compute_acc_max(acc_monitor: &mut [f64], nsnvs_monitor: &[usize]) -> (usize, f64) { @@ -1074,9 +1112,14 @@ fn monitor_acc(acc_monitor: &mut [f64], nsnvs_monitor: &[usize], nsnv: usize) -> return true; } - if nsnv <= 100 { + if nsnv <= 500 { return false; } + + //if nsnv <= 100 { + // return false; + //} + //if nsnv <= 1000 { // return false; //} diff --git a/projects_rust/boosting/src/boosting_train/coefficient.rs b/projects_rust/boosting/src/boosting_train/coefficient.rs index 0675abb..c084a90 100644 --- a/projects_rust/boosting/src/boosting_train/coefficient.rs +++ b/projects_rust/boosting/src/boosting_train/coefficient.rs @@ -11,37 +11,48 @@ use genetics::genot::GenotSnvRef; use genetics::samples::prelude::*; use genetics::wgt::Coef; -// TODO: rename -pub fn calculate_coef_root_ada( +// update: on updating PGS not for loss +// use learning rate +pub fn calculate_coef_ada_update( pred_s: &[u8], + gsnv: &GenotSnvRef, sample_weight: &SampleWeight, - //ps: &[f64], phe: &Phe, learning_rate: f64, eps: Option, + eff_eps: Option, boost_type: BoostType, -) -> (Coef, bool) { +) -> (Coef, bool, bool) { let (table_sum, is_eps) = table::calculate_table_eps(&pred_s, sample_weight.ps().unwrap(), phe, eps, boost_type); - let coef_ti = calculate_coefficients_ada(table_sum, boost_type, learning_rate, eps); - (coef_ti, is_eps) + let (coef_ti, is_eff_eps) = + calculate_coef_ada_eps(table_sum, gsnv, phe, eps, eff_eps, boost_type, false, true); + let coef_ti = coef_lr(coef_ti, learning_rate, boost_type); + (coef_ti, is_eps, is_eff_eps) } +// NO LEARNING RATE // assume table after eps -pub fn calculate_coefficients_ada( +pub fn calculate_coef_ada_eps( table: ContingencyTable, - boost_type: 
BoostType, - lr: f64, + gsnv: &GenotSnvRef, + phe: &Phe, + //lr: f64, eps: Option, -) -> Coef { + eff_eps: Option, + boost_type: BoostType, + on_loss: bool, + verbose: bool, +) -> (Coef, bool) { match boost_type { BoostType::Ada => { let table2_sum = table.two(); let (d, n) = table2_sum; let alpha = (d / n).ln() / 2.0; - let alpha = lr * alpha; - Coef::Binary((0.0, alpha)) + //let alpha = lr * alpha; + unimplemented!("eff_eps"); + //Coef::Binary((0.0, alpha), is_eff_eps) } BoostType::ConstAda => { let table4_sum = table.four(); @@ -49,9 +60,10 @@ pub fn calculate_coefficients_ada( let const_ti = ((d1 * d0) / (n1 * n0)).ln() / 4.0; let alpha_ti = ((d1 * n0) / (n1 * d0)).ln() / 4.0; - let const_ti = lr * const_ti; - let alpha_ti = lr * alpha_ti; - Coef::Binary((const_ti, alpha_ti)) + //let const_ti = lr * const_ti; + //let alpha_ti = lr * alpha_ti; + unimplemented!("eff_eps"); + //Coef::Binary((const_ti, alpha_ti)) } BoostType::FreeModelMissing => { let table7_sum = table.seven(); @@ -59,6 +71,7 @@ pub fn calculate_coefficients_ada( // TODO: clean do not want to use eps here if eps.is_some() && eps.unwrap().dom() { + unimplemented!("Not implemented effeps"); // TODO: create table:judge_eps(table) let s0; let s1; @@ -81,33 +94,162 @@ pub fn calculate_coefficients_ada( s2 = (d2 / n2).ln() / 2.0; } - let s0 = lr * s0; - let s1 = lr * s1; - let s2 = lr * s2; - Coef::Score4((s0, s1, s2, 0.0)) + // TMP + let is_eff_eps = false; + + //let s0 = lr * s0; + //let s1 = lr * s1; + //let s2 = lr * s2; + //(Coef::Score4((s0, s1, s2, 0.0)), is_eff_eps) + (Coef::new_score4((s0, s1, s2, 0.0)), is_eff_eps) } else { let s0 = (d0 / n0).ln() / 2.0; let s1 = (d1 / n1).ln() / 2.0; let s2 = (d2 / n2).ln() / 2.0; - let s0 = lr * s0; - let s1 = lr * s1; - let s2 = lr * s2; - Coef::Score4((s0, s1, s2, 0.0)) + if on_loss && eff_eps.is_some() && (eff_eps.unwrap().is_on_update()) { + // legacy + unimplemented!("Not using now"); + } else { + let table8_count = gsnv.stat_contingency_table(phe); + + 
let ((s0, s1, s2), is_eff_eps) = adjust_coef::adjust_eff_logit_no_missing( + (s0, s1, s2), + table8_count, + eff_eps, + verbose, + ); + //(Coef::Score4((s0, s1, s2, 0.0)), is_eff_eps) + (Coef::new_score4((s0, s1, s2, 0.0)), is_eff_eps) + } + + //if !on_loss { + // let ((s0, s1, s2), is_eff_eps) = adjust_coef::adjust_eff_logit_no_missing( + // (s0, s1, s2), + // table8_count, + // eff_eps, + // verbose, + // ); + // (Coef::Score4((s0, s1, s2, 0.0)), is_eff_eps) + //} else { + // // calculate coef for loss + // // some eps_eff does not apply on coef for loss + // if eff_eps.is_some() && (!eff_eps.unwrap().is_on_update()) { + // if verbose { + // log::debug!("Adjust eff_eps since for loss."); + // } + // let ((s0, s1, s2), is_eff_eps) = adjust_coef::adjust_eff_logit_no_missing( + // (s0, s1, s2), + // table8_count, + // eff_eps, + // verbose, + // ) + // (Coef::Score4((s0, s1, s2, 0.0)), is_eff_eps) + // } else { + // // legacy + // unimplemented!("Not using now"); + // if verbose { + // log::debug!("Do not adjust eff_eps since for loss."); + // } + + // //let s0 = lr * s0; + // //let s1 = lr * s1; + // //let s2 = lr * s2; + // Coef::Score4((s0, s1, s2, 0.0)) + // } + //} } } _ => panic!(), } } +// OLD: using learning rate +// // assume table after eps +// pub fn calculate_coefficients_ada( +// table: ContingencyTable, +// boost_type: BoostType, +// lr: f64, +// eps: Option, +// eff_eps: Option, +// //on_loss: bool, +// //verbose: bool, +// ) -> Coef { +// match boost_type { +// BoostType::Ada => { +// let table2_sum = table.two(); +// let (d, n) = table2_sum; +// let alpha = (d / n).ln() / 2.0; + +// let alpha = lr * alpha; +// Coef::Binary((0.0, alpha)) +// } +// BoostType::ConstAda => { +// let table4_sum = table.four(); +// let (d1, n1, d0, n0) = table4_sum; +// let const_ti = ((d1 * d0) / (n1 * n0)).ln() / 4.0; +// let alpha_ti = ((d1 * n0) / (n1 * d0)).ln() / 4.0; + +// let const_ti = lr * const_ti; +// let alpha_ti = lr * alpha_ti; +// Coef::Binary((const_ti, 
alpha_ti)) +// } +// BoostType::FreeModelMissing => { +// let table7_sum = table.seven(); +// let (d2, n2, d1, n1, d0, n0, _) = table7_sum; + +// // TODO: clean do not want to use eps here +// if eps.is_some() && eps.unwrap().dom() { +// unimplemented!("Not implemented effeps"); +// // TODO: create table:judge_eps(table) +// let s0; +// let s1; +// let s2; +// if (d2 < CONTINGENCY_TABLE_FILL) || (n2 < CONTINGENCY_TABLE_FILL) { +// let d1_new = d2 + d1; +// let n1_new = n2 + n1; +// s1 = (d1_new / n1_new).ln() / 2.0; +// s2 = s1; +// s0 = (d0 / n0).ln() / 2.0; +// } else if (d0 < CONTINGENCY_TABLE_FILL) || (n0 < CONTINGENCY_TABLE_FILL) { +// let d1_new = d0 + d1; +// let n1_new = n0 + n1; +// s1 = (d1_new / n1_new).ln() / 2.0; +// s0 = s1; +// s2 = (d2 / n2).ln() / 2.0; +// } else { +// s0 = (d0 / n0).ln() / 2.0; +// s1 = (d1 / n1).ln() / 2.0; +// s2 = (d2 / n2).ln() / 2.0; +// } + +// let s0 = lr * s0; +// let s1 = lr * s1; +// let s2 = lr * s2; +// Coef::Score4((s0, s1, s2, 0.0)) +// } else { +// let s0 = (d0 / n0).ln() / 2.0; +// let s1 = (d1 / n1).ln() / 2.0; +// let s2 = (d2 / n2).ln() / 2.0; + +// unimplemented!("eff_eps"); + +// let s0 = lr * s0; +// let s1 = lr * s1; +// let s2 = lr * s2; +// Coef::Score4((s0, s1, s2, 0.0)) +// } +// } +// _ => panic!(), +// } +// } + // DO NOT use predict here: troublesome // now common fn can be used for loss and wgt // TODO: integrate to calcualte_coef_root_ada -pub fn calculate_coef_root_logit( +pub fn calculate_coef_logit_update( gsnv: &GenotSnvRef, sample_weight: &SampleWeight, - //wzs_pad: &[f64], - //wls_pad: &[f64], phe: &Phe, learning_rate: f64, eps: Option, @@ -126,14 +268,12 @@ pub fn calculate_coef_root_logit( gsnv, sample_weight.wzs_pad().unwrap(), sample_weight.wls_pad().unwrap(), - //wzs_pad, - //wls_pad, phe, epsilons_wzs, epsilons_wls, eps, - boost_type, eff_eps, + boost_type, false, true, ); @@ -144,14 +284,12 @@ pub fn calculate_coef_root_logit( BoostType::LogitAdd => { let coef_ti = 
calculate_coef_logit_add( gsnv, - //&genot.to_genot_snv(mi), sample_weight.wzs_pad().unwrap(), sample_weight.wls_pad().unwrap(), - //wzs_pad, - //wls_pad, - //phe, - learning_rate, + //learning_rate, ); + let coef_ti = coef_lr(coef_ti, learning_rate, boost_type); + //let coef_ti = coef_ti.apply_lr(learning_rate); (coef_ti, false, false) } _ => panic!(), @@ -167,16 +305,24 @@ pub unsafe fn calculate_coef_logit_eps( epsilons_wzs: (f64, f64), //(epsilon_case: f64, epsilon_cont: f64,) epsilons_wls: (f64, f64), //(epsilon_case: f64, epsilon_cont: f64,) eps: Option, - //learning_rate: Option, - boost_type: BoostType, eff_eps: Option, + boost_type: BoostType, on_loss: bool, verbose: bool, + //learning_rate: Option, ) -> (Coef, bool, bool) { //use std::time::Instant; //let start_time = Instant::now(); - let (wzs_sum, wls_sum) = calc::calculate_coef_gt_logit_simd_sm(gsnv, wzs_pad, wls_pad); + let (wzs_sum, wls_sum) = calc::calculate_coef_gt_logit_sm(gsnv, wzs_pad, wls_pad); + //let (wzs_sum, wls_sum); + //#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + //{ + // if is_x86_feature_detected!("avx2") { + // let (wzs_sum, wls_sum) = calc::calculate_coef_gt_logit_simd_sm(gsnv, wzs_pad, wls_pad); + // } + //} + //calculate_coef_gt_logit_simd_sm(&genot.to_genot_snv(mi), wzs, wls, phe); //println!("afr wzs_sum: {} sec", start_time.elapsed().as_micros()); @@ -196,7 +342,7 @@ pub unsafe fn calculate_coef_logit_eps( epsilons_wzs, epsilons_wls, eps, - table8_count, + ContingencyTable::EightCount(table8_count), ); //println!("afr adj: {} sec", start_time.elapsed().as_micros()); @@ -316,56 +462,59 @@ pub unsafe fn calculate_coef_logit_add( //epsilons_wzs: (f64, f64), //(epsilon_case: f64, epsilon_cont: f64,) //epsilons_wls: (f64, f64), //(epsilon_case: f64, epsilon_cont: f64,) //eps: Eps, - lr: f64, + //lr: f64, ) -> Coef { - /* //use std::time::Instant; + //use std::time::Instant; //let start_time = Instant::now(); - let (wzs_sum, wls_sum) = 
calc::calculate_coef_gt_logit_simd_sm(gsnv, wzs_pad, wls_pad); + let (wzs_sum, wls_sum) = calc::calculate_coef_gt_logit_sm(gsnv, wzs_pad, wls_pad); + //let (wzs_sum, wls_sum) = calc::calculate_coef_gt_logit_simd_sm(gsnv, wzs_pad, wls_pad); //calculate_coef_gt_logit_simd_sm(&genot.to_genot_snv(mi), wzs, wls, phe); let coef: Coef = calc_coef_logit_add(wzs_sum, wls_sum); //println!("afr coef: {} sec", start_time.elapsed().as_micros()); - let (c, a) = coef.linearconst_f64(); + //let (c, a) = coef.linearconst_f64(); + + //let coef = Coef::LinearConst((c, a)); - let coef = Coef::LinearConst((c, a)); - //let coef = Coef::LinearConst((lr * c, lr * a)); */ + coef - let coef = calculate_coef_logit_add_on_loss(gsnv, wzs_pad, wls_pad); + //let coef = calculate_coef_logit_add(gsnv, wzs_pad, wls_pad); - coef.apply_lr(lr) + //coef.apply_lr(lr) } -// for coef on calculating loss -pub unsafe fn calculate_coef_logit_add_on_loss( - gsnv: &GenotSnvRef, - wzs_pad: &[f64], - wls_pad: &[f64], - //_phe: &Phe, - //epsilons_wzs: (f64, f64), //(epsilon_case: f64, epsilon_cont: f64,) - //epsilons_wls: (f64, f64), //(epsilon_case: f64, epsilon_cont: f64,) - //eps: Eps, - //lr: f64, -) -> Coef { - //use std::time::Instant; - //let start_time = Instant::now(); +// // for coef on calculating loss +// pub unsafe fn calculate_coef_logit_add_on_loss( +// gsnv: &GenotSnvRef, +// wzs_pad: &[f64], +// wls_pad: &[f64], +// //_phe: &Phe, +// //epsilons_wzs: (f64, f64), //(epsilon_case: f64, epsilon_cont: f64,) +// //epsilons_wls: (f64, f64), //(epsilon_case: f64, epsilon_cont: f64,) +// //eps: Eps, +// //lr: f64, +// ) -> Coef { +// //use std::time::Instant; +// //let start_time = Instant::now(); - let (wzs_sum, wls_sum) = calc::calculate_coef_gt_logit_simd_sm(gsnv, wzs_pad, wls_pad); - //calculate_coef_gt_logit_simd_sm(&genot.to_genot_snv(mi), wzs, wls, phe); +// let (wzs_sum, wls_sum) = calc::calculate_coef_gt_logit_sm(gsnv, wzs_pad, wls_pad); +// //let (wzs_sum, wls_sum) = 
calc::calculate_coef_gt_logit_simd_sm(gsnv, wzs_pad, wls_pad); +// //calculate_coef_gt_logit_simd_sm(&genot.to_genot_snv(mi), wzs, wls, phe); - let coef: Coef = calc_coef_logit_add(wzs_sum, wls_sum); +// let coef: Coef = calc_coef_logit_add(wzs_sum, wls_sum); - //println!("afr coef: {} sec", start_time.elapsed().as_micros()); +// //println!("afr coef: {} sec", start_time.elapsed().as_micros()); - let (c, a) = coef.linearconst_f64(); +// let (c, a) = coef.linearconst_f64(); - let coef = Coef::LinearConst((c, a)); - //let coef = Coef::LinearConst((lr * c, lr * a)); +// let coef = Coef::LinearConst((c, a)); +// //let coef = Coef::LinearConst((lr * c, lr * a)); - coef -} +// coef +// } pub fn calculate_coef_logit( wzs_sum: (f64, f64, f64), @@ -387,9 +536,11 @@ pub fn calculate_coef_logit( } pub fn calculate_coef_logit_no_missing( + // TODO: use ContingencyTable wzs_sum: (f64, f64, f64), wls_sum: (f64, f64, f64), eff_eps: Option, + // TODO: use ContingencyTable // for eff_eps table8_count: (usize, usize, usize, usize, usize, usize, usize, usize), on_loss: bool, @@ -401,24 +552,39 @@ pub fn calculate_coef_logit_no_missing( log::debug!("Coef2,1,0 bfr EffEps {}, {}, {}", s2, s1, s0); } - if !on_loss { - adjust_coef::adjust_eff_logit_no_missing((s0, s1, s2), table8_count, eff_eps, verbose) + if on_loss && eff_eps.is_some() && (eff_eps.unwrap().is_on_update()) { + // legacy + unimplemented!("Not using now"); + //if verbose { + // log::debug!("Do not adjust eff_eps since for loss."); + //} } else { - // calculate coef for loss - // some eps_eff does not apply on coef for loss - if eff_eps.is_some() && (!eff_eps.unwrap().is_on_update()) { - if verbose { - log::debug!("Adjust eff_eps since for loss."); - } - adjust_coef::adjust_eff_logit_no_missing((s0, s1, s2), table8_count, eff_eps, verbose) - } else { - // legacy - if verbose { - log::debug!("Do not adjust eff_eps since for loss."); - } - (Coef::Score3((s0, s1, s2)), false) - } + let (scores, is_eff_eps) = + 
adjust_coef::adjust_eff_logit_no_missing((s0, s1, s2), table8_count, eff_eps, verbose); + (Coef::new_score3(scores), is_eff_eps) } + + //if !on_loss { + // let (scores, is_eff_eps)=adjust_coef::adjust_eff_logit_no_missing((s0, s1, s2), table8_count, eff_eps, verbose); + // (Coef::new_score3(scores), is_eff_eps) + //} else { + // // calculate coef for loss + // // some eps_eff does not apply on coef for loss + // if eff_eps.is_some() && (!eff_eps.unwrap().is_on_update()) { + // if verbose { + // log::debug!("Adjust eff_eps since for loss."); + // } + // let (scores, is_eff_eps)=adjust_coef::adjust_eff_logit_no_missing((s0, s1, s2), table8_count, eff_eps, verbose); + // (Coef::new_score3(scores), is_eff_eps) + // } else { + // // legacy + // unimplemented!("Not using now"); + // if verbose { + // log::debug!("Do not adjust eff_eps since for loss."); + // } + // (Coef::Score3((s0, s1, s2)), false) + // } + //} } pub fn calculate_coef_from_weights( @@ -494,15 +660,17 @@ pub fn coef_lr(coef: Coef, lr: f64, boost_type: BoostType) -> Coef { //let lr = learning_rate.unwrap_or(1.0); match boost_type { - BoostType::Logit => { + BoostType::FreeModelMissing | BoostType::Logit => { let (s0, s1, s2, _sm) = coef.score4_f64(); Coef::Score4((lr * s0, lr * s1, lr * s2, 0.0)) } BoostType::LogitNoMissing => { - let (s0, s1, s2) = coef.score3_f64(); - Coef::Score3((lr * s0, lr * s1, lr * s2)) + coef.apply_lr(lr) + //let (s0, s1, s2) = coef.score3_f64(); + //Coef::Score3((lr * s0, lr * s1, lr * s2)) } - _ => panic!("wrong"), + BoostType::LogitAdd => coef.apply_lr(lr), + _ => panic!("wrong: {:?}", boost_type), } } @@ -526,25 +694,43 @@ mod tests { // (v - w).abs() < e //} - #[test] - fn test_calculate_coefficients_freemodelmissing() { - let t = ContingencyTable::new_seven((0.02, 0.01, 0.1, 0.2, 0.3, 0.3, 0.07)); - let coef = calculate_coefficients_ada(t, BoostType::FreeModelMissing, 1.0, Some(Eps::Med)); - assert_eq!( - coef, - Coef::Score4((0.0, 0.5f64.ln() / 2.0, 2.0f64.ln() / 2.0, 
0.0)) - ); - } + // TODO: create test + //#[test] + //fn test_calculate_coefficients_freemodelmissing() { + // let t = ContingencyTable::new_seven((0.02, 0.01, 0.1, 0.2, 0.3, 0.3, 0.07)); + // let (coef, is_eff_eps) = calculate_coef_ada_eps( + // t, + // BoostType::FreeModelMissing, + // //1.0, + // Some(Eps::Med), + // None, + // false, + // false, + // ); + // assert_eq!( + // coef, + // Coef::Score4((0.0, 0.5f64.ln() / 2.0, 2.0f64.ln() / 2.0, 0.0)) + // ); + // assert_eq!(is_eff_eps, false) + //} - #[test] - fn test_calculate_coefficients_freemodelmissing_lr() { - let t = ContingencyTable::new_seven((0.02, 0.01, 0.1, 0.2, 0.3, 0.3, 0.07)); - let coef = calculate_coefficients_ada(t, BoostType::FreeModelMissing, 0.1, Some(Eps::Med)); - assert_eq!( - coef, - Coef::Score4((0.0, 0.5f64.ln() / 20.0, 2.0f64.ln() / 20.0, 0.0)) - ); - } + // #[test] + // fn test_calculate_coefficients_freemodelmissing_lr() { + // let t = ContingencyTable::new_seven((0.02, 0.01, 0.1, 0.2, 0.3, 0.3, 0.07)); + // let coef = calculate_coefficients_ada( + // t, + // BoostType::FreeModelMissing, + // 0.1, + // Some(Eps::Med), + // None, + // false, + // false, + // ); + // assert_eq!( + // coef, + // Coef::Score4((0.0, 0.5f64.ln() / 20.0, 2.0f64.ln() / 20.0, 0.0)) + // ); + // } /// for speed #[allow(dead_code)] fn setup_test4_logit() -> (Genot, Phe, Vec, Vec) { diff --git a/projects_rust/boosting/src/boosting_train/coefficient/adjust_coef.rs b/projects_rust/boosting/src/boosting_train/coefficient/adjust_coef.rs index 4536a12..be7099c 100644 --- a/projects_rust/boosting/src/boosting_train/coefficient/adjust_coef.rs +++ b/projects_rust/boosting/src/boosting_train/coefficient/adjust_coef.rs @@ -1,12 +1,18 @@ use crate::EffEps; -use genetics::wgt::Coef; +//use genetics::wgt::Coef; +/// +/// scores: // (s0, s1, s2) +/// return scores: // (s0, s1, s2) +/// pub fn adjust_eff_logit_no_missing( scores: (f64, f64, f64), // (s0, s1, s2) table8_count: (usize, usize, usize, usize, usize, usize, usize, 
usize), eff_eps: Option, verbose: bool, -) -> (Coef, bool) { +) -> ((f64, f64, f64), bool) { + // since will be used to Modelfree() + //) -> (Coef, bool) { let (s0, s1, s2) = scores; if let Some(eff_eps) = eff_eps { @@ -20,7 +26,8 @@ pub fn adjust_eff_logit_no_missing( let (s0_new, is_eff_eps_0) = adjust_eff_score_lims2(s0, d0, n0, (s2, s1), clim, sratio); let is_eff_eps = is_eff_eps_2 | is_eff_eps_1 | is_eff_eps_0; - (Coef::Score3((s0_new, s1_new, s2_new)), is_eff_eps) + ((s0_new, s1_new, s2_new), is_eff_eps) + //(Coef::Score3((s0_new, s1_new, s2_new)), is_eff_eps) } EffEps::LimScoreProp(plim, sratio) => { //let (d2, n2, d1, n1, d0, n0, _dm, _nm) = table8_count; @@ -46,7 +53,8 @@ pub fn adjust_eff_logit_no_missing( sratio, ); let is_eff_eps = is_eff_eps_2 | is_eff_eps_1 | is_eff_eps_0; - (Coef::Score3((s0_new, s1_new, s2_new)), is_eff_eps) + ((s0_new, s1_new, s2_new), is_eff_eps) + //(Coef::Score3((s0_new, s1_new, s2_new)), is_eff_eps) } EffEps::LimS2GmodelProp(plim, rec_max_ratio, rec_min_ratio) => { //let (d2, n2, d1, n1, d0, n0, _dm, _nm) = table8_count; @@ -60,7 +68,8 @@ pub fn adjust_eff_logit_no_missing( rec_min_ratio, ); let is_eff_eps = is_eff_eps_2; - (Coef::Score3((s0, s1, s2_new)), is_eff_eps) + ((s0, s1, s2_new), is_eff_eps) + //(Coef::Score3((s0, s1, s2_new)), is_eff_eps) } EffEps::LimS12GmodelProp( plim, @@ -89,7 +98,8 @@ pub fn adjust_eff_logit_no_missing( het_min_ratio, ); let is_eff_eps = is_eff_eps_2; - (Coef::Score3((s0, s1_new, s2_new)), is_eff_eps) + ((s0, s1_new, s2_new), is_eff_eps) + //(Coef::Score3((s0, s1_new, s2_new)), is_eff_eps) } EffEps::LimS2GmodelOverProp( plim, @@ -117,7 +127,8 @@ pub fn adjust_eff_logit_no_missing( //het_min_ratio, ); let is_eff_eps = is_eff_eps_2; - (Coef::Score3((s0, s1, s2_new)), is_eff_eps) + ((s0, s1, s2_new), is_eff_eps) + //(Coef::Score3((s0, s1, s2_new)), is_eff_eps) } EffEps::LimS2GmodelOverKeepSignProp( plim, @@ -138,7 +149,8 @@ pub fn adjust_eff_logit_no_missing( //het_min_ratio, ); let is_eff_eps = 
is_eff_eps_2; - (Coef::Score3((s0, s1, s2_new)), is_eff_eps) + ((s0, s1, s2_new), is_eff_eps) + //(Coef::Score3((s0, s1, s2_new)), is_eff_eps) } EffEps::LimS2GmodelBorderProp( @@ -162,7 +174,8 @@ pub fn adjust_eff_logit_no_missing( verbose, ); let is_eff_eps = is_eff_eps_2; - (Coef::Score3((s0, s1, s2_new)), is_eff_eps) + ((s0, s1, s2_new), is_eff_eps) + //(Coef::Score3((s0, s1, s2_new)), is_eff_eps) } EffEps::LimS2AddProp(plim) => { //let (d2, n2, d1, n1, d0, n0, _dm, _nm) = table8_count; @@ -174,11 +187,13 @@ pub fn adjust_eff_logit_no_missing( plim, ); let is_eff_eps = is_eff_eps_2; - (Coef::Score3((s0, s1, s2_new)), is_eff_eps) + ((s0, s1, s2_new), is_eff_eps) + //(Coef::Score3((s0, s1, s2_new)), is_eff_eps) } } } else { - (Coef::Score3((s0, s1, s2)), false) + ((s0, s1, s2), false) + //(Coef::Score3((s0, s1, s2)), false) } } diff --git a/projects_rust/boosting/src/boosting_train/coefficient/calc.rs b/projects_rust/boosting/src/boosting_train/coefficient/calc.rs index ca2ee32..03cec4f 100644 --- a/projects_rust/boosting/src/boosting_train/coefficient/calc.rs +++ b/projects_rust/boosting/src/boosting_train/coefficient/calc.rs @@ -1,5 +1,24 @@ use genetics::genot::prelude::*; +pub unsafe fn calculate_coef_gt_logit_sm( + gsnv: &GenotSnvRef, + wzs_pad: &[f64], + wls_pad: &[f64], + //phe: &Phe, + //epsilons_wls: (f64, f64), //(epsilon_case: f64, epsilon_cont: f64,) + //epsilons_wzs: (f64, f64), //(epsilon_case: f64, epsilon_cont: f64,) + //eps: Eps, +) -> ((f64, f64, f64), (f64, f64, f64)) { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("avx2") { + //let (wzs_sum, wls_sum) = calc::calculate_coef_gt_logit_simd_sm(gsnv, wzs_pad, wls_pad); + return calculate_coef_gt_logit_simd_sm(gsnv, wzs_pad, wls_pad); + } + } + return calculate_coef_gt_logit_nosimd_sm(gsnv, wzs_pad, wls_pad); +} + // TODO: move to coef.rs #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx2")] @@ -232,3 +251,15 @@ pub 
unsafe fn calculate_coef_gt_logit_simd_sm( //Coef::Score4((s0, s1, s2, sm)) } + +pub unsafe fn calculate_coef_gt_logit_nosimd_sm( + _gsnv: &GenotSnvRef, + _wzs_pad: &[f64], + _wls_pad: &[f64], + //phe: &Phe, + //epsilons_wls: (f64, f64), //(epsilon_case: f64, epsilon_cont: f64,) + //epsilons_wzs: (f64, f64), //(epsilon_case: f64, epsilon_cont: f64,) + //eps: Eps, +) -> ((f64, f64, f64), (f64, f64, f64)) { + unimplemented!() +} diff --git a/projects_rust/boosting/src/boosting_train/loss.rs b/projects_rust/boosting/src/boosting_train/loss.rs index 9175196..4c064b4 100644 --- a/projects_rust/boosting/src/boosting_train/loss.rs +++ b/projects_rust/boosting/src/boosting_train/loss.rs @@ -3,8 +3,8 @@ pub mod calc; use std::collections::HashSet; -use super::coefficient; -use super::compute_pred; +//use super::coefficient; +//use super::compute_pred; use super::epsilon; use super::sample_weight::SampleWeight; use super::table; @@ -347,7 +347,7 @@ pub fn search_min_loss_gt( // search min loss let (loss_min, var_mi, var_si) = loss.search_min(skip_snv); - let var_mi: usize = var_mi.expect("No SNVs were loss<1.0"); + let var_mi: usize = var_mi.unwrap_or_else(|| panic!("No SNVs were loss<1.0")); log::debug!( "contingency table count {:?}", @@ -483,6 +483,7 @@ pub fn calculate_loss_gt( boost_param, ); } + // FIXME: BoostType::LogitNoMissing _ => unimplemented!(), } } @@ -701,78 +702,77 @@ pub fn search_min_loss_gt_pruned_loss_ss_second( } */ -pub fn create_loss_const( - iter: usize, - ps: &[f64], - pred: &mut [u8], - //pred: &mut [B8], - phe: &Phe, - //_n: usize, - //_boosting_param: BoostParam, -) -> WgtBoost { - /* - let mut const_var = Var::construct_var(CovKind::Const, "const".to_owned()); - const_var.set_vals_const(n); - let const_wgt = CovWgt::construct(const_var); - */ - - compute_pred::compute_pred_const(pred); - - //let abcd_sum = calculate::calculate_abcd_sum( - let (table2_sum, is_eps) = table::calculate_table2(pred, ps, phe); - - let loss = 
calc::calculate_loss_ab(table2_sum); - - // since all predict=1 - let wgt = Wgt::construct_const_threshold(0.5); - - /* - let wgt = Wgt::construct_wgt( - WgtKind::Cov(const_wgt), - Model::Binary(BinaryModel::construct_threshold(0.5)), - ); - */ - let mut wgt_boost = WgtBoost::construct_wgt(wgt, iter, loss, table2_sum, is_eps, None); - - // no learning rate - let coef_ti = coefficient::calculate_coefficients_ada( - wgt_boost.contingency_table().unwrap(), - BoostType::Ada, - 1.0, - Some(crate::Eps::Med), // will not be used - ); - - wgt_boost.wgt_mut().model_mut().set_coef(coef_ti); - //wgt.model_mut().set_coef_binary(coef_ti); - - wgt_boost - - /* - if let WgtKind::Cov(ref mut cov_wgt) = &mut (wgt.get_kind_mut()) { - if let Model::Binary(ref mut binary_model) = &mut (cov_wgt.get_model_mut()) { - binary_model.set_coef(coef_ti); - } - } - */ - - /* - if let WgtKind::Snv(ref mut snv_wgt) = &mut (wgt.get_kind_mut()) { - snv_wgt.get_model_mut().set_coef(coef_ti); - //let model = snv_wgt.get_model_mut(); - //model.set_coef(coef_ti); - //(*(*snv_wgt).get_model()).set_coef(coef_ti); - //(*(*snv_wgt).get_model()).set_coef(coef_ti); - } else { - panic!("wgt should be Snv.") - } - */ - - //// TODO: make these func - //compute::compute_mis(&mut mis, &wgt, ys, mistakes, n); - //let abcd_sum = calculate::calculate_abcd_sum(); - - //wgt -} +// // not using now +// pub fn create_loss_const( +// iter: usize, +// ps: &[f64], +// pred: &mut [u8], +// //pred: &mut [B8], +// phe: &Phe, +// //_n: usize, +// //_boosting_param: BoostParam, +// ) -> WgtBoost { +// /* +// let mut const_var = Var::construct_var(CovKind::Const, "const".to_owned()); +// const_var.set_vals_const(n); +// let const_wgt = CovWgt::construct(const_var); +// */ +// compute_pred::compute_pred_const(pred); + +// //let abcd_sum = calculate::calculate_abcd_sum( +// let (table2_sum, is_eps) = table::calculate_table2(pred, ps, phe); + +// let loss = calc::calculate_loss_ab(table2_sum); + +// // since all predict=1 +// let 
wgt = Wgt::construct_const_threshold(0.5); + +// /* +// let wgt = Wgt::construct_wgt( +// WgtKind::Cov(const_wgt), +// Model::Binary(BinaryModel::construct_threshold(0.5)), +// ); +// */ +// let mut wgt_boost = WgtBoost::construct_wgt(wgt, iter, loss, table2_sum, is_eps, None); + +// unimplemented!("Why using Eps::Med?->introduce eff_eps"); +// // no learning rate +// let coef_ti = coefficient::calculate_coef_ada_eps( +// wgt_boost.contingency_table().unwrap(), +// BoostType::Ada, +// 1.0, +// None, //Some(crate::Eps::Med), // will not be used +// ); + +// wgt_boost.wgt_mut().model_mut().set_coef(coef_ti); +// //wgt.model_mut().set_coef_binary(coef_ti); + +// wgt_boost + +// /* +// if let WgtKind::Cov(ref mut cov_wgt) = &mut (wgt.get_kind_mut()) { +// if let Model::Binary(ref mut binary_model) = &mut (cov_wgt.get_model_mut()) { +// binary_model.set_coef(coef_ti); +// } +// } +// */ +// /* +// if let WgtKind::Snv(ref mut snv_wgt) = &mut (wgt.get_kind_mut()) { +// snv_wgt.get_model_mut().set_coef(coef_ti); +// //let model = snv_wgt.get_model_mut(); +// //model.set_coef(coef_ti); +// //(*(*snv_wgt).get_model()).set_coef(coef_ti); +// //(*(*snv_wgt).get_model()).set_coef(coef_ti); +// } else { +// panic!("wgt should be Snv.") +// } +// */ +// //// TODO: make these func +// //compute::compute_mis(&mut mis, &wgt, ys, mistakes, n); +// //let abcd_sum = calculate::calculate_abcd_sum(); + +// //wgt +// } /* #[cfg(test)] diff --git a/projects_rust/boosting/src/boosting_train/loss/calc.rs b/projects_rust/boosting/src/boosting_train/loss/calc.rs index f6262b8..5b95b10 100644 --- a/projects_rust/boosting/src/boosting_train/loss/calc.rs +++ b/projects_rust/boosting/src/boosting_train/loss/calc.rs @@ -1,9 +1,10 @@ use super::epsilon; use super::table; +use super::BoostType; use super::LossStruct; use crate::boosting_train::coefficient; use crate::boosting_train::sample_weight::SampleWeight; -use crate::{BoostParam, ContingencyTable, Eps}; +use crate::{BoostParam, 
ContingencyTable, EffEps, Eps}; use genetics::genot::prelude::*; use genetics::samples::prelude::*; use genetics::wgt::Coef; @@ -27,6 +28,18 @@ pub fn calculate_loss_table7(table: ContingencyTable) -> f64 { m + 2.0 * ((d2 * n2).sqrt() + (d1 * n1).sqrt() + (d0 * n0).sqrt()) } +pub fn calculate_loss_table7_coef(table: ContingencyTable, coef: Coef) -> f64 { + let (d2, n2, d1, n1, d0, n0, m) = table.seven(); + //m + 2.0 * ((d2 * n2).sqrt() + (d1 * n1).sqrt() + (d0 * n0).sqrt()) + let (s0, s1, s2, _) = coef.score4_f64(); + m + d2 * (-s2).exp() + + d1 * (-s1).exp() + + d0 * (-s0).exp() + + n2 * (s2).exp() + + n1 * (s1).exp() + + n0 * (s0).exp() +} + pub fn calculate_loss_table7_or_5(table: ContingencyTable) -> f64 { if let ContingencyTable::Seven(_) = table { calculate_loss_table7(table) @@ -280,6 +293,8 @@ pub unsafe fn calculate_loss_gt_constada_simd( assert_eq!(losss.inner_mut().len(), genot.m() * 2); assert_eq!(phe.n(), genot.n()); + unimplemented!("effeps not implemented"); + //let n = phe.n(); let epsilons = epsilon::calculate_epsilons(sample_weight.ps().unwrap(), phe, boost_param.eps()); //let epsilons = epsilon::calculate_epsilons(&ps_pad[..n], phe, boost_param.eps()); @@ -630,30 +645,47 @@ unsafe fn calculate_loss_gt_freemodelmissing_simd_sm( phe: &Phe, epsilons: (f64, f64), //(epsilon_case: f64, epsilon_cont: f64,) eps: Option, + eff_eps: Option, + boost_type: BoostType, ) -> f64 { let table7_ori = table::calculate_table7_sum_simd(gsnv, ps_pad, phe); - if eps.is_none() { + let table7 = if eps.is_none() { + table7_ori + } else { + if eps.unwrap().dom() { + unimplemented!("Not implemented effeps"); + //let (table7_or_5, _) = table::adjust_eps_table7_dom(table7_ori, eps.unwrap()); + //let loss_ = calculate_loss_table7_or_5(table7_or_5); + } else { + let (table7, _) = table::adjust_eps_table7_nondom(table7_ori, epsilons, eps.unwrap()); + table7 + } + }; + + let (coef, _) = coefficient::calculate_coef_ada_eps( + table7, gsnv, phe, eps, eff_eps, boost_type, true, 
false, + ); + //let coef = coefficient::calculate_coef_freemodelmissing_eps(table7,eff_eps); + + let loss_ = calculate_loss_table7_coef(table7, coef); + // faster but not using eff_eps + //let loss_ = calculate_loss_table7(table7); + loss_ + + /* if eps.is_none() { let loss_ = calculate_loss_table7(table7_ori); return loss_; } // TODO: cleaner if eps.is_some() && eps.unwrap().dom() { - let (table7_or_5, _) = table::adjust_eps_table7_dom( - table7_ori, //(d2_sum, n2_sum, d1_sum, n1_sum, d0_sum, n0_sum), - eps.unwrap(), - ); - - let loss_ = calculate_loss_table7_or_5(table7_or_5); - - loss_ + unimplemented!("Not implemented effeps"); + //let (table7_or_5, _) = table::adjust_eps_table7_dom(table7_ori, eps.unwrap()); + //let loss_ = calculate_loss_table7_or_5(table7_or_5); + //loss_ } else { - let (table7, _) = table::adjust_eps_table7_nondom( - table7_ori, //(d2_sum, n2_sum, d1_sum, n1_sum, d0_sum, n0_sum), - epsilons, - eps.unwrap(), - ); + let (table7, _) = table::adjust_eps_table7_nondom(table7_ori, epsilons, eps.unwrap()); //if mi % 20000 == 0 { // log::debug!("table7 {:?}", table7); @@ -665,7 +697,7 @@ unsafe fn calculate_loss_gt_freemodelmissing_simd_sm( let loss_ = calculate_loss_table7(table7); loss_ - } + } */ } // unsafe is necessary with #[target_feature] @@ -703,6 +735,8 @@ pub unsafe fn calculate_loss_gt_freemodelmissing_simd( phe, epsilons, boost_param.eps(), + boost_param.eff_eps(), + boost_param.boost_type(), ) } }); @@ -1199,8 +1233,8 @@ pub unsafe fn calculate_loss_gt_logit_simd( epsilons_wls, boost_param.eps(), //boost_param.learning_rate(), - boost_param.boost_type(), boost_param.eff_eps(), + boost_param.boost_type(), true, false, ); @@ -1252,14 +1286,11 @@ pub unsafe fn calculate_loss_gt_logit_add_simd( if skip_snv.contains(&mi) { *loss = f64::MAX; } else { - let coef = coefficient::calculate_coef_logit_add_on_loss( + let coef = coefficient::calculate_coef_logit_add( &genot.to_genot_snv(mi), sample_weight.wzs_pad().unwrap(), 
sample_weight.wls_pad().unwrap(), - ///////////////////////// - /////////////////////// TMP - //None, - //boost_param.learning_rate(), + // no lr ); // debug //let (c, a) = coef.linearconst_f64(); @@ -1365,6 +1396,8 @@ pub fn calculate_loss_gt_freemodelmissing_nosimd( ) { assert_eq!(losss.inner_mut().len(), genot.m()); + unimplemented!("ny eff_eps"); + let epsilons = epsilon::calculate_epsilons(sample_weight.ps().unwrap(), phe, boost_param.eps()); //log::debug!("epsilon case, cont: {:.2e},{:.2e}", epsilons.0, epsilons.1); @@ -1464,7 +1497,7 @@ mod tests { &mut losss, &dataset.genot(), &ps, - &dataset.samples().phe(), + &dataset.samples().phe_unwrap(), // ConstAda BoostParam::new_type1(), ); @@ -1483,7 +1516,7 @@ mod tests { assert_eq!(losss.inner().len(), dataset.snvs().snvs_n() * 2); - //let ys = vec![0u8; dataset.samples().phe().inner().len()]; + //let ys = vec![0u8; dataset.samples().phe_unwrap().inner().len()]; let n = dataset.genot().n(); let len_n = n / 8 + 5; let ys = vec![0u8; len_n]; @@ -1503,7 +1536,7 @@ mod tests { //&dataset.genot().genot_inner().inner(), &ps, &ys, - //&dataset.samples().phe().inner(), + //&dataset.samples().phe_unwrap().inner(), // ConstAda dataset.genot().n(), BoostParam::new_type1(), @@ -1518,7 +1551,7 @@ mod tests { &mut losss, &dataset.genot(), &ps, - &dataset.samples().phe(), + &dataset.samples().phe_unwrap(), BoostParam::new_type1(), ); } @@ -1531,7 +1564,7 @@ mod tests { &mut losss, &dataset.genot(), &ps, - &dataset.samples().phe(), + &dataset.samples().phe_unwrap(), BoostParam::new_type1(), ); } @@ -1544,7 +1577,7 @@ mod tests { &mut losss, &dataset.genot(), &ps, - &dataset.samples().phe(), + &dataset.samples().phe_unwrap(), BoostParam::new_type1(), ); } @@ -1617,100 +1650,101 @@ mod tests { //(genot, phe, ps) } - #[test] - fn test_calculate_loss_gt_constada_simd_2() { - // This error below means SIMD memory is not aligned. 
- // "process didn't exit successfully: (signal: 11, SIGSEGV: invalid memory reference)" - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - if is_x86_feature_detected!("avx2") { - let (genot, phe, sample_weight) = setup_test2(); - - let m = genot.m(); - let losss = vec![0.0f64; 2 * m]; - let mut losss = LossStruct::ConstAda(losss, m, 2); - - unsafe { - calculate_loss_gt_constada_simd( - &mut losss, - &genot, - &sample_weight, - //&ps, - &phe, - BoostParam::new_type1(), - &HashSet::::new(), - ); - } - //calculate_loss_gt_simd(&mut losss, &mistakes, &ps, &ys, m, n); - //println!("losss: {:?}", losss); - - // abcd - // ps: [0.1,0.2,0.3,0.4] - // ys: [0,1,0,1] - // eps: (0.2, 0.2) - // dom: [0.6,0.3,0.0,0.1] - // -> [0.8,0.5,0.2,0.3] - // rec: [0.4,0.0,0.2,0.4] - // ->[0.6,0.2,0.4,0.6] - // dom: 1.7548090 - // rec: 1.67261622 - // -> not realistic due to small n - assert_float_absolute_eq!(losss.inner_mut()[0], 1.7548090); - assert_float_absolute_eq!(losss.inner_mut()[0], 1.7548090); - //assert!(is_eq_f64(losss.inner_mut()[1], 1.67261622, 1e-7)); - //assert!(is_eq_f64(losss.inner_mut()[0], 1.7548090, 1e-7)); - //assert!(is_eq_f64(losss.inner_mut()[1], 1.67261622, 1e-7)); - } - } - } - - #[test] - fn test_calculate_loss_gt_constada_nosimd_2() { - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - if is_x86_feature_detected!("avx2") { - let (genot, phe, sw) = setup_test2(); - - let m = genot.m(); - let losss = vec![0.0f64; 2 * m]; - let mut losss = LossStruct::ConstAda(losss, m, 2); - unsafe { - calculate_loss_gt_constada_simd( - &mut losss, - &genot, - &sw, - &phe, - BoostParam::new_type1(), - &HashSet::::new(), - ); - } - - let losss_nosimd = vec![0.0f64; 2 * m]; - let mut losss_nosimd = LossStruct::ConstAda(losss_nosimd, m, 2); - calculate_loss_gt_constada_nosimd( - &mut losss_nosimd, - &genot, - &sw, - &phe, - BoostParam::new_type1(), - ); - - assert_float_absolute_eq!(losss_nosimd.inner_mut()[0], losss.inner_mut()[0]); - 
assert_float_absolute_eq!(losss_nosimd.inner_mut()[1], losss.inner_mut()[1]); - //assert!(is_eq_f64( - // losss_nosimd.inner_mut()[0], - // losss.inner_mut()[0], - // 1e-7 - //)); - //assert!(is_eq_f64( - // losss_nosimd.inner_mut()[1], - // losss.inner_mut()[1], - // 1e-7 - //)); - } - } - } + // TODO: after eff_eps + // #[test] + // fn test_calculate_loss_gt_constada_simd_2() { + // // This error below means SIMD memory is not aligned. + // // "process didn't exit successfully: (signal: 11, SIGSEGV: invalid memory reference)" + + // #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + // { + // if is_x86_feature_detected!("avx2") { + // let (genot, phe, sample_weight) = setup_test2(); + + // let m = genot.m(); + // let losss = vec![0.0f64; 2 * m]; + // let mut losss = LossStruct::ConstAda(losss, m, 2); + + // unsafe { + // calculate_loss_gt_constada_simd( + // &mut losss, + // &genot, + // &sample_weight, + // //&ps, + // &phe, + // BoostParam::new_type1(), + // &HashSet::::new(), + // ); + // } + // //calculate_loss_gt_simd(&mut losss, &mistakes, &ps, &ys, m, n); + // //println!("losss: {:?}", losss); + + // // abcd + // // ps: [0.1,0.2,0.3,0.4] + // // ys: [0,1,0,1] + // // eps: (0.2, 0.2) + // // dom: [0.6,0.3,0.0,0.1] + // // -> [0.8,0.5,0.2,0.3] + // // rec: [0.4,0.0,0.2,0.4] + // // ->[0.6,0.2,0.4,0.6] + // // dom: 1.7548090 + // // rec: 1.67261622 + // // -> not realistic due to small n + // assert_float_absolute_eq!(losss.inner_mut()[0], 1.7548090); + // assert_float_absolute_eq!(losss.inner_mut()[0], 1.7548090); + // //assert!(is_eq_f64(losss.inner_mut()[1], 1.67261622, 1e-7)); + // //assert!(is_eq_f64(losss.inner_mut()[0], 1.7548090, 1e-7)); + // //assert!(is_eq_f64(losss.inner_mut()[1], 1.67261622, 1e-7)); + // } + // } + // } + + // #[test] + // fn test_calculate_loss_gt_constada_nosimd_2() { + // #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + // { + // if is_x86_feature_detected!("avx2") { + // let (genot, phe, sw) = setup_test2(); 
+ + // let m = genot.m(); + // let losss = vec![0.0f64; 2 * m]; + // let mut losss = LossStruct::ConstAda(losss, m, 2); + // unsafe { + // calculate_loss_gt_constada_simd( + // &mut losss, + // &genot, + // &sw, + // &phe, + // BoostParam::new_type1(), + // &HashSet::::new(), + // ); + // } + + // let losss_nosimd = vec![0.0f64; 2 * m]; + // let mut losss_nosimd = LossStruct::ConstAda(losss_nosimd, m, 2); + // calculate_loss_gt_constada_nosimd( + // &mut losss_nosimd, + // &genot, + // &sw, + // &phe, + // BoostParam::new_type1(), + // ); + + // assert_float_absolute_eq!(losss_nosimd.inner_mut()[0], losss.inner_mut()[0]); + // assert_float_absolute_eq!(losss_nosimd.inner_mut()[1], losss.inner_mut()[1]); + // //assert!(is_eq_f64( + // // losss_nosimd.inner_mut()[0], + // // losss.inner_mut()[0], + // // 1e-7 + // //)); + // //assert!(is_eq_f64( + // // losss_nosimd.inner_mut()[1], + // // losss.inner_mut()[1], + // // 1e-7 + // //)); + // } + // } + // } #[test] fn test_calculate_loss_gt_freemodelmissing_simd_3() { @@ -1732,7 +1766,8 @@ mod tests { &genot, &ps, &phe, - BoostParam::new_type1(), + BoostParam::new_type2(), + //BoostParam::new_type1(), &HashSet::::new(), ); } @@ -1748,45 +1783,46 @@ mod tests { } } - #[test] - fn test_calculate_loss_gt_freemodelmissing_nosimd_3() { - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - if is_x86_feature_detected!("avx2") { - let (genot, phe, sw) = setup_test2(); - - let m = genot.m(); - let losss = vec![0.0f64; m]; - let mut losss = LossStruct::LossOne(losss, m); - unsafe { - calculate_loss_gt_freemodelmissing_simd( - &mut losss, - &genot, - &sw, - &phe, - BoostParam::new_type1(), - &HashSet::::new(), - ); - } - - let losss_nosimd = vec![0.0f64; m]; - let mut losss_nosimd = LossStruct::LossOne(losss_nosimd, m); - calculate_loss_gt_freemodelmissing_nosimd( - &mut losss_nosimd, - &genot, - &sw, - &phe, - BoostParam::new_type1(), - ); - - assert_float_absolute_eq!(losss_nosimd.inner_mut()[0], 
losss.inner_mut()[0]); - - //assert!(is_eq_f64( - // losss_nosimd.inner_mut()[0], - // losss.inner_mut()[0], - // 1e-7 - //)); - } - } - } + // #[test] + // fn test_calculate_loss_gt_freemodelmissing_nosimd_3() { + // #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + // { + // if is_x86_feature_detected!("avx2") { + // let (genot, phe, sw) = setup_test2(); + + // let m = genot.m(); + // let losss = vec![0.0f64; m]; + // let mut losss = LossStruct::LossOne(losss, m); + // unsafe { + // calculate_loss_gt_freemodelmissing_simd( + // &mut losss, + // &genot, + // &sw, + // &phe, + // BoostParam::new_type2(), + // //BoostParam::new_type1(), + // &HashSet::::new(), + // ); + // } + + // let losss_nosimd = vec![0.0f64; m]; + // let mut losss_nosimd = LossStruct::LossOne(losss_nosimd, m); + // calculate_loss_gt_freemodelmissing_nosimd( + // &mut losss_nosimd, + // &genot, + // &sw, + // &phe, + // BoostParam::new_type2(), + // ); + + // assert_float_absolute_eq!(losss_nosimd.inner_mut()[0], losss.inner_mut()[0]); + + // //assert!(is_eq_f64( + // // losss_nosimd.inner_mut()[0], + // // losss.inner_mut()[0], + // // 1e-7 + // //)); + // } + // } + // } } diff --git a/projects_rust/boosting/src/boosting_train/regression_cov.rs b/projects_rust/boosting/src/boosting_train/regression_cov.rs index 81f176d..a967062 100644 --- a/projects_rust/boosting/src/boosting_train/regression_cov.rs +++ b/projects_rust/boosting/src/boosting_train/regression_cov.rs @@ -150,9 +150,6 @@ pub fn logistic_regression_covs(samples: &Samples, iteration_start: usize) -> Ve // TODO: should pass WgtBoosts and add inside pub fn logistic_regression_covs(samples: &Samples, iteration_start: usize) -> Vec { log::debug!("use smartcore for logreg_cov."); - //log::debug!("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - //log::debug!("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - //log::debug!("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); let wgts = 
regression::logistic_regression_covs(samples); let mut wgts_cov: Vec = Vec::new(); diff --git a/projects_rust/boosting/src/boosting_train/table.rs b/projects_rust/boosting/src/boosting_train/table.rs index 7f152d9..7f4d232 100644 --- a/projects_rust/boosting/src/boosting_train/table.rs +++ b/projects_rust/boosting/src/boosting_train/table.rs @@ -9,8 +9,7 @@ pub const MACHINE_EPS: f64 = 1e-12; //pub type WgtColumn = Coef; -// TODO: make Five -// TODO:remove Six (same as Seven) +// TODO: reverse order: (d0, n0, d1, n1, d2, n2, m) #[derive(Copy, Clone, Debug)] pub enum ContingencyTable { // AdaBoost @@ -27,25 +26,36 @@ pub enum ContingencyTable { // FreeModelMissing // (D2, N2, D1, N1, D0, N0, M); Diseased, Not diseased, Missing Seven((f64, f64, f64, f64, f64, f64, f64)), + // count + // (D2, N2, D1, N1, D0, N0, DM, NM); Diseased, Not diseased, Missing + EightCount((usize, usize, usize, usize, usize, usize, usize, usize)), } impl ContingencyTable { pub fn new_four(x: (f64, f64, f64, f64)) -> ContingencyTable { let x = ContingencyTable::Four(x); - x.check(); + x.check_f64(); x } pub fn new_five(x: (f64, f64, f64, f64, f64)) -> ContingencyTable { let x = ContingencyTable::Five(x); - x.check(); + x.check_f64(); x } pub fn new_seven(x: (f64, f64, f64, f64, f64, f64, f64)) -> ContingencyTable { let x = ContingencyTable::Seven(x); - x.check(); + x.check_f64(); x } + + pub fn new_eight_count( + x: (usize, usize, usize, usize, usize, usize, usize, usize), + ) -> ContingencyTable { + let x = ContingencyTable::EightCount(x); + x + } + pub fn two(self) -> (f64, f64) { match self { ContingencyTable::Two(x) => x, @@ -71,6 +81,13 @@ impl ContingencyTable { } } + pub fn eight_count(self) -> (usize, usize, usize, usize, usize, usize, usize, usize) { + match self { + ContingencyTable::EightCount(x) => x, + _ => panic!("Not eight count."), + } + } + pub fn is_five(self) -> bool { if let ContingencyTable::Seven(_) = self { true @@ -86,7 +103,7 @@ impl ContingencyTable { } } - pub fn 
check(self) { + pub fn check_f64(self) { self.check_all_positive(); } @@ -123,6 +140,7 @@ impl ContingencyTable { && (n0 > -MACHINE_EPS) && (m >= -MACHINE_EPS) } + Self::EightCount(_) => true, }; if !f { panic!( @@ -277,7 +295,8 @@ pub fn adjust_eps_logit( epsilons_wzs: (f64, f64), //(epsilon_case: f64, epsilon_cont: f64,) epsilons_wls: (f64, f64), //(epsilon_case: f64, epsilon_cont: f64,) eps: Option, - table8_count: (usize, usize, usize, usize, usize, usize, usize, usize), + //table8_count: (usize, usize, usize, usize, usize, usize, usize, usize), + table8_count: ContingencyTable, ) -> ((f64, f64, f64), (f64, f64, f64), bool) { if eps.is_some() && eps.unwrap().dom() { panic!("wrong eps") @@ -336,14 +355,15 @@ pub fn adjust_eps_logit( fn adjust_eps_logit_nondom_add_cat( wzs_sum: (f64, f64, f64), epsilons: (f64, f64), - table8_count: (usize, usize, usize, usize, usize, usize, usize, usize), + //table8_count: (usize, usize, usize, usize, usize, usize, usize, usize), + table8_count: ContingencyTable, negative_for_cont: bool, ) -> ((f64, f64, f64), bool) { let (wzs_sum2, wzs_sum1, wzs_sum0) = wzs_sum; //let (mut wzs_sum2, mut wzs_sum1, mut wzs_sum0)=wzs_sum; //let (epsilon_case, epsilon_cont) = epsilons_wzs; - let (d2, n2, d1, n1, d0, n0, _dm, _nm) = table8_count; + let (d2, n2, d1, n1, d0, n0, _dm, _nm) = table8_count.eight_count(); fn add_eps_sum( wzs_sum: f64, diff --git a/projects_rust/boosting/src/lib.rs b/projects_rust/boosting/src/lib.rs index 1836d33..4cb261e 100644 --- a/projects_rust/boosting/src/lib.rs +++ b/projects_rust/boosting/src/lib.rs @@ -47,7 +47,7 @@ pub fn run_boosting( gfmt: GenotFormat, fin_phe: Option<&Path>, phe_name: Option<&str>, - cov_name: &str, + cov_name: Option<&str>, boost_method: BoostMethod, boost_params: &BoostParams, fin_snv: Option<&Path>, @@ -63,6 +63,7 @@ pub fn run_boosting( //learning_rates: &[f64], is_monitor: bool, //nsnvs_monitor: Option>, + make_major_a2_train: bool, ) { // check fwgt does not exist. 
if !is_resume { @@ -99,6 +100,7 @@ pub fn run_boosting( //fin_sample_val, use_adjloss, prune_snv, + make_major_a2_train, ); log::info!("Created dataset: {} sec", start_time.elapsed().as_secs()); @@ -163,7 +165,8 @@ pub fn run_boosting( .zip(learning_rates.iter()) .max_by(|((_, acc_a), _), ((_, acc_b), _)| acc_a.partial_cmp(acc_b).unwrap()) .map(|((nsnv, _), lr)| (nsnv, lr)) - .expect("Cannot determine the highest accuracy."); + .unwrap_or_else(|| panic!("Cannot determine the highest accuracy.")); + //.expect("Cannot determine the highest accuracy."); log::info!( "The best parameter is #SNVs={:?}, learning rate={:?}.", nsnv_acc_max, @@ -260,7 +263,7 @@ pub fn run_boosting_integrate_cv( gfmt: GenotFormat, fin_phe: Option<&Path>, phe_name: Option<&str>, - cov_name: &str, + cov_name: Option<&str>, boost_method: BoostMethod, boost_params_types: &BoostParamsTypes, fin_snv: Option<&Path>, @@ -275,13 +278,14 @@ pub fn run_boosting_integrate_cv( prune_snv: Option, //learning_rates: &[f64], is_monitor: bool, + make_major_a2_train: bool, cross_validation: Option, seed: Option, ) { match cross_validation { Some(cvn) => { // FIXME: if fin_sample is Some(), extract samples only from them - // FIXME: write down sample id + // FIXME: write down sample id let n_in: usize = io_genot::compute_num_sample(fin, gfmt).unwrap(); // create cv samples let sample_idx_cvs: Vec<(Vec, Vec)> = if cvn == 1 { @@ -327,21 +331,13 @@ pub fn run_boosting_integrate_cv( &fin, gfmt, phe_buf.as_deref(), - //fin_phe.as_deref(), phe_name.as_deref(), - &cov_name, + cov_name.as_deref(), boost_method, &boost_params_types, snv_buf.as_deref(), - //fin_snv, Some(&sample_buf), - //fin_sample, Some(&sample_val_buf), - //fin_sample_val, - //fin_snv.as_deref(), - //fin_sample.as_deref(), - //fin_cov.as_deref(), - //fin_sample_val.as_deref(), use_adjloss, use_const_for_loss, is_resume, @@ -349,6 +345,7 @@ pub fn run_boosting_integrate_cv( prune_snv, //&learning_rates, true, //is_monitor, + make_major_a2_train, ); } } 
@@ -367,21 +364,13 @@ pub fn run_boosting_integrate_cv( &fin, gfmt, phe_buf.as_deref(), - //fin_phe.as_deref(), phe_name.as_deref(), - &cov_name, + cov_name.as_deref(), boost_method, &boost_params_types, snv_buf.as_deref(), - //fin_snv, sample_buf.as_deref(), - //fin_sample, sample_val_buf.as_deref(), - //fin_sample_val, - //fin_snv.as_deref(), - //fin_sample.as_deref(), - //fin_cov.as_deref(), - //fin_sample_val.as_deref(), use_adjloss, use_const_for_loss, is_resume, @@ -389,6 +378,7 @@ pub fn run_boosting_integrate_cv( prune_snv, //&learning_rates, is_monitor, + make_major_a2_train, ); } } @@ -403,7 +393,7 @@ pub fn run_boosting_integrate( phe_buf: Option<&[u8]>, //fin_phe: Option<&Path>, phe_name: Option<&str>, - cov_name: &str, + cov_name: Option<&str>, boost_method: BoostMethod, boost_params_types: &BoostParamsTypes, snv_buf: Option<&[u8]>, @@ -421,6 +411,7 @@ pub fn run_boosting_integrate( prune_snv: Option, //learning_rates: &[f64], is_monitor: bool, + make_major_a2_train: bool, ) { // TODO: if all wgt exceeds #SNVs, then exit here. 
io_genot::check_valid_fin(fin, gfmt); @@ -456,6 +447,7 @@ pub fn run_boosting_integrate( //fin_sample_val, use_adjloss, prune_snv, + make_major_a2_train, ); log::info!("Created dataset: {} sec", start_time.elapsed().as_secs()); @@ -693,7 +685,7 @@ fn load_dataset_boosting( //fin_phe: Option<&Path>, phe_buf: Option<&[u8]>, phe_name: Option<&str>, - cov_name: &str, + cov_name: Option<&str>, boost_params: &BoostParams, snv_buf: Option<&[u8]>, //fin_snv: Option<&Path>, @@ -704,6 +696,7 @@ fn load_dataset_boosting( //fin_sample_val: Option<&Path>, use_adjloss: bool, prune_snv: Option, + make_major_a2_train: bool, ) -> (Dataset, Option) { //let phe_buf = fin_phe.map(|x| genetics::textfile::read_file_to_end(x, None).unwrap()); //let snv_buf = fin_snv.map(|x| genetics::textfile::read_file_to_end(x, None).unwrap()); @@ -712,14 +705,16 @@ fn load_dataset_boosting( // FIXME: check if in .fam or .phe, col=0 is not duplicated let boost_param = boost_params.param_lr_none(); - let use_missing = boost_param.boost_type().use_missing(); + //let use_missing = boost_param.boost_type().use_missing(); + let fill_missing = boost_param.boost_type().fill_missing(); // create dataset // extract snvs by loss function if let Some(prop_prune_snv) = prune_snv { log::info!("Prune SNVs by decreasing loss: {}", prop_prune_snv); let start = Instant::now(); - sample_val_buf.expect("Not Implemented"); + sample_val_buf.unwrap_or_else(|| panic!("Not Implemented")); + //.expect("Not Implemented"); //fin_sample_val.expect("Not Implemented"); // TODO: better @@ -755,28 +750,13 @@ fn load_dataset_boosting( cov_name, snv_buf.as_deref(), sample_buf.as_deref(), - use_missing, Some(&filt_snv), + fill_missing, + make_major_a2_train, None, ); - /* let dataset = Dataset::new_old( - fin, - gfmt, - fin_phe, - phe_name, - cov_name, - fin_snv, - fin_sample, - fin_cov, - use_missing, - Some(&filt_snv), - None, - ); - */ - //TODO: mv boostmethod::classic let n = dataset.samples().samples_n(); - //let mut pred: Vec = 
vec![0u8; n]; let mut scores: Vec = vec![0.0; n]; @@ -787,7 +767,7 @@ fn load_dataset_boosting( boost_param.sample_weight_clip(), boost_param.sample_weight_wls_clip(), ); - sample_weight.renew_sample_weight(&scores, dataset.samples().phe()); + sample_weight.renew_sample_weight(&scores, dataset.samples().phe_unwrap()); let mut wgts = WgtBoosts::new(boost_param.boost_type()); let _ = boosting_train::boosting_covs( @@ -806,7 +786,7 @@ fn load_dataset_boosting( &mut loss, dataset.genot(), &sample_weight, - dataset.samples().phe(), + dataset.samples().phe_unwrap(), boost_param, &HashSet::::new(), use_adjloss, @@ -832,7 +812,7 @@ fn load_dataset_boosting( log::debug!("created use_snvs_loss {}", use_snvs_loss.len()); - let dataset = Dataset::new( + let dataset = Dataset::new_boost_training( fin, gfmt, phe_buf.as_deref(), @@ -843,9 +823,10 @@ fn load_dataset_boosting( sample_buf.as_deref(), //fin_snv, //fin_sample, - use_missing, + fill_missing, Some(&use_snvs_loss), None, + make_major_a2_train, ); log::debug!( @@ -859,7 +840,7 @@ fn load_dataset_boosting( //let phe_buf = fin_phe.map(|x| genetics::textfile::read_file_to_end(x, None).unwrap()); //let snv_buf = fin_snv.map(|x| genetics::textfile::read_file_to_end(x, None).unwrap()); //let sample_buf = fin_sample.map(|x| genetics::textfile::read_file_to_end(x, None).unwrap()); - let dataset = Dataset::new( + let dataset = Dataset::new_boost_training( fin, gfmt, phe_buf.as_deref(), @@ -870,9 +851,11 @@ fn load_dataset_boosting( sample_buf.as_deref(), //fin_snv, //fin_sample, - use_missing, + //use_missing, + fill_missing, None, None, + make_major_a2_train, ); //dataset_ext = dataset; @@ -882,7 +865,7 @@ fn load_dataset_boosting( // need to align and sort snv //let sample_val_buf = // fin_sample_val.map(|x| genetics::textfile::read_file_to_end(x, None).unwrap()); - let dataset_val = Dataset::new( + let dataset_val = Dataset::new_boost_training( fin, gfmt, phe_buf.as_deref(), @@ -893,9 +876,11 @@ fn load_dataset_boosting( 
sample_val_buf.as_deref(), //fin_snv, //fin_sample_val, - use_missing, + //use_missing, + fill_missing, None, Some(dataset.snvs()), + make_major_a2_train, ); //dataset_ext_val = Some(dataset_val); Some(dataset_val) @@ -920,7 +905,7 @@ pub fn run_boosting_score_cv( gfmt: GenotFormat, fin_phe: Option<&Path>, //fin_phe: &Path, - phe_name: Option<&str>, + //phe_name: Option<&str>, cov_name: Option<&str>, is_every_para: bool, iterations_in: Option<&[usize]>, @@ -931,6 +916,7 @@ pub fn run_boosting_score_cv( learning_rates: &[f64], use_iter: bool, cross_vali: Option, + use_snv_pos: bool, ) { match cross_vali { Some(cvn) => { @@ -953,7 +939,7 @@ pub fn run_boosting_score_cv( gfmt, phe_buf.as_deref(), //fin_phe.as_deref(), - phe_name.as_deref(), + //phe_name.as_deref(), cov_name, is_every_para, iterations_in, @@ -965,6 +951,7 @@ pub fn run_boosting_score_cv( //boost_param, &learning_rates, use_iter, + use_snv_pos, ); } } @@ -979,7 +966,7 @@ pub fn run_boosting_score_cv( gfmt, phe_buf.as_deref(), //fin_phe.as_deref(), - phe_name.as_deref(), + //phe_name.as_deref(), cov_name, is_every_para, iterations_in, @@ -991,6 +978,7 @@ pub fn run_boosting_score_cv( //boost_param, &learning_rates, use_iter, + use_snv_pos, ); } } @@ -1004,7 +992,7 @@ pub fn run_boosting_score( phe_buf: Option<&[u8]>, //fin_phe: Option<&Path>, //fin_phe: &Path, - phe_name: Option<&str>, + //phe_name: Option<&str>, cov_name: Option<&str>, is_every_para: bool, iterations_in: Option<&[usize]>, @@ -1015,6 +1003,7 @@ pub fn run_boosting_score( //boost_param: BoostParam, learning_rates: &[f64], use_iter: bool, + use_snv_pos: bool, ) { io_genot::check_valid_fin(fin, gfmt); @@ -1022,6 +1011,7 @@ pub fn run_boosting_score( //let sample_buf = fin_sample.map(|x| genetics::textfile::read_file_to_end(x, None).unwrap()); if !is_every_para { + // score of the best para let dout_wgt = dout_wgt.unwrap(); let file_wgt_para = wgt_boost::io::get_file_wgt_best(dout_wgt); @@ -1037,12 +1027,13 @@ pub fn run_boosting_score( 
gfmt, phe_buf.as_deref(), //fin_phe, - phe_name, + //phe_name, cov_name, &file_wgt_para, sample_buf.as_deref(), //fin_sample, //boost_param, + use_snv_pos, ); } else { if let Some(dout_wgt) = dout_wgt { @@ -1066,7 +1057,7 @@ pub fn run_boosting_score( gfmt, phe_buf.as_deref(), //fin_phe, - phe_name, + //phe_name, cov_name, iterations_in.unwrap(), &file_wgt_para, @@ -1074,6 +1065,7 @@ pub fn run_boosting_score( //fin_sample, //boost_param, use_iter, + use_snv_pos, ); //let dout_wgt_para = wgt_boost::io::get_dname_para(dout_wgt, learning_rate); /* @@ -1124,7 +1116,7 @@ pub fn run_boosting_score( gfmt, phe_buf.as_deref(), //fin_phe, - phe_name, + //phe_name, cov_name, iterations_in.unwrap(), &file_wgt, @@ -1132,6 +1124,7 @@ pub fn run_boosting_score( //fin_sample, //boost_param, use_iter, + use_snv_pos, ); } else { panic!("sth wrong.") diff --git a/projects_rust/boosting/src/wgt_boost/io.rs b/projects_rust/boosting/src/wgt_boost/io.rs index c652bb6..ae999af 100644 --- a/projects_rust/boosting/src/wgt_boost/io.rs +++ b/projects_rust/boosting/src/wgt_boost/io.rs @@ -17,9 +17,9 @@ pub fn get_dir_score(dout: &Path, learning_rate: f64) -> PathBuf { get_dname_para(dout, learning_rate) } -pub fn get_dir_cv(dout: &Path,cvi: usize) -> PathBuf { +pub fn get_dir_cv(dout: &Path, cvi: usize) -> PathBuf { let mut d = dout.to_owned(); - let dpara = String::from("cv-")+ &cvi.to_string(); + let dpara = String::from("cv-") + &cvi.to_string(); d.push(dpara); d } diff --git a/projects_rust/boosting/src/wgt_boosts.rs b/projects_rust/boosting/src/wgt_boosts.rs index c1b66bc..b0e85f1 100644 --- a/projects_rust/boosting/src/wgt_boosts.rs +++ b/projects_rust/boosting/src/wgt_boosts.rs @@ -28,9 +28,6 @@ impl WgtBoosts { columns: io::wgt_columns(boost_type), index_next_write: 0, } - - - } //pub fn new_from_file_pathbuf(fin_wgt: &Path, boost_type: BoostType) -> Self { @@ -117,9 +114,14 @@ impl WgtBoosts { &self.columns } - pub fn use_missing(&self)->bool{ + //pub fn use_missing(&self) -> bool { 
+ // // TODO: ok? + // self.columns.contains(&"scorem".to_string()) + //} + + pub fn fill_missing(&self) -> bool { // TODO: ok? - self.columns.contains(&"scorem".to_string()) + !self.columns.contains(&"scorem".to_string()) } // TODO: test @@ -133,7 +135,7 @@ impl WgtBoosts { .len() } - pub fn last_wgt(&self)->Option<&WgtBoost>{ + pub fn last_wgt(&self) -> Option<&WgtBoost> { self.wgts().last() //if self.wgts_n()==0{ // return None; diff --git a/projects_rust/genetics/Cargo.toml b/projects_rust/genetics/Cargo.toml index b014b16..d770e21 100644 --- a/projects_rust/genetics/Cargo.toml +++ b/projects_rust/genetics/Cargo.toml @@ -15,7 +15,6 @@ plink2 = ["dep:pgenlib","dep:bindgen","dep:zstd"] pgenlib = {package="pgenlib_rust", path = "../../lib/pgenlib_rust", optional = true} bindgen = {version="0.65.1", optional = true} zstd = {version="0.12", optional=true} -#zstd = "0.12" cmatrix={path="../cmatrix"} mysmartcore={ path="../../lib/mysmartcore"} maligned="0" diff --git a/projects_rust/genetics/benches/common.rs b/projects_rust/genetics/benches/common.rs index b2cb499..e39388c 100644 --- a/projects_rust/genetics/benches/common.rs +++ b/projects_rust/genetics/benches/common.rs @@ -50,19 +50,21 @@ fn setup_vars( let snv_buf = fin_snv.map(|x| genetics::textfile::read_file_to_end(x, None).unwrap()); let sample_buf = fin_sample.map(|x| genetics::textfile::read_file_to_end(x, None).unwrap()); - let dataset: Dataset = Dataset::new( + let dataset: Dataset = Dataset::new_boost_training( fin, GenotFormat::Plink1, None, None, - "", + None, snv_buf.as_deref(), //fin_snv, sample_buf.as_deref(), //fin_sample, - false, + //false, + true, None, None, + false, ); let n = dataset.samples().samples_n(); let m = dataset.snvs().snvs_n(); diff --git a/projects_rust/genetics/src/alloc.rs b/projects_rust/genetics/src/alloc.rs index fc1672e..bcc5ada 100644 --- a/projects_rust/genetics/src/alloc.rs +++ b/projects_rust/genetics/src/alloc.rs @@ -72,6 +72,11 @@ pub fn get_available_memory() -> 
Option { } } +/// memory as GB +pub fn mem_gb(mem: usize) -> f64 { + (mem as f64) / ((1024usize * 1024 * 1024) as f64) +} + /* // should create unused_memory()? pub fn get_total_memory_old() -> usize { let mut system = sysinfo::System::new_all(); diff --git a/projects_rust/genetics/src/bin/genetics.rs b/projects_rust/genetics/src/bin/genetics.rs index 2c972a3..3adb14f 100644 --- a/projects_rust/genetics/src/bin/genetics.rs +++ b/projects_rust/genetics/src/bin/genetics.rs @@ -47,8 +47,8 @@ struct ScoreArgs { file_sample: Option, #[arg(long)] file_phe: Option, - #[arg(long)] - phe: Option, + //#[arg(long)] + //phe: Option, #[arg(long)] cov: Option, #[arg(long)] @@ -73,7 +73,6 @@ impl GenotFormatArg { } fn main() { - unimplemented!(); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] @@ -118,14 +117,14 @@ fn main() { let fin = PathBuf::from(args.file_plink); let genot_format = args.genot_format.to_naive(); let fin_phe = args.file_phe.map(|x| PathBuf::from(x)); - let phe_name = match args.phe { - None => None, - // &*String -> str - Some(y) => match &*y { - "None" => None, - z => Some(z.to_string()), - }, - }; + //let phe_name = match args.phe { + // None => None, + // // &*String -> str + // Some(y) => match &*y { + // "None" => None, + // z => Some(z.to_string()), + // }, + //}; let cov_name = args.cov; let fin_sample = args.file_sample.map(|x| PathBuf::from(x)); @@ -146,13 +145,18 @@ fn main() { &fin, genot_format, fin_phe.as_deref(), - phe_name.as_deref(), + //phe_name.as_deref(), cov_name.as_deref(), dout_wgt.as_deref(), // use enum? fout_wgt.as_deref(), //fin_cov.as_deref(), fin_sample.as_deref(), + None, + None, is_resume, + false, + false, + false, // TODO ); } } diff --git a/projects_rust/genetics/src/bin/genetics_res.rs b/projects_rust/genetics/src/bin/genetics_res.rs index 4c42a1b..351ba71 100644 --- a/projects_rust/genetics/src/bin/genetics_res.rs +++ b/projects_rust/genetics/src/bin/genetics_res.rs @@ -1,8 +1,6 @@ //! 
General lib and application of general functions //! - Polygenic score //! -//! -//! use clap::{ArgGroup, Args, Parser, Subcommand, ValueEnum}; //#[macro_use] @@ -18,7 +16,7 @@ struct Cli { #[command(subcommand)] command: Commands, - // globa=true makes you able to `-- trian --verbose` + // global=true makes you able to `-- train --verbose` #[arg(long, global = true, help = "Number of threads")] threads: Option, #[arg(long, global = true, help = "Verbose")] @@ -48,12 +46,34 @@ struct ScoreArgs { file_sample: Option, #[arg(long)] file_phe: Option, - #[arg(long)] - phe: Option, + //#[arg(long)] + //phe: Option, #[arg(long)] cov: Option, #[arg(long)] resume: bool, + #[arg( + long, + help = "Concat parameters into one file. Only use wgt file name is *_n-[#snv].wgt for --concat 'n'. " + )] + concat: Option, + #[arg( + long, + help = "Opposite of --concat. Use wgts not calculated in --concat." + )] + no_concat: Option, + #[arg( + long, + help = "Allow snvs or alleles not in genot. The score is ignored." + )] + allow_nonexist_snv: bool, + #[arg( + long, + help = "When matching snvs in wgt and genot, use chromosome and posotion not variant id to match." 
+ )] + use_snv_pos: bool, + #[arg(long, help = "Use column score0-score2.")] + nonadd: bool, } #[derive(Copy, Clone, PartialEq, Eq, Debug, ValueEnum)] @@ -116,7 +136,7 @@ fn main() { let fin = PathBuf::from(args.file_genot); let genot_format = args.genot_format.to_naive(); let fin_phe = args.file_phe.map(|x| PathBuf::from(x)); - let phe_name = args.phe; + //let phe_name = args.phe; let cov_name = args.cov; let fin_sample = args.file_sample.map(|x| PathBuf::from(x)); @@ -125,18 +145,34 @@ fn main() { let is_resume = args.resume; + let concat = args.concat; + let no_concat = args.no_concat; + + if concat.is_some() & no_concat.is_some() { + panic!("--concat and --no-concat cannot be used together."); + } + + let allow_nonexist_snv = args.allow_nonexist_snv; + let use_snv_pos = args.use_snv_pos; + let is_nonadd = args.nonadd; + genetics::run_score( &dout_score, &fin, genot_format, fin_phe.as_deref(), - phe_name.as_deref(), + //phe_name.as_deref(), cov_name.as_deref(), dout_wgt.as_deref(), // use enum? fout_wgt.as_deref(), //fin_cov.as_deref(), fin_sample.as_deref(), + concat.as_deref(), + no_concat.as_deref(), is_resume, + allow_nonexist_snv, + use_snv_pos, + is_nonadd, ); } } diff --git a/projects_rust/genetics/src/bin/test_pgenlib.rs b/projects_rust/genetics/src/bin/test_pgenlib.rs new file mode 100644 index 0000000..afd06a2 --- /dev/null +++ b/projects_rust/genetics/src/bin/test_pgenlib.rs @@ -0,0 +1,137 @@ +//! General lib and application of general functions +//! - Polygenic score +//! +//! +//! 
+ +use clap::{Parser, ValueEnum}; +//#[macro_use] +//extern crate clap; +//use clap::{AppSettings, Arg, ArgMatches, SubCommand}; +//use rayon; +use genetics::GenotFormat; +use std::{path::PathBuf, time::Instant}; +//use clap::app_from_crate; //error + +//#[derive(Debug, Parser)] +//struct Cli { +// #[command(subcommand)] +// command: Commands, +// +// // global=true makes you able to `-- train --verbose` +// #[arg(long, global = true, help = "Number of threads")] +// threads: Option, +// #[arg(long, global = true, help = "Verbose")] +// verbose: bool, +//} + +//#[derive(Debug, Subcommand)] +//enum Commands { +// #[command(about = "score")] +// Score(ScoreArgs), +//} + +#[derive(Parser, Debug)] +#[command(author, version, about = "test pgenlib")] +struct Args { + #[arg(long)] + file_genot: String, + #[arg(long, value_enum)] + genot_format: GenotFormatArg, + #[arg(long)] + file_sample: Option, + #[arg(long, help = "Number of threads")] + threads: Option, + #[arg(long, help = "Verbose")] + verbose: bool, +} + +#[derive(Copy, Clone, PartialEq, Eq, Debug, ValueEnum)] +enum GenotFormatArg { + Plink, + Plink2, + Plink2Vzs, +} + +impl GenotFormatArg { + pub fn to_naive(self) -> GenotFormat { + match self { + GenotFormatArg::Plink => GenotFormat::Plink1, + GenotFormatArg::Plink2 => GenotFormat::Plink2, + GenotFormatArg::Plink2Vzs => GenotFormat::Plink2Vzs, + } + } +} + +fn main() { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("avx2") { + log::info!("Able to use SIMD.") + } else { + log::info!("Not able to use SIMD since avx2 is not detected.") + } + } + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + { + log::info!("Not able to use SIMD since arch is not x86 or x86_64.") + } + + let start = Instant::now(); + + let args = Args::parse(); + println!("args: {:?}", args); + + //let cli = Cli::parse(); + //println!("cli: {:?}", cli); + + let verbose = args.verbose; + if verbose { + std::env::set_var("RUST_LOG", 
"debug"); + } else { + std::env::set_var("RUST_LOG", "info"); + } + env_logger::init(); + + if let Some(n_threads) = args.threads { + //n_threads=n_threads.min(num_cpus::get()); + rayon::ThreadPoolBuilder::new() + .num_threads(n_threads) + .build_global() + .unwrap(); + }; + // otherwise, use default thread number + log::debug!("num_thread set: {}", rayon::current_num_threads()); + + //let dout_score = PathBuf::from(args.dir_score); + let fin = PathBuf::from(args.file_genot); + let _genot_format = args.genot_format.to_naive(); + //let fin_phe = args.file_phe.map(|x| PathBuf::from(x)); + //let phe_name = args.phe; + //let cov_name = args.cov; + let fin_sample = args.file_sample.map(|x| PathBuf::from(x)); + + //let dout_wgt = args.dir_wgt.map(|x| PathBuf::from(x)); + //let fout_wgt = args.file_wgt.map(|x| PathBuf::from(x)); + + //let is_resume = args.resume; + + //let concat = args.concat; + + println!("start test_pgenlib"); + //let g=genetics::test_pgenlib(&fin, genot_format, fin_sample.as_deref()); + + let g_plink2 = genetics::test_pgenlib(&fin, GenotFormat::Plink2Vzs, fin_sample.as_deref()); + let g_plink1 = genetics::test_pgenlib(&fin, GenotFormat::Plink1, fin_sample.as_deref()); + + //assert_eq!(g_plink2, g_plink1); + if g_plink2 == g_plink1 { + println!("OK: plink1==plink2"); + } else { + println!("**WARNING**: plink1!=plink2"); + } + + let end = start.elapsed(); + log::info!("It took {} seconds.", end.as_secs()); + log::info!("Done!!"); +} diff --git a/projects_rust/genetics/src/cov.rs b/projects_rust/genetics/src/cov.rs index f7772b4..f909ccd 100644 --- a/projects_rust/genetics/src/cov.rs +++ b/projects_rust/genetics/src/cov.rs @@ -323,22 +323,22 @@ mod tests { use crate::{io_genot, sample, samples, GenotFormat}; - fn setup_test() -> (Option, usize, HashMap) { - let fin = PathBuf::from("../../test/data/toy1/genot"); - let fin_var = Some(PathBuf::from("../../test/data/toy1/genot.cov")); - let fin_sample = None; + // fn setup_test() -> (Option, usize, HashMap) 
{ + // let fin = PathBuf::from("../../test/data/toy1/genot"); + // let fin_var = Some(PathBuf::from("../../test/data/toy1/genot.cov")); + // let fin_sample = None; - let gfmt = GenotFormat::Plink1; - let n_in: usize = io_genot::compute_num_sample(&fin, gfmt).unwrap(); - println!("n_in: {}", n_in); + // let gfmt = GenotFormat::Plink1; + // let n_in: usize = io_genot::compute_num_sample(&fin, gfmt).unwrap(); + // println!("n_in: {}", n_in); - let (n, use_samples) = sample::make_use_samples(fin_sample, &fin, gfmt); - println!("n: {}", n); + // let (n, use_samples) = sample::make_use_samples(fin_sample, &fin, gfmt); + // println!("n: {}", n); - let sample_id_to_n = samples::create_sample_id_to_n(&fin, gfmt, Some(&use_samples)); + // let sample_id_to_n = samples::create_sample_id_to_n(&fin, gfmt, Some(&use_samples)); - (fin_var, n_in, sample_id_to_n) - } + // (fin_var, n_in, sample_id_to_n) + // } /* #[test] fn test_load_vars() { diff --git a/projects_rust/genetics/src/dataset.rs b/projects_rust/genetics/src/dataset.rs index 1b1a4bd..03431ec 100644 --- a/projects_rust/genetics/src/dataset.rs +++ b/projects_rust/genetics/src/dataset.rs @@ -8,13 +8,9 @@ pub mod io_genot; pub mod samples; pub mod snvs; -use crate::{ - genot::{BaseGenot, BaseGenotMut, BaseGenotSnv}, - sample, - snv, vec, - wgt::WgtTrait, - Covs, Genot, GenotFormat, Samples, Snvs, Wgts, -}; +use crate::genot::prelude::*; +//genot::{BaseGenot, BaseGenotMut, BaseGenotSnv, BaseGenotSnvMut}, +use crate::{sample, snv, vec, wgt::WgtTrait, Covs, GenotFormat, Samples, Snvs, Wgts}; use rayon::iter::{ParallelBridge, ParallelIterator}; use std::{path::Path, time::Instant}; @@ -34,9 +30,10 @@ impl Dataset { /// /// Input is one of following /// 1. plink2 + fin_phe - /// 2. plink2 (cov and phe in .psam) : phe_buf is option - /// 3. plink1 (phe) + fin_phe (cov) : phe_name is option + /// 2. plink2 (phe) + fin_phe (cov) : phe_name is in psam + /// 3. plink2 (cov and phe in .psam) : phe_buf is option /// 4. 
plink1 + fin_phe (cov and phe) + /// 5. plink1 (phe) + fin_phe (cov) : phe_name is option pub fn new( fin: &Path, gfmt: GenotFormat, @@ -44,30 +41,27 @@ impl Dataset { //phe_buf: &[u8], //fin_phe: Option<&Path>, phe_name: Option<&str>, - cov_name: &str, + cov_name: Option<&str>, extract_snv_buf: Option<&[u8]>, //fin_snv: Option<&Path>, extract_sample_buf: Option<&[u8]>, //fin_sample: Option<&Path>, //fin_cov: Option<&Path>, //to deprecate - use_missing: bool, // TODO: merge filt_snv and fin_snv into use_snvs, filt_snv: Option<&[bool]>, + fill_missing: bool, + make_major_a2_train: bool, snvs_train: Option<&Snvs>, //for use_missing of vali ) -> Self { - let start = Instant::now(); - io_genot::check_valid_fin(fin, gfmt); - let m_in: usize = io_genot::compute_num_snv(fin, gfmt).unwrap(); - log::debug!("m_in: {}", m_in); - let n_in: usize = io_genot::compute_num_sample(fin, gfmt).unwrap(); - log::debug!("n_in: {}", n_in); + //let m_in: usize = io_genot::compute_num_snv(fin, gfmt).unwrap(); + //log::debug!("m_in: {}", m_in); + //let n_in: usize = io_genot::compute_num_sample(fin, gfmt).unwrap(); + //log::debug!("n_in: {}", n_in); // load snvs let snvs_in = io_genot::load_snvs(fin, gfmt); //let snvs_in = plink::load_snvs(fin, m_in); let (mut m, mut use_snvs) = snv::make_use_snvs_buf(extract_snv_buf, &snvs_in); - //let (mut m, mut use_snvs) = snv::make_use_snvs(fin_snv, &snvs_in); - //let (m, use_snvs) = plink::make_use_snvs(fin_snv, &snvs_in); if let Some(filt_snv_) = filt_snv { log::debug!("filt_snv before m: {}", m); @@ -81,32 +75,114 @@ impl Dataset { panic!("Using SNVs are zero. Please check fin_snv.") } log::debug!("m: {}", m); - //let snv_indexs = snv::extract_snvs_consume(snvs_in, &use_snvs, m); let (n, use_samples) = sample::make_use_samples_buf(extract_sample_buf, fin, gfmt); - //let (n, use_samples) = sample::make_use_samples(fin_sample, fin, gfmt); if n == 0 { panic!("Using samples are zero. 
Please check fin_sample.") } log::debug!("n: {}", n); - // do not fill missing here; below - let genot = - io_genot::load::generate_genot(fin, gfmt, m, n, &use_snvs, Some(&use_samples), true); - //plink::load::generate_genot(fin, m, n, &use_snvs, Some(&use_samples), use_missing); + Self::new_use_vec( + fin, + gfmt, + phe_buf, + phe_name, + cov_name, + Some(&use_snvs), + Some(&use_samples), + fill_missing, + make_major_a2_train, + snvs_train, + ) + } - let sample_id_to_n = samples::create_sample_id_to_n(fin, gfmt, Some(&use_samples)); + pub fn new_boost_training( + fin: &Path, + gfmt: GenotFormat, + phe_buf: Option<&[u8]>, + //phe_buf: &[u8], + //fin_phe: Option<&Path>, + phe_name: Option<&str>, + cov_name: Option<&str>, + extract_snv_buf: Option<&[u8]>, + //fin_snv: Option<&Path>, + extract_sample_buf: Option<&[u8]>, + //fin_sample: Option<&Path>, + //fin_cov: Option<&Path>, //to deprecate + fill_missing: bool, + // TODO: merge filt_snv and fin_snv into use_snvs, + filt_snv: Option<&[bool]>, + snvs_train: Option<&Snvs>, //for use_missing of vali + make_major_a2_train: bool, + ) -> Self { + let dataset = Self::new( + fin, + gfmt, + phe_buf, + phe_name, + cov_name, + extract_snv_buf, + extract_sample_buf, + filt_snv, + fill_missing, + make_major_a2_train, + snvs_train, + ); - let ys: Vec = io_genot::load_ys_buf(fin, gfmt, phe_buf, phe_name, &sample_id_to_n); - //let ys: Vec = io_genot::load_ys_buf(fin, gfmt, phe_buf, phe_name, &use_samples); - log::debug!("ys {}", ys[0]); - let covs: Covs = Covs::new(phe_buf, fin, gfmt, cov_name, &sample_id_to_n); + // check if ys exist + if dataset.samples().phe().is_none() { + panic!("Could not load sample phenotype."); + } + + dataset + } + + pub fn new_use_vec( + fin: &Path, + gfmt: GenotFormat, + phe_buf: Option<&[u8]>, + phe_name: Option<&str>, + cov_name: Option<&str>, + use_snvs: Option<&[bool]>, + use_samples: Option<&[bool]>, + fill_missing: bool, + make_major_a2_train: bool, + snvs_train: Option<&Snvs>, //for fill_missing 
and make_major_a2_train of vali + ) -> Self { + let start = Instant::now(); + + let m = match use_snvs { + Some(x) => vec::count_true(x), + None => io_genot::compute_num_snv(&fin, gfmt).unwrap(), + }; + + let n = match use_samples { + Some(x) => vec::count_true(x), + None => io_genot::compute_num_sample(fin, gfmt).unwrap(), + }; + + // do not fill missing here for validation; below + let genot = io_genot::load::generate_genot(fin, gfmt, m, n, use_snvs, use_samples, false); + + let sample_id_to_n = samples::create_sample_id_to_n(fin, gfmt, use_samples); + + let ys: Option> = + io_genot::load_ys_buf_option(fin, gfmt, phe_buf, phe_name, &sample_id_to_n); + let covs: Option = match cov_name { + Some(cov_name) => Some(Covs::new(phe_buf, fin, gfmt, cov_name, &sample_id_to_n)), + None => None, + }; //println!("covs: {:?}", covs.clone().unwrap().cov_indexs()); - let snv_indexs = snv::extract_snvs_consume(snvs_in, &use_snvs, m); + let samples_id = io_genot::load_samples_id(fin, gfmt, use_samples); + let samples = Samples::new(ys.as_deref(), Some(samples_id), covs, n); + + let snvs_in = io_genot::load_snvs(fin, gfmt); + let snv_indexs = match use_snvs { + Some(x) => snv::extract_snvs_consume(snvs_in, x, m), + None => snvs_in, + }; let snvs = Snvs::new_data_from_snv_index(snv_indexs); - // TODO: add names here, instead of using iid in score - let samples = Samples::new(&ys, None, Some(covs), n); log::debug!( "It took {} seconds to create Dataset.", @@ -122,15 +198,10 @@ impl Dataset { dataset.compute_maf(); - // FIXME: flip A1, A2 for lims2 - // TOOD: when s2 is not minor homo but major homo - // if force_a1_minor{} - - // fill missing here + // fill missing to mode here // for training; use maf assuming hwe // for validation; use the training maf - if !use_missing { - // for training + if fill_missing { if let Some(snvs_train) = snvs_train { // for val // unwrap to raise error when None @@ -142,6 +213,16 @@ impl Dataset { } } + if make_major_a2_train { + if let 
Some(snvs_train) = snvs_train { + // for val + dataset.set_major_a2(Some(snvs_train)); + } else { + // for training + dataset.set_major_a2(None); + } + } + dataset } @@ -300,25 +381,65 @@ impl Dataset { } pub fn fill_missing_maf(&mut self, mafs: Option<&Vec>) { - // TODO: how to avoid clone? - // create let mafs:Vec; - // mafs_ref:&[f64]=if .. - let mafs = if let Some(mafs_) = mafs { + // TODO: avoid clone() + // if not using clone(), genot_mut() is error + let mafs_v: Vec; + let mafs = if let Some(mafs) = mafs { + // for validation + mafs + //mafs_.clone() + } else { + // for training + //self.snvs().mafs().unwrap() + //self.snvs().mafs().unwrap().clone() + mafs_v = self.snvs().mafs().unwrap().clone(); + &mafs_v + }; + //let genot = self.genot_mut(); + self.genot_mut() + .iter_snv_mut() + .zip(mafs.iter()) + .par_bridge() + .for_each(|(mut g_snv, maf)| g_snv.fill_missing_mode_maf(*maf)); + //.for_each(|(mut g_snv, maf)| io_genot::load::fill_missing_maf(&mut g_snv, *maf)); + } + + pub fn set_major_a2(&mut self, snvs: Option<&Snvs>) { + // 1. set major as a2 in snvs + // 2. same in genot + // 3. 
update maf + + let mafs_v: Vec; + let mafs = if let Some(snvs) = snvs { // for validation - mafs_.clone() + snvs.mafs().unwrap() } else { // for training - self.snvs().mafs().unwrap().clone() + mafs_v = self.snvs().mafs().unwrap().clone(); + &mafs_v }; - let genot = self.genot_mut(); - genot + + // reverse snvs + self.snvs_mut() + .snv_indexs_mut() + .iter_mut() + .zip(mafs.iter()) + .filter(|(_, maf)| **maf > 0.5) + .par_bridge() + .for_each(|(snv_index, _)| snv_index.reverse_alleles()); + + // reverse genot + self.genot_mut() .iter_snv_mut() .zip(mafs.iter()) + .filter(|(_, maf)| **maf > 0.5) .par_bridge() - .for_each(|(mut g_snv, maf)| io_genot::load::fill_missing_maf(&mut g_snv, *maf)); - //genot.iter_snv_mut() - // .par_bridge() - // .for_each(|mut g_snv| plink::load::fill_missing_maf(&mut g_snv)); + .for_each(|(mut g_snv, _)| g_snv.reverse_allele()); + + // update maf + self.compute_maf(); + + // check all maf<0.5 for training? } pub fn compute_maf(&mut self) { @@ -333,24 +454,28 @@ impl Dataset { self.snvs_mut().set_maf(mafs) } + // for boosting + // merge partly to new() pub fn new_score( fin: &Path, gfmt: GenotFormat, phe_buf: Option<&[u8]>, //fin_phe: Option<&Path>, - phe_name: Option<&str>, + //phe_name: Option<&str>, cov_name: Option<&str>, extract_sample_buf: Option<&[u8]>, //fin_sample: Option<&Path>, //fin_cov: Option<&Path>, wgts: &mut [W], //wgts: &[WgtBoost], - use_missing: bool, // use WgtBoosts and use wgts.use_missing() + //use_missing: bool, // use WgtBoosts and use wgts.use_missing() + fill_missing: bool, + use_snv_pos: bool, ) -> Self { - let m_in: usize = io_genot::compute_num_snv(fin, gfmt).unwrap(); - log::debug!("m_in {}", m_in); - let n_in: usize = io_genot::compute_num_sample(fin, gfmt).unwrap(); - log::debug!("n_in {}", n_in); + //let m_in: usize = io_genot::compute_num_snv(fin, gfmt).unwrap(); + //log::debug!("m_in {}", m_in); + //let n_in: usize = io_genot::compute_num_sample(fin, gfmt).unwrap(); + //log::debug!("n_in {}", n_in); 
let (n, use_samples) = sample::make_use_samples_buf(extract_sample_buf, fin, gfmt); //let (n, use_samples) = sample::make_use_samples(fin_sample, fin, gfmt); @@ -362,7 +487,7 @@ impl Dataset { //let samples_id = plink::load_samples_id(fin, &use_samples); let sample_id_to_n = samples::create_sample_id_to_n(fin, gfmt, Some(&use_samples)); - let ys: Vec = io_genot::load_ys_buf(fin, gfmt, phe_buf, phe_name, &sample_id_to_n); + //let ys: Vec = io_genot::load_ys_buf(fin, gfmt, phe_buf, phe_name, &sample_id_to_n); //let ys: Vec = io_genot::load_ys(fin, gfmt, fin_phe, phe_name, &use_samples); let covs: Option = match cov_name { Some(cov_name) => Some(Covs::new(phe_buf, fin, gfmt, cov_name, &sample_id_to_n)), @@ -373,7 +498,10 @@ impl Dataset { //let covs: Option> = // cov::load_vars(fin_cov, n_in, &sample_id_to_n, cov::CovKind::Cov); - let samples = Samples::new(&ys, None, covs, n); + let samples_id = io_genot::load_samples_id(fin, gfmt, Some(&use_samples)); + + let samples = Samples::new_nophe(Some(samples_id), covs, n); + //let samples = Samples::new(&ys, None, covs, n); //let mut wgts: Vec = wgt_boost::io::load_wgts(fin_wgt); //wgt_boost::io::set_covs(&mut wgts, covs.as_deref(), n); @@ -386,7 +514,8 @@ impl Dataset { wgts, n, Some(&use_samples), - use_missing, + fill_missing, + use_snv_pos, ); Dataset { @@ -403,7 +532,7 @@ impl Dataset { gfmt: GenotFormat, phe_buf: Option<&[u8]>, //fin_phe: Option<&Path>, - phe_name: Option<&str>, + //phe_name: Option<&str>, cov_name: Option<&str>, extract_sample_buf: Option<&[u8]>, //fin_sample: Option<&Path>, @@ -411,11 +540,14 @@ impl Dataset { // TODO: &mut[&mut[Wgt]], wgts: &mut [Wgts], //wgts: &[WgtBoost], - ) -> Dataset { - let m_in: usize = io_genot::compute_num_snv(fin, gfmt).unwrap(); - log::debug!("m_in {}", m_in); - let n_in: usize = io_genot::compute_num_sample(fin, gfmt).unwrap(); - log::debug!("n_in {}", n_in); + allow_nonexist_snv: bool, + use_snv_pos: bool, + // TODO: fill_as_mean + ) -> Self { + //let m_in: usize = 
io_genot::compute_num_snv(fin, gfmt).unwrap(); + //log::debug!("m_in {}", m_in); + //let n_in: usize = io_genot::compute_num_sample(fin, gfmt).unwrap(); + //log::debug!("n_in {}", n_in); let (n, use_samples) = sample::make_use_samples_buf(extract_sample_buf, fin, gfmt); //let (n, use_samples) = sample::make_use_samples(fin_sample, fin, gfmt); @@ -428,7 +560,7 @@ impl Dataset { let sample_id_to_n = samples::create_sample_id_to_n(fin, gfmt, Some(&use_samples)); - let ys: Vec = io_genot::load_ys_buf(fin, gfmt, phe_buf, phe_name, &sample_id_to_n); + //let ys: Vec = io_genot::load_ys_buf(fin, gfmt, phe_buf, phe_name, &sample_id_to_n); //let ys: Vec = io_genot::load_ys(fin, gfmt, fin_phe, phe_name, &use_samples); let covs: Option = match cov_name { Some(cov_name) => Some(Covs::new(phe_buf, fin, gfmt, cov_name, &sample_id_to_n)), @@ -440,14 +572,17 @@ impl Dataset { //let covs: Option> = // cov::load_vars(fin_cov, n_in, &sample_id_to_n, cov::CovKind::Cov); - let samples = Samples::new(&ys, None, covs, n); + let samples_id = io_genot::load_samples_id(fin, gfmt, Some(&use_samples)); + + let samples = Samples::new_nophe(Some(samples_id), covs, n); //let mut wgts: Vec = wgt_boost::io::load_wgts(fin_wgt); //wgt_boost::io::set_covs(&mut wgts, covs.as_deref(), n); // set genotype index in wgt // TODO: argparse - let use_missing = false; + //let use_missing = false; + let fill_missing = true; let genot = io_genot::load_score::load_genotypes_for_score_multiwgts( fin, gfmt, @@ -455,9 +590,16 @@ impl Dataset { //&mut wgts, n, Some(&use_samples), - use_missing, + fill_missing, + allow_nonexist_snv, + use_snv_pos, ); + // TMP + //for snv in genot.iter_snv() { + // println!("snv {:?}", &snv.vals()[..10]); + //} + Dataset { genot, // unnecessary since index is in WgtKInd diff --git a/projects_rust/genetics/src/dataset/io_genot.rs b/projects_rust/genetics/src/dataset/io_genot.rs index 2cd6d98..bebff36 100644 --- a/projects_rust/genetics/src/dataset/io_genot.rs +++ 
b/projects_rust/genetics/src/dataset/io_genot.rs @@ -7,7 +7,6 @@ pub mod load; pub mod load_score; use crate::sample; -use crate::textfile::read_file_to_end; use crate::{textfile, vec, Chrom, SnvId}; pub use file_genot::FileGenot; @@ -23,10 +22,6 @@ use std::io::Read; use std::io::SeekFrom; use std::path::{Path, PathBuf}; -//use crate::alloc; //same -//mod alloc; // error -//use super::snv::Snv; // unnecessary - // 8 x 1 bit //type B8 = u8; // 4 x 2 bit @@ -370,6 +365,122 @@ pub fn check_valid_bed( Ok(bed_size) } +fn load_fam(fin: &Path, gfmt: GenotFormat) -> Vec { + match gfmt { + GenotFormat::Plink1 => { + panic!("Do not use --phe-name for plink1."); + } + GenotFormat::Plink2 | GenotFormat::Plink2Vzs => { + let fin_fam = fname_fam_exist_chrom(fin, gfmt).unwrap(); + textfile::read_file_to_end(&fin_fam, None).unwrap() + //&phe_buf_v[..] + //phe_buf_v + } + } +} + +// If phe_buf=None, phe_name=None, gfmt!=plink1, then return None +pub fn load_ys_buf_option( + fin: &Path, + gfmt: GenotFormat, + phe_buf: Option<&[u8]>, + phe_name: Option<&str>, + sample_id_to_n: &HashMap, + //use_samples: &[bool], +) -> Option> { + let phe_buf_v: Vec; + let valss = if let Some(phe_name) = phe_name { + let phe_buf = match phe_buf { + None => { + // phe is in .psam + phe_buf_v = load_fam(fin, gfmt); + &phe_buf_v[..] + //match gfmt { + // GenotFormat::Plink1 => { + // panic!("Do not use --phe-name for plink1."); + // } + // GenotFormat::Plink2 | GenotFormat::Plink2Vzs => { + // let fin_fam = fname_fam_exist_chrom(fin, gfmt).unwrap(); + // phe_buf_v = textfile::read_file_to_end(&fin_fam, None).unwrap(); + // &phe_buf_v[..] + // } + //} + } + Some(x) => { + // if phe_name is in phe_buf, use the col + // elif phe in .psam, use the col + if textfile::coli_of_header_buf(x, phe_name).is_some() { + x + } else { + phe_buf_v = load_fam(fin, gfmt); + if textfile::coli_of_header_buf(&phe_buf_v[..], phe_name).is_some() { + &phe_buf_v[..] 
+ } else { + panic!("phe_name is not in fin-phe or psam file."); + } + } + } + }; + //TODO + let col_iid = 0; + let col_phe = textfile::coli_of_header_buf(phe_buf, phe_name) + .unwrap_or_else(|| panic!("phe_name is not in fin-phe or psam.")); + textfile::load_table_cols_buf(&phe_buf[..], &[col_iid, col_phe], true) + } else { + match gfmt { + GenotFormat::Plink1 => { + //TODO + let col_iid = 1; + let col_y = 5; + let fin_fam = fname_fam_exist_chrom(fin, gfmt).unwrap(); + textfile::load_table_cols(&fin_fam, &[col_iid, col_y], false) + } + GenotFormat::Plink2 | GenotFormat::Plink2Vzs => { + return None; + //panic!("Use fin_phe for plink2.") + } + } + }; + + // TODO: use text::coli_of_header_buf() + //let col_phe=textfile::coli_of_header_buf(buf, col) + + let vals = sample::vals_align_id(&valss[1], &valss[0], sample_id_to_n); + + log::debug!("vals[0]: {}", vals[0]); + + let uniq = vec::uniq_string(&vals); + let code_type = if uniq == HashSet::from_iter([String::from("0"), String::from("1")]) { + "01" + } else if uniq == HashSet::from_iter([String::from("1"), String::from("2")]) { + "12" + } else { + panic!("Unknown coding {:?}", uniq); + }; + + fn decode_phe(val: &str, code_type: &str) -> bool { + let code: u8 = (*val).parse::().unwrap(); + if code_type == "01" { + match code { + 0 | 1 => code != 0, + z => panic!("Unknown phenotype included: {}.", z), + } + } else if code_type == "12" { + match code { + 1 | 2 => (code - 1) != 0, + z => panic!("Unknown phenotype included: {}.", z), + } + } else { + // TODO: match + panic!("Unknown code_type {:?}", code_type); + } + } + + let ys = vals.iter().map(|x| decode_phe(x, code_type)).collect(); + + Some(ys) +} + // TODO: what if fin.fam includes phe=9? 
// -> let user exclude samples with --fin-sample pub fn load_ys_buf( @@ -382,7 +493,9 @@ pub fn load_ys_buf( ) -> Vec { //let mut ys: Vec = Vec::with_capacity(len_n); - let phe_buf_v: Vec; + load_ys_buf_option(fin, gfmt, phe_buf, phe_name, sample_id_to_n).unwrap() + + /* let phe_buf_v: Vec; let valss = if let Some(phe_name) = phe_name { let phe_buf = match phe_buf { None => { @@ -402,8 +515,8 @@ pub fn load_ys_buf( }; //TODO let col_iid = 0; - let col_phe = - textfile::coli_of_header_buf(phe_buf, phe_name).expect("phe_name is not in fin-phe."); + let col_phe = textfile::coli_of_header_buf(phe_buf, phe_name) + .unwrap_or_else(|| panic!("phe_name is not in fin-phe.")); //let col = vec![phe_name]; textfile::load_table_cols_buf(&phe_buf[..], &[col_iid, col_phe], true) } else { @@ -455,7 +568,7 @@ pub fn load_ys_buf( let ys = vals.iter().map(|x| decode_phe(x, code_type)).collect(); - ys + ys */ } /* // TODO: n should be Option or remove @@ -546,7 +659,7 @@ fn load_snvs_text_plink( let fin_bim = fname_plinks_snv(fin, gfmt, chrom); //let fin_bim = fname_plinks_snv(fin, gfmt, None); - let buf = read_file_to_end(&fin_bim, None); + let buf = textfile::read_file_to_end(&fin_bim, None); if buf.is_err() { return None; } @@ -608,9 +721,10 @@ fn load_snvs_tsv( compress: Option<&str>, ) -> Option> { let fin_bim = fname_plinks_snv(fin, gfmt, chrom); + //println!("fin_bim {:?}", fin_bim); let mut snvs_in: Vec = vec![]; - let buf = read_file_to_end(&fin_bim, compress); + let buf = textfile::read_file_to_end(&fin_bim, compress); if buf.is_err() { return None; } @@ -622,7 +736,8 @@ fn load_snvs_tsv( .from_reader(&buf[..]); for result in rdr.deserialize() { - let record: SnvPlink2In = result.expect(&format!("Error while reading {:?}", fin_bim)); + let record: SnvPlink2In = + result.unwrap_or_else(|_| panic!("Error while reading: {:?}", fin_bim)); snvs_in.push(record) } } else { @@ -638,7 +753,7 @@ fn load_snvs_tsv( let cols = [1usize, 0, 3, 4, 5]; for result in rdr.records() { - let 
record = result.expect(&format!("Error while reading {:?}", fin_bim)); + let record = result.unwrap_or_else(|_| panic!("Error while reading: {:?}", fin_bim)); println!("{:?}", record); snvs_in.push(SnvPlink2In::new( record[cols[0]].to_string(), @@ -678,7 +793,10 @@ fn load_snvs_chrom(fin: &Path, gfmt: GenotFormat, chrom: Option<&Chrom>) -> Opti // since match snv_index is not simple? pub fn load_snvs(fin: &Path, gfmt: GenotFormat) -> Vec { if !judge_split_chrom(fin) { - load_snvs_chrom(fin, gfmt, None).unwrap() + //load_snvs_chrom(fin, gfmt, None).unwrap() + + load_snvs_chrom(fin, gfmt, None) + .unwrap_or_else(|| panic!("Could not load snvs from fin: {:?}", fin)) } else { let mut snvs: Vec = vec![]; for chrom_i in Chrom::variants().iter() { @@ -734,7 +852,7 @@ fn load_samples_id_tsv(fin: &Path, gfmt: GenotFormat, use_samples: Option<&[bool for result in rdr.deserialize() { let record: SamplePlink2In = - result.expect(&format!("Error while reading {:?}", &fin_fam)); + result.unwrap_or_else(|_| panic!("Error while reading: {:?}", fin_fam)); samples_in.push(record) } } else { @@ -747,8 +865,8 @@ fn load_samples_id_tsv(fin: &Path, gfmt: GenotFormat, use_samples: Option<&[bool .unwrap(); for result in rdr.records() { - let record = result.expect(&format!("Error while reading {:?}", &fin_fam)); - println!("{:?}", record); + let record = result.unwrap_or_else(|_| panic!("Error while reading: {:?}", fin_fam)); + //println!("{:?}", record); // the first line sould be FID or IID and both are fine samples_in.push(SamplePlink2In::new(record[0].to_string())) } diff --git a/projects_rust/genetics/src/dataset/io_genot/load.rs b/projects_rust/genetics/src/dataset/io_genot/load.rs index ec68620..489a523 100644 --- a/projects_rust/genetics/src/dataset/io_genot/load.rs +++ b/projects_rust/genetics/src/dataset/io_genot/load.rs @@ -16,14 +16,15 @@ pub fn generate_genot_snv( mi: usize, n: usize, use_samples: Option<&[bool]>, - use_missing: bool, + //use_missing: bool, + fill_missing: 
bool, ) -> GenotSnv { match gfmt { GenotFormat::Plink1 => { - plink::generate_genot_snv_plink(fin, gfmt, mi, n, use_samples, use_missing) + plink::generate_genot_snv_plink(fin, gfmt, mi, n, use_samples, fill_missing) } GenotFormat::Plink2 | GenotFormat::Plink2Vzs => { - call_generate_genot_snv_plink2(fin, gfmt, mi, n, use_samples, use_missing) + call_generate_genot_snv_plink2(fin, gfmt, mi, n, use_samples, fill_missing) //if cfg!(feature = "plink2") { // plink2::load_snv_plink2(fin, gfmt, mi, n, use_samples, use_missing) //} else { @@ -40,12 +41,11 @@ fn call_generate_genot_snv_plink2( mi: usize, n: usize, use_samples: Option<&[bool]>, - use_missing: bool, + fill_missing: bool, ) -> GenotSnv { - plink2::generate_genot_snv_plink2(fin, gfmt, mi, n, use_samples, use_missing) + plink2::generate_genot_snv_plink2(fin, gfmt, mi, n, use_samples, fill_missing) } -// TODO: cleaner #[cfg(not(feature = "plink2"))] fn call_generate_genot_snv_plink2( _: &Path, @@ -64,31 +64,49 @@ pub fn generate_genot( gfmt: GenotFormat, m: usize, n: usize, - use_snvs: &[bool], + use_snvs: Option<&[bool]>, use_samples: Option<&[bool]>, - use_missing: bool, + //use_missing: bool, + fill_missing: bool, ) -> Genot { match gfmt { GenotFormat::Plink1 => { - plink::generate_genot_plink(fin, gfmt, m, n, use_snvs, use_samples, use_missing) + plink::generate_genot_plink(fin, gfmt, m, n, use_snvs, use_samples, fill_missing) } GenotFormat::Plink2 | GenotFormat::Plink2Vzs => { - call_generate_genot_plink2(fin, gfmt, m, n, use_snvs, use_samples, use_missing) + call_generate_genot_plink2(fin, gfmt, m, n, use_snvs, use_samples, fill_missing) } } } -#[cfg(feature = "plink2")] +/* // error: plink2 is not loaded fn call_generate_genot_plink2( fin: &Path, gfmt: GenotFormat, m: usize, n: usize, - use_snvs: &[bool], + use_snvs: Option<&[bool]>, use_samples: Option<&[bool]>, use_missing: bool, ) -> Genot { - plink2::generate_genot_plink2(fin, gfmt, m, n, use_snvs, use_samples, use_missing) + if cfg!(feature = 
"plink2") { + plink2::generate_genot_plink2(fin, gfmt, m, n, use_snvs, use_samples, use_missing) + } else { + panic!("Cannot use plink2 in this program feature. Use --feature plink2"); + } +} */ + +#[cfg(feature = "plink2")] +fn call_generate_genot_plink2( + fin: &Path, + gfmt: GenotFormat, + m: usize, + n: usize, + use_snvs: Option<&[bool]>, + use_samples: Option<&[bool]>, + fill_missing: bool, +) -> Genot { + plink2::generate_genot_plink2(fin, gfmt, m, n, use_snvs, use_samples, fill_missing) } // TODO: cleaner @@ -98,7 +116,7 @@ fn call_generate_genot_plink2( _: GenotFormat, _: usize, _: usize, - _: &[bool], + _: Option<&[bool]>, _: Option<&[bool]>, _: bool, ) -> Genot { @@ -107,101 +125,107 @@ fn call_generate_genot_plink2( // make missing to mode // in pred -fn fill_missing(pred: &mut GenotSnvMut) { - // count 0,1,2 - let mut counts_allele = vec![0usize; 4]; - //let mut counts_allele = Vec::with_capacity(4); - //for _ in 0..=3 { - // counts_allele.push(0); - //} - - let n = pred.n(); - for ni in 0..n { - counts_allele[pred.get_val_unchecked(ni) as usize] += 1; - } - - let mut mode: usize = 4; - let mut mode_counts = 0; - for i in 0..=2 { - if counts_allele[i] > mode_counts { - mode_counts = counts_allele[i]; - mode = i; - } - } - let mode = mode as u8; - assert_ne!(mode, 4); - - for ni in 0..n { - if pred.get_val_unchecked(ni) == 3 { - pred.set(mode, ni); - } - } - // check all are non-missing? - // -> performance... 
-} +// TODO: mv to BaseGenotSnvMut +//fn fill_missing_snv(pred: &mut GenotSnvMut) { +// pred.fill_missing_mode() +// //// count 0,1,2 +// //let mut counts_allele = vec![0usize; 4]; +// ////let mut counts_allele = Vec::with_capacity(4); +// ////for _ in 0..=3 { +// //// counts_allele.push(0); +// ////} +// +// //let n = pred.n(); +// //for ni in 0..n { +// // counts_allele[pred.get_val_unchecked(ni) as usize] += 1; +// //} +// +// //let mut mode: usize = 4; +// //let mut mode_counts = 0; +// //for i in 0..=2 { +// // if counts_allele[i] > mode_counts { +// // mode_counts = counts_allele[i]; +// // mode = i; +// // } +// //} +// //let mode = mode as u8; +// //assert_ne!(mode, 4); +// +// //for ni in 0..n { +// // if pred.get_val_unchecked(ni) == 3 { +// // pred.set(mode, ni); +// // } +// //} +// //// check all are non-missing? +// //// -> performance... +//} // make missing to mode // in pred -pub fn fill_missing_maf(pred: &mut GenotSnvMut, maf: f64) { - let mode: u8 = if maf < 1.0 / 3.0f64 { - 0 - } else if maf > 2.0 / 3.0f64 { - 2 - } else { - 1 - }; - - let n = pred.n(); - - for ni in 0..n { - if pred.get_val_unchecked(ni) == 3 { - pred.set(mode, ni); - } - } - // check all are non-missing? - // -> performance... -} - +// TODO: mv to BaseGenotSnvMut +//pub fn fill_missing_maf(pred: &mut GenotSnvMut, maf: f64) { +// pred.fill_missing_mode_maf(maf); +// +// //let mode: u8 = if maf < 1.0 / 3.0f64 { +// // 0 +// //} else if maf > 2.0 / 3.0f64 { +// // 2 +// //} else { +// // 1 +// //}; +// +// //let n = pred.n(); +// +// //for ni in 0..n { +// // if pred.get_val_unchecked(ni) == 3 { +// // pred.set(mode, ni); +// // } +// //} +// // check all are non-missing? +// // -> performance... +//} + +// not used? 
/// x: vector of minor allele (0,1,2,3) /// convert 3 (missing) to mode -pub fn missing_to_mode(x: &mut [u8]) { - // count 0,1,2,3 - let mut counts_allele = vec![0usize; 4]; - //let mut counts_allele = Vec::with_capacity(4); - //for _ in 0..=3 { - // counts_allele.push(0); - //} - - for x_v in x.iter() { - counts_allele[*x_v as usize] += 1; - } - - let mut mode: u8 = 4; - let mut mode_counts = 0; - for i in 0..=2 { - if counts_allele[i] > mode_counts { - mode_counts = counts_allele[i]; - mode = i as u8; - } - } - - assert_ne!(mode, 4); - - for x_v in x.iter_mut() { - if *x_v == 3 { - *x_v = mode; - } - } -} +//fn missing_to_mode(x: &mut [u8]) { +// // count 0,1,2,3 +// let mut counts_allele = vec![0usize; 4]; +// //let mut counts_allele = Vec::with_capacity(4); +// //for _ in 0..=3 { +// // counts_allele.push(0); +// //} +// +// for x_v in x.iter() { +// counts_allele[*x_v as usize] += 1; +// } +// +// let mut mode: u8 = 4; +// let mut mode_counts = 0; +// for i in 0..=2 { +// if counts_allele[i] > mode_counts { +// mode_counts = counts_allele[i]; +// mode = i as u8; +// } +// } +// +// assert_ne!(mode, 4); +// +// for x_v in x.iter_mut() { +// if *x_v == 3 { +// *x_v = mode; +// } +// } +//} #[cfg(test)] mod tests { use crate::GenotFormat; use super::*; + use crate::samples; use crate::{io_genot, sample, snv}; use std::path::PathBuf; - use crate::samples; /// check if _file() and _file_loadpart() are the same @@ -251,25 +275,26 @@ mod tests { (fin, ys, m, n, use_snvs, use_samples) } - #[test] - fn test_missing_to_mode() { - let mut x = vec![0, 1, 1, 2, 1, 3]; - let x_exp = vec![0, 1, 1, 2, 1, 1]; + // mv to genot_struct.rs + //#[test] + //fn test_missing_to_mode() { + // let mut x = vec![0, 1, 1, 2, 1, 3]; + // let x_exp = vec![0, 1, 1, 2, 1, 1]; - missing_to_mode(&mut x); + // missing_to_mode(&mut x); - for (x_v, x_exp_v) in x.iter().zip(x_exp.iter()) { - assert_eq!(*x_v, *x_exp_v); - } + // for (x_v, x_exp_v) in x.iter().zip(x_exp.iter()) { + // assert_eq!(*x_v, 
*x_exp_v); + // } - // when missing is the mode - let mut x = vec![0, 1, 1, 3, 3, 3]; - let x_exp = vec![0, 1, 1, 1, 1, 1]; + // // when missing is the mode + // let mut x = vec![0, 1, 1, 3, 3, 3]; + // let x_exp = vec![0, 1, 1, 1, 1, 1]; - missing_to_mode(&mut x); + // missing_to_mode(&mut x); - for (x_v, x_exp_v) in x.iter().zip(x_exp.iter()) { - assert_eq!(*x_v, *x_exp_v); - } - } + // for (x_v, x_exp_v) in x.iter().zip(x_exp.iter()) { + // assert_eq!(*x_v, *x_exp_v); + // } + //} } diff --git a/projects_rust/genetics/src/dataset/io_genot/load/plink.rs b/projects_rust/genetics/src/dataset/io_genot/load/plink.rs index 34875dd..32613a1 100644 --- a/projects_rust/genetics/src/dataset/io_genot/load/plink.rs +++ b/projects_rust/genetics/src/dataset/io_genot/load/plink.rs @@ -15,7 +15,8 @@ pub fn generate_genot_snv_plink( mi: usize, n: usize, use_samples: Option<&[bool]>, - use_missing: bool, + //use_missing: bool, + fill_missing: bool, ) -> GenotSnv { let reader = BufReader::new(File::open(io_genot::fname_plinks_genot(fin, gfmt, None)).unwrap()); @@ -29,8 +30,10 @@ pub fn generate_genot_snv_plink( n, ); - if !use_missing { - super::fill_missing(&mut g_snv.as_genot_snv_mut_snv()); + //if !use_missing { + if fill_missing { + //super::fill_missing_snv(&mut g_snv.as_genot_snv_mut_snv()); + g_snv.as_genot_snv_mut_snv().fill_missing_mode() } g_snv @@ -84,21 +87,21 @@ pub fn generate_genot_plink( gfmt: GenotFormat, m: usize, n: usize, - use_snvs: &[bool], + use_snvs: Option<&[bool]>, use_samples: Option<&[bool]>, - use_missing: bool, + //use_missing: bool, + fill_missing: bool, ) -> Genot { log::debug!("to prepare Genot m, n: {}, {}", m, n); let mem = alloc::get_available_memory(); log::debug!("available mem: {:?} bytes", mem); let genot_byte = Genot::byte(m, n); - log::info!("Temporary skip panic even for insufficient memory"); - /* + //log::info!("Temporary skip panic even for insufficient memory"); match mem { Some(x) => { log::debug!( - "genot vs available mem, {} bytes 
vs {} bytes", + "genot vs available mem, {:.3} bytes vs {:.3} bytes", genot_byte, x ); @@ -107,10 +110,17 @@ pub fn generate_genot_plink( } } None => { - log::debug!("Could not get available memory."); + log::info!("Could not get available memory."); } - } - */ + }; + + // for use_snvs=None + let use_snvs_v = vec![true; m]; + let use_snvs = match use_snvs { + Some(x) => x, + None => &use_snvs_v, + }; + //let m_in = use_snvs.len(); let mut g = Genot::new_zeros(m, n); //log::debug!("done preparing Genot"); @@ -126,17 +136,11 @@ pub fn generate_genot_plink( // TODO: somehow same value as above; why not decreased by alloc in genot? // FIXME: fix mem for pg a* - log::info!("Temporary fix mem to 64GB."); - let mem = Some(64usize * 1024 * 1024 * 1024); - /* + //log::info!("Temporary fix mem to 64GB."); + //let mem = Some(64usize * 1024 * 1024 * 1024); let mem = alloc::get_available_memory(); log::debug!("available mem: {:?} bytes", mem); - */ - //let buf_size_limit: usize = match mem { - // Some(x) => x.min(BUF_SIZE_BED_LIMIT), - // None => BUF_SIZE_BED_LIMIT, - //}; let buf_size_limit = mem.map_or_else(|| BUF_SIZE_BED_LIMIT, |x| x.min(BUF_SIZE_BED_LIMIT)); log::debug!("buf_size_limit: {:?} bytes", buf_size_limit); @@ -193,10 +197,12 @@ pub fn generate_genot_plink( } // missing - if !use_missing { + //if !use_missing { + if fill_missing { g.iter_snv_mut() .par_bridge() - .for_each(|mut g_snv| super::fill_missing(&mut g_snv)); + .for_each(|mut g_snv| g_snv.fill_missing_mode()); + //.for_each(|mut g_snv| super::fill_missing_snv(&mut g_snv)); } g @@ -252,6 +258,7 @@ fn assign_genot( //plink::check_valid_bed(fin_chrom, None, m_in_chrom, n_in).unwrap(); + // FIXME: available mem? // check if 32 GB? 16GB? 
remains or not // reading smaller than buf_size is not error but reason is unknown // [here](https://doc.rust-lang.org/std/io/trait.Read.html#tymethod.read) @@ -636,19 +643,19 @@ fn byte_to_count(v: B8_2, i: usize) -> u8 { // TODO: this might be clear if use .flat_map() // no rayon here -pub fn load_x(x: &mut [u8], buf_mi: &[B8_2], use_samples: &[bool]) { - let mut ni = 0; - for (n_in_i, v) in use_samples.iter().enumerate() { - if *v { - x[ni] = buf_to_count(buf_mi, n_in_i); - ni += 1; - } - } - // ng: x could be larger thant n - //assert_eq!(ni, x.len()); - - super::missing_to_mode(x); -} +//fn load_x(x: &mut [u8], buf_mi: &[B8_2], use_samples: &[bool]) { +// let mut ni = 0; +// for (n_in_i, v) in use_samples.iter().enumerate() { +// if *v { +// x[ni] = buf_to_count(buf_mi, n_in_i); +// ni += 1; +// } +// } +// // ng: x could be larger thant n +// //assert_eq!(ni, x.len()); +// +// super::missing_to_mode(x); +//} #[cfg(test)] mod tests { @@ -687,7 +694,8 @@ mod tests { let gfmt = GenotFormat::Plink1; //let use_snvs = vec![true; use_snvs.len()]; //let use_samples = vec![true; use_samples.len()]; - let g = generate_genot_plink(&fin, gfmt, m, n, &use_snvs, Some(&use_samples), true); + let g = generate_genot_plink(&fin, gfmt, m, n, Some(&use_snvs), Some(&use_samples), false); + //let g = generate_genot_plink(&fin, gfmt, m, n, Some(&use_snvs), Some(&use_samples), true); let mut giter = g.iter_snv(); assert_eq!(giter.next().unwrap().vals(), vec![2, 0, 1, 0, 0]); assert_eq!(giter.next().unwrap().vals(), vec![1, 0, 2, 1, 0]); diff --git a/projects_rust/genetics/src/dataset/io_genot/load/plink2.rs b/projects_rust/genetics/src/dataset/io_genot/load/plink2.rs index a9cd816..91e6b3c 100644 --- a/projects_rust/genetics/src/dataset/io_genot/load/plink2.rs +++ b/projects_rust/genetics/src/dataset/io_genot/load/plink2.rs @@ -1,12 +1,13 @@ use crate::genot::prelude::*; use crate::genot_index; -use crate::GenotFormat; +use crate::{alloc, GenotFormat}; use crate::{io_genot, vec, 
Chrom}; use pgenlib; use rayon::prelude::*; use std::ffi::CString; use std::os::unix::prelude::OsStrExt; use std::path::Path; +use std::time::Instant; fn fgenot_pgenlib(fin: &Path, gfmt: GenotFormat, chrom: Option<&Chrom>) -> CString { let fin_genot = io_genot::fname_plinks_genot(fin, gfmt, chrom); @@ -35,13 +36,9 @@ pub fn generate_genot_snv_plink2( mi: usize, n: usize, use_samples: Option<&[bool]>, - use_missing: bool, + //use_missing: bool, + fill_missing: bool, ) -> GenotSnv { - //let n_in = match use_samples { - // None => n, - // Some(usev) => usev.len(), - //}; - let n_in = match use_samples { None => n, Some(usev) => usev.len(), @@ -52,14 +49,16 @@ pub fn generate_genot_snv_plink2( let fin_genot = fgenot_pgenlib(fin, gfmt, None); - let genot_v = load_genot_snv_buf(fin_genot, mi, n_in, use_samples_idx, n); + let genot_v = load_genot_snv_buf(fin_genot, mi, n_in, &use_samples_idx, n); //println!("genot_v {:?}", genot_v); let mut g_snv = GenotSnv::new_empty(n); - assign_pred_from_genot(&mut g_snv.as_genot_snv_mut_snv(), &genot_v); + assign_pred_from_genot_i8(&mut g_snv.as_genot_snv_mut_snv(), &genot_v); - if !use_missing { - super::fill_missing(&mut g_snv.as_genot_snv_mut_snv()); + //if !use_missing { + if fill_missing { + //super::fill_missing_snv(&mut g_snv.as_genot_snv_mut_snv()); + g_snv.as_genot_snv_mut_snv().fill_missing_mode(); } g_snv @@ -69,10 +68,30 @@ fn load_genot_snv_buf( fin_genot: CString, mi: usize, n_in: usize, - mut use_samples_idx: Vec, + use_samples_idx: &[i32], + //mut use_samples_idx: Vec, n: usize, -) -> Vec { - let nthr = rayon::current_num_threads(); + //) -> Vec { +) -> Vec { + let m_start = mi; + let m_end = mi + 1; + let use_snvs = vec![true; 1]; + + let mut genot_v = vec![0i8; n]; + + load_genot_snvs_extract_buf( + fin_genot, + m_start, + m_end, + &use_snvs, + n_in, + use_samples_idx, + n, + &mut genot_v, + ); + genot_v + + /* let nthr = rayon::current_num_threads(); let mut genot_v = vec![0.0f64; n]; unsafe { let _ = 
pgenlib::pgenreader_load_snv( @@ -85,10 +104,10 @@ fn load_genot_snv_buf( nthr.try_into().unwrap(), ); } - genot_v + genot_v */ } -// {0:0, 1:1, 2:2, -3:3} +/* // {0:0, 1:1, 2:2, -3:3} pub fn assign_pred_from_genot(pred: &mut GenotSnvMut, buf_mi: &[f64]) { for (ni, dosage) in buf_mi.iter().enumerate() { if *dosage < 0.0 { @@ -99,23 +118,69 @@ pub fn assign_pred_from_genot(pred: &mut GenotSnvMut, buf_mi: &[f64]) { pred.set_unchecked(d, ni); } } +} */ + +pub fn assign_pred_from_genot_i8(pred: &mut GenotSnvMut, buf_mi: &[i8]) { + for (ni, dosage) in buf_mi.iter().enumerate() { + if *dosage < 0i8 { + // missing + pred.set_unchecked(3, ni); + } else { + let d = *dosage as u8; + pred.set_unchecked(d, ni); + } + } } // load whole is fastest -// TODO: use_missing -> !fill_missing // TODO: untest for split_chrom pub fn generate_genot_plink2( fin: &Path, gfmt: GenotFormat, m: usize, n: usize, - use_snvs: &[bool], + use_snvs: Option<&[bool]>, use_samples: Option<&[bool]>, - use_missing: bool, + //use_missing: bool, + fill_missing: bool, ) -> Genot { log::debug!("to prepare Genot plink2 m, n: {}, {}", m, n); + let start = Instant::now(); + + let mem = alloc::get_available_memory(); + log::debug!("available mem: {:?} bytes", mem); + //log::info!("Temporary skip panic even for insufficient memory"); + + let genot_byte = Genot::byte(m, n); + // panic if available mem < genot_byte + mem_pgen_min + let mem_pgenlib_min_check = 1usize * 1024 * 1024 * 1024; + //let mem_pgenlib_min = 16usize * 1024 * 1024 * 1024; + //log::info!("Temporary skip panic even for insufficient memory"); + match mem { + Some(x) => { + log::debug!( + "genot + min pgenlib vs available mem, {:.3} GB + {:.3} GB vs {:.3} GB", + alloc::mem_gb(genot_byte), + alloc::mem_gb(mem_pgenlib_min_check), + alloc::mem_gb(x), + ); + if genot_byte + mem_pgenlib_min_check > x { + panic!("Memory insufficient on preparing Genot.") + } + } + None => { + log::debug!("Could not get available memory."); + } + }; + // TODO: better way 
+ let use_snvs_v = vec![true; m]; + let use_snvs = match use_snvs { + Some(x) => x, + None => &use_snvs_v, + }; let m_in = use_snvs.len(); + let n_in = match use_samples { None => n, Some(usev) => usev.len(), @@ -145,14 +210,14 @@ pub fn generate_genot_plink2( continue; } let fin_genot = fgenot_pgenlib(fin, gfmt, Some(chrom_i)); - // TODO: might be able to avoid .clone() assign_genot( &mut g.as_genot_snvs_mut(m_begin, m_end), fin_genot, m_in, n_in, - use_samples_idx.clone(), + &use_samples_idx, + //use_samples_idx.clone(), n, &use_snvs[m_in_begin..m_in_end], ); @@ -174,41 +239,132 @@ pub fn generate_genot_plink2( fin_genot, m_in, n_in, - use_samples_idx, + &use_samples_idx, n, use_snvs, ); } // missing - if !use_missing { + //if !use_missing { + if fill_missing { g.iter_snv_mut() .par_bridge() - .for_each(|mut g_snv| super::fill_missing(&mut g_snv)); + .for_each(|mut g_snv| g_snv.fill_missing_mode()); + //.for_each(|mut g_snv| super::fill_missing_snv(&mut g_snv)); } + let end = start.elapsed(); + log::info!("It took {} seconds to generate genot.", end.as_secs()); + g } // max size allocated in pgenlib -// pgenlib occupy 8 byte per one genotype count const BUF_SIZE_PGENLIB_LIMIT: usize = 64 * 1024 * 1024 * 1024; fn assign_genot( + g_chrom: &mut GenotMut, + fin_genot: CString, + m_in_chrom: usize, + n_in: usize, + use_samples_idx: &[i32], + n: usize, + use_snvs: &[bool], + // TODO: buf: Option> // for chrom +) { + //let buf_size_limit: usize = BUF_SIZE_PGENLIB_LIMIT; + + let genot_byte = Genot::byte(vec::count_true(use_snvs), n); + let buf_size_limit: usize = match alloc::get_available_memory() { + Some(x) => x.min(BUF_SIZE_PGENLIB_LIMIT), + None => { + log::debug!( + "Could not get available memory; assume there is {} GB available memory.", + alloc::mem_gb(BUF_SIZE_PGENLIB_LIMIT) + ); + BUF_SIZE_PGENLIB_LIMIT + } + }; + log::debug!("buf_size_limit: {} GB", alloc::mem_gb(buf_size_limit)); + + // 1 byte (i8) per count in pgenlib + let byte_per_snv = n * 1; + // f64: 8 
byte per count in pgenlib + // let byte_per_snv = n * 8; + let buf_num_snv_limit: usize = buf_size_limit / byte_per_snv; + let buf_num_snv: usize = buf_num_snv_limit.min(m_in_chrom); + //let buf_size: usize = buf_num_snv * byte_per_snv; + //assert_eq!(buf_size % byte_per_snv, 0); + //assert!(buf_size <= buf_size_limit); + + // TMP + //let buf_num_snv = 10; + + let mut buf = vec![0i8; buf_num_snv * n]; + //let mut buf = vec![0.0f64; buf_num_snv * n]; + + let mut m_in_begin_loaded = 0; + let mut m_begin_loaded = 0; + loop { + log::debug!("m_in_begin_loaded: {}", m_in_begin_loaded); + let m_in_read = buf_num_snv.min(m_in_chrom - m_in_begin_loaded); + log::debug!("m_in_read: {}", m_in_read); + + let m_in_end_loaded = m_in_begin_loaded + m_in_read; + let use_snvs_loaded = &use_snvs[m_in_begin_loaded..m_in_end_loaded]; + let (_, m_read) = genot_index::create_m_to_m_in(use_snvs_loaded); + log::debug!("m_read: {}", m_read); + log::debug!("m_in_end_loded: {}", m_in_end_loaded); + + let m_end_loaded = m_begin_loaded + m_read; + + if m_read != 0 { + //let buf = load_genot_whole_buf(fin_genot, m_in, n_in, use_samples_idx, n); + //load_genot_snvs_buf( + load_genot_snvs_extract_buf( + fin_genot.clone(), + m_in_begin_loaded, + m_in_end_loaded, + use_snvs_loaded, + n_in, + use_samples_idx, + n, + &mut buf, + ); + //println!("buf {:?}", &buf[..10]); + + let mut g_chrom_part = g_chrom.as_genot_snvs_mut(m_begin_loaded, m_end_loaded); + //assign_genot_buf(&mut g_chrom_part, &buf, use_snvs_loaded); + assign_genot_extract_buf(&mut g_chrom_part, &buf); + } + + m_begin_loaded = m_end_loaded; + m_in_begin_loaded = m_in_end_loaded; + assert!(m_in_begin_loaded <= m_in_chrom); + if m_in_begin_loaded == m_in_chrom { + break; + } + } + assert_eq!(m_in_begin_loaded, m_in_chrom); +} + +/* fn assign_genot( g_chrom: &mut GenotMut, fin_genot: CString, m_in_chrom: usize, n_in: usize, use_samples_idx: Vec, + //use_samples_idx: Vec, n: usize, use_snvs: &[bool], - // TOOD: buf: Option> // for chrom + // 
TODO: buf: Option> // for chrom ) { let buf_size_limit: usize = BUF_SIZE_PGENLIB_LIMIT; // 8 byte per count in pgenlib let byte_per_snv = n * 8; - let but_num_snv_limit: usize = buf_size_limit / byte_per_snv; - let buf_num_snv: usize = but_num_snv_limit.min(m_in_chrom); + let buf_num_snv_limit: usize = buf_size_limit / byte_per_snv; + let buf_num_snv: usize = buf_num_snv_limit.min(m_in_chrom); //let buf_size: usize = buf_num_snv * byte_per_snv; //assert_eq!(buf_size % byte_per_snv, 0); //assert!(buf_size <= buf_size_limit); @@ -216,8 +372,8 @@ fn assign_genot( // TMP //let buf_num_snv = 10; - // vec![] is also fine - let mut buf = vec![0.0f64; buf_num_snv * n]; + let mut buf = vec![0i8; buf_num_snv * n]; + //let mut buf = vec![0.0f64; buf_num_snv * n]; let mut m_in_begin_loaded = 0; let mut m_begin_loaded = 0; @@ -226,7 +382,6 @@ fn assign_genot( let m_in_read = buf_num_snv.min(m_in_chrom - m_in_begin_loaded); log::debug!("m_in_read: {}", m_in_read); - let m_in_end_loaded = m_in_begin_loaded + m_in_read; let use_snvs_loaded = &use_snvs[m_in_begin_loaded..m_in_end_loaded]; let (_, m_read) = genot_index::create_m_to_m_in(use_snvs_loaded); @@ -237,15 +392,19 @@ fn assign_genot( if m_read != 0 { //let buf = load_genot_whole_buf(fin_genot, m_in, n_in, use_samples_idx, n); + //load_genot_snvs_extract_buf( load_genot_snvs_buf( fin_genot.clone(), m_in_begin_loaded, m_in_end_loaded, n_in, - use_samples_idx.clone(), + //use_samples_idx.clone(), + //&mut use_samples_idx, + &use_samples_idx, n, &mut buf, ); + //println!("buf {:?}", &buf[..10]); let mut g_chrom_part = g_chrom.as_genot_snvs_mut(m_begin_loaded, m_end_loaded); assign_genot_buf(&mut g_chrom_part, &buf, use_snvs_loaded); @@ -259,24 +418,78 @@ fn assign_genot( } } assert_eq!(m_in_begin_loaded, m_in_chrom); +} */ + +fn load_genot_snvs_extract_buf( + fin_genot: CString, + m_start: usize, + m_end: usize, + use_snvs: &[bool], + n_in: usize, + use_samples_idx: &[i32], + n: usize, + buf: &mut Vec, + //buf: &mut Vec, +) { + 
// too large nthr might leads mem error. + let max_threads_pgenlib = 32; + let nthr = rayon::current_num_threads().min(max_threads_pgenlib); + log::debug!("nthr in rust {}", nthr); + + let m_read = vec::count_true(use_snvs); + + buf.resize(m_read * n, 0i8); + + unsafe { + let _ = pgenlib::pgenreader_load_snvs_extract( + buf.as_mut_ptr(), + fin_genot.as_ptr(), + m_start.try_into().unwrap(), + m_end.try_into().unwrap(), + use_snvs.as_ptr(), + n_in.try_into().unwrap(), + use_samples_idx.as_ptr(), + n.try_into().unwrap(), + nthr.try_into().unwrap(), + ); + } + //println!("buf_tmp {:?}", &buf_tmp[..10]); + //println!("genot_v {:?}", genot_v); + //bufv } +#[allow(dead_code)] fn load_genot_snvs_buf( fin_genot: CString, m_start: usize, m_end: usize, n_in: usize, - mut use_samples_idx: Vec, + use_samples_idx: &Vec, n: usize, - buf: &mut Vec, - //buf: Option>, + buf: &mut Vec, ) { - // large nthr might lead mem error? - //let nthr = rayon::current_num_threads(); - let nthr=4; - //let nthr = 1; + let m_in = m_end - m_start; + let use_snvs = vec![true; m_in]; + + load_genot_snvs_extract_buf( + fin_genot, + m_start, + m_end, + &use_snvs, + n_in, + use_samples_idx, + n, + //&mut buf, + buf, + ); + //genot_v + + /* + let nthr = rayon::current_num_threads(); + //let nthr = 4; log::debug!("nthr in rust {}", nthr); + //log::debug!("buf size {}",(m_end - m_start) * n); buf.resize((m_end - m_start) * n, 0.0f64); // TODO: convert f64->i8 in pgenlib @@ -287,29 +500,45 @@ fn load_genot_snvs_buf( m_start.try_into().unwrap(), m_end.try_into().unwrap(), n_in.try_into().unwrap(), - use_samples_idx.as_mut_ptr(), + use_samples_idx.as_ptr(), + //use_samples_idx.as_mut_ptr(), n.try_into().unwrap(), nthr.try_into().unwrap(), ); } //println!("genot_v {:?}", genot_v); - //bufv + //bufv */ } -/// won't work for 1M SNVs x 300K samples +/// won't work for 1M SNVs x 300K samples for f64 +/// not tried for i8 #[allow(dead_code)] fn load_genot_whole_buf( fin_genot: CString, m_in: usize, n_in: usize, - 
mut use_samples_idx: Vec, + use_samples_idx: &[i32], n: usize, -) -> Vec { - //let nthr = rayon::current_num_threads(); - let nthr=4; - println!("nthr {}", nthr); +) -> Vec { + let m_start = 0; + let m_end = m_in; + let use_snvs = vec![true; m_in]; + + let mut genot_v = vec![0i8; m_in * n]; + + load_genot_snvs_extract_buf( + fin_genot, + m_start, + m_end, + &use_snvs, + n_in, + use_samples_idx, + n, + &mut genot_v, + ); + genot_v - let mut genot_v = vec![0.0f64; m_in * n]; + /* //let mut genot_v = vec![0.0f64; m_in * n]; unsafe { let _ = pgenlib::pgenreader_load_whole( genot_v.as_mut_ptr(), @@ -322,10 +551,32 @@ fn load_genot_whole_buf( ); } //println!("genot_v {:?}", genot_v); - genot_v + genot_v */ } -fn assign_genot_buf(g: &mut GenotMut, buf: &[f64], use_snvs: &[bool]) { +fn assign_genot_extract_buf(g: &mut GenotMut, buf: &[i8]) { + //let (m_to_m_in, m_read) = genot_index::create_m_to_m_in(use_snvs); + + //assert + //assert_eq!(g.m(), m_read); + let n = g.n(); + + g.iter_snv_mut() + .enumerate() + .par_bridge() + .for_each(|(mi, mut g_snv)| { + //let m_in_i = m_to_m_in[&mi]; + //let buf_mi = &buf[m_in_i * n..(m_in_i + 1) * n]; + let buf_mi = &buf[mi * n..(mi + 1) * n]; + + //println!("buf_mi {:?}", &buf_mi[..10]); + assign_pred_from_genot_i8(&mut g_snv, &buf_mi); + }); +} + +//fn assign_genot_buf(g: &mut GenotMut, buf: &[f64], use_snvs: &[bool]) { +#[allow(dead_code)] +fn assign_genot_buf(g: &mut GenotMut, buf: &[i8], use_snvs: &[bool]) { let (m_to_m_in, m_read) = genot_index::create_m_to_m_in(use_snvs); //assert @@ -339,7 +590,8 @@ fn assign_genot_buf(g: &mut GenotMut, buf: &[f64], use_snvs: &[bool]) { let m_in_i = m_to_m_in[&mi]; let buf_mi = &buf[m_in_i * n..(m_in_i + 1) * n]; - assign_pred_from_genot(&mut g_snv, &buf_mi); + //println!("buf_mi {:?}", &buf_mi[..10]); + assign_pred_from_genot_i8(&mut g_snv, &buf_mi); }); } @@ -366,7 +618,7 @@ mod tests { let m_in: usize = io_genot::compute_num_snv(&fin, gfmt).unwrap(); log::debug!("{}", m_in); let n_in: usize = 
io_genot::compute_num_sample(&fin, gfmt).unwrap(); - println!("{}", n_in); + log::debug!("{}", n_in); // load snvs let snvs_in = io_genot::load_snvs(&fin, gfmt); let (m, use_snvs) = snv::make_use_snvs(fin_snv, &snvs_in); @@ -399,7 +651,7 @@ mod tests { let m_in: usize = io_genot::compute_num_snv(&fin, gfmt).unwrap(); log::debug!("{}", m_in); let n_in: usize = io_genot::compute_num_sample(&fin, gfmt).unwrap(); - println!("{}", n_in); + log::debug!("{}", n_in); let m = 2; let use_snvs = vec![true, false, true]; let n = 5; @@ -430,7 +682,7 @@ mod tests { let m_in: usize = io_genot::compute_num_snv(&fin, gfmt).unwrap(); log::debug!("{}", m_in); let n_in: usize = io_genot::compute_num_sample(&fin, gfmt).unwrap(); - println!("{}", n_in); + log::debug!("{}", n_in); // load snvs let snvs_in = io_genot::load_snvs(&fin, gfmt); let (m, use_snvs) = snv::make_use_snvs(fin_snv, &snvs_in); @@ -465,7 +717,7 @@ mod tests { let m_in: usize = io_genot::compute_num_snv(&fin, gfmt).unwrap(); log::debug!("{}", m_in); let n_in: usize = io_genot::compute_num_sample(&fin, gfmt).unwrap(); - println!("{}", n_in); + log::debug!("{}", n_in); // load snvs let snvs_in = io_genot::load_snvs(&fin, gfmt); let (m, use_snvs) = snv::make_use_snvs(fin_snv, &snvs_in); @@ -487,9 +739,10 @@ mod tests { fn test_assign_pred_from_bed() { let mut g = GenotSnv::new_empty(6); // [2, 0, 3, 0, 1, 0] - let pbuf = vec![2.0f64, 0.0, 3.0, 0.0, 1.0, 0.0]; + //let pbuf = vec![2.0f64, 0.0, 3.0, 0.0, 1.0, 0.0]; + let pbuf = vec![2i8, 0, 3, 0, 1, 0]; - assign_pred_from_genot(&mut g.as_genot_snv_mut_snv(), &pbuf); + assign_pred_from_genot_i8(&mut g.as_genot_snv_mut_snv(), &pbuf); assert_eq!(g.vals(), vec![2u8, 0, 3, 0, 1, 0]); } @@ -500,7 +753,7 @@ mod tests { //let use_samples = vec![true; use_samples.len()]; //let g = generate_genot_plink(&fin, gfmt, m, n, &use_snvs, Some(&use_samples), true); let mi = 2; - let g = generate_genot_snv_plink2(&fin, gfmt, mi, n, Some(&use_samples), true); + let g = 
generate_genot_snv_plink2(&fin, gfmt, mi, n, Some(&use_samples), false); assert_eq!(g.vals(), vec![2, 0, 1, 0, 1, 2, 0, 1, 0, 3]); } @@ -511,7 +764,7 @@ mod tests { //let use_samples = vec![true; use_samples.len()]; //let g = generate_genot_plink(&fin, gfmt, m, n, &use_snvs, Some(&use_samples), true); let mi = 2; - let g = generate_genot_snv_plink2(&fin, gfmt, mi, n, Some(&use_samples), true); + let g = generate_genot_snv_plink2(&fin, gfmt, mi, n, Some(&use_samples), false); assert_eq!(g.vals(), vec![2, 0, 1, 0, 1, 2, 0, 1, 0, 3]); } @@ -519,7 +772,7 @@ mod tests { fn test_generate_genot_plink2vzs_ref() { let (fin, gfmt, _, _, n, _, use_samples) = setup_test3ref(); let mi = 2; - let g = generate_genot_snv_plink2(&fin, gfmt, mi, n, Some(&use_samples), true); + let g = generate_genot_snv_plink2(&fin, gfmt, mi, n, Some(&use_samples), false); assert_eq!(g.vals(), vec![0, 2, 1, 2, 1, 0, 2, 1, 2, 3]); } @@ -527,14 +780,14 @@ mod tests { fn test_generate_genot_snv_plink2vzs_part() { let (fin, gfmt, _, _, n, _, use_samples) = setup_test3_part(); let mi = 2; - let g = generate_genot_snv_plink2(&fin, gfmt, mi, n, Some(&use_samples), true); + let g = generate_genot_snv_plink2(&fin, gfmt, mi, n, Some(&use_samples),false); assert_eq!(g.vals(), vec![0, 0, 2, 1, 3]); } #[test] fn test_generate_genot_whole_plink2vzs_part() { let (fin, gfmt, _, m, n, use_snvs, use_samples) = setup_test3_part(); - let g = generate_genot_plink2(&fin, gfmt, m, n, &use_snvs, Some(&use_samples), true); + let g = generate_genot_plink2(&fin, gfmt, m, n, Some(&use_snvs), Some(&use_samples), false); let mut giter = g.iter_snv(); // [2,1,0,0,0,2,1,0,0,3] assert_eq!(giter.next().unwrap().vals(), vec![1, 0, 2, 0, 3]); @@ -545,7 +798,7 @@ mod tests { #[test] fn test_generate_genot_whole_plink2vzs() { let (fin, gfmt, _, m, n, use_snvs, use_samples) = setup_test3(); - let g = generate_genot_plink2(&fin, gfmt, m, n, &use_snvs, Some(&use_samples), true); + let g = generate_genot_plink2(&fin, gfmt, m, n, 
Some(&use_snvs), Some(&use_samples), false); let mut giter = g.iter_snv(); // [2,1,0,0,0,2,1,0,0,3] assert_eq!( diff --git a/projects_rust/genetics/src/dataset/io_genot/load_score.rs b/projects_rust/genetics/src/dataset/io_genot/load_score.rs index 9c52797..c343028 100644 --- a/projects_rust/genetics/src/dataset/io_genot/load_score.rs +++ b/projects_rust/genetics/src/dataset/io_genot/load_score.rs @@ -18,14 +18,32 @@ pub fn load_genotypes_for_score( //wgts: &mut [Wgt], n: usize, use_samples: Option<&[bool]>, - use_missing: bool, + fill_missing: bool, + use_snv_pos: bool, ) -> Genot { - let (use_snvs, is_reversed) = create_wgt_to_genotype_index_sametime(fin, gfmt, wgts); + // TMP use_snv_pos; make use_snv_pos=false + let (use_snvs, is_reversed) = + create_wgt_to_genotype_index_sametime(fin, gfmt, wgts, use_snv_pos); + //let (use_snvs, is_reversed) = create_wgt_to_genotype_index_sametime(fin, gfmt, wgts, false); + + //let (use_snvs, is_reversed) = create_wgt_to_genotype_index_sametime(fin, gfmt, wgts, false); //let (wgt_to_m, use_snvs, is_flipped) = create_wgt_to_genotype_index(fin, wgts); let m = vec::count_true(&use_snvs); - let mut g = load::generate_genot(fin, gfmt, m, n, &use_snvs, use_samples, use_missing); + if m == 0 { + panic!("No variants to be loaded from genotype file.") + } + + let mut g = load::generate_genot( + fin, + gfmt, + m, + n, + Some(&use_snvs), + use_samples, + fill_missing, + ); reverse_genotypes_rayon(&mut g.as_genot_mut(), &is_reversed); @@ -117,13 +135,15 @@ pub fn load_genotypes_for_score_multiwgts( //wgts: &mut [Wgt], n: usize, use_samples: Option<&[bool]>, - use_missing: bool, + fill_missing:bool, + allow_nonexist_snv: bool, + use_snv_pos: bool, ) -> Genot { let snvs_in = io_genot::load_snvs(fin, gfmt); let mut uses_snvs = Vec::new(); for wgts in wgts_multi.iter_mut() { // length are m_in - let use_snvs = create_wgt_to_genotype_index(&snvs_in, wgts.wgts()); + let use_snvs = create_wgt_to_genotype_index(&snvs_in, wgts.wgts(), 
use_snv_pos); //let use_snvs = create_wgt_to_genotype_index(fin, gfmt, wgts.wgts()); uses_snvs.push(use_snvs); } @@ -143,7 +163,8 @@ pub fn load_genotypes_for_score_multiwgts( let mut useds_snvs = Vec::new(); for wgts in wgts_multi.iter_mut() { // length are m - let (is_reversed, used_snvs) = set_wgts_index_reversed(wgts.wgts_mut(), &snvs_use); + let (is_reversed, used_snvs) = + set_wgts_index_reversed(wgts.wgts_mut(), &snvs_use, allow_nonexist_snv, use_snv_pos); iss_reversed.push(is_reversed); useds_snvs.push(used_snvs); } @@ -154,8 +175,24 @@ pub fn load_genotypes_for_score_multiwgts( check_consistent_is_reversed(&is_reversed, &iss_reversed, &useds_snvs); let m = vec::count_true(&use_snvs); + if m == 0 { + panic!("Using snvs are zero. Please check weight and genotype file.") + } - let mut g = load::generate_genot(fin, gfmt, m, n, &use_snvs, use_samples, use_missing); + let mut g = load::generate_genot( + fin, + gfmt, + m, + n, + Some(&use_snvs), + use_samples, + fill_missing, + ); + + // TMP + //for snv in g.iter_snv() { + // println!("snv bfr reverse {:?}", &snv.vals()[..10]); + //} reverse_genotypes_rayon(&mut g.as_genot_mut(), &is_reversed); @@ -177,6 +214,7 @@ fn create_wgt_to_genotype_index( //fin: &Path, //gfmt: GenotFormat, wgts: &[W], + use_snv_pos: bool, ) -> Vec { //let m_in: usize = io_genot::compute_num_snv(fin, gfmt).unwrap(); //let snvs_in = io_genot::load_snvs(fin, gfmt); @@ -188,7 +226,8 @@ fn create_wgt_to_genotype_index( let mut sid_in_to_m_in = HashMap::with_capacity(m_in); for (si, s) in snvs_in.iter().enumerate() { - sid_in_to_m_in.insert(s.to_sid(), si); + sid_in_to_m_in.insert(s.vid(use_snv_pos), si); + //sid_in_to_m_in.insert(s.sid(), si); //sid_in_to_m_in.insert(s.sida(), si); } @@ -197,8 +236,8 @@ fn create_wgt_to_genotype_index( for wgt in wgts.iter() { // if not snv, skip if wgt.kind().is_snv() { - let sid = wgt.kind().snv_index().to_sid(); - if let Some(v) = sid_in_to_m_in.get(&sid) { + let vid = 
wgt.kind().snv_index().vid(use_snv_pos); + if let Some(v) = sid_in_to_m_in.get(&vid) { use_snvs[*v] = true; } } @@ -311,14 +350,25 @@ fn create_wgt_to_genotype_index( // use_snvs //} -fn set_wgts_index_reversed(wgts: &mut [W], snvs: &[SnvId]) -> (Vec, Vec) { +fn set_wgts_index_reversed( + wgts: &mut [W], + snvs: &[SnvId], + allow_nonexist_snv: bool, + use_snv_pos: bool, +) -> (Vec, Vec) { let m = snvs.len(); // map: sid_in -> genotype index (m) - let mut sid_to_m = HashMap::with_capacity(m); + let mut vid_to_m = HashMap::with_capacity(m); for (si, s) in snvs.iter().enumerate() { - sid_to_m.insert(s.to_sid(), si); + if vid_to_m.contains_key(s.vid(use_snv_pos)) { + panic!( + "Snv in genot is duplicated. If this is intentional, supress --use-snv_pos: {}", + s.vid(use_snv_pos) + ) + } + vid_to_m.insert(s.vid(use_snv_pos), si); } // needs to reverse genotype @@ -334,10 +384,11 @@ fn set_wgts_index_reversed(wgts: &mut [W], snvs: &[SnvId]) -> (Vec< // is_reversed will be overwritten many times but should be all right if all of the same in wgt for wgt in wgts.iter_mut() { if wgt.kind().is_snv() { - let sid = wgt.kind().snv_index().to_sid(); + let vid = wgt.kind().snv_index().vid(use_snv_pos); + //log::debug!("sida {}", sida); //log::debug!("m_in {:?}", sida_in_to_m_in.get(sida)); - if let Some(&mi) = sid_to_m.get(&sid) { + if let Some(&mi) = vid_to_m.get(&vid) { //let mi = m_in_to_m[v]; used_snvs[mi] = true; @@ -348,17 +399,16 @@ fn set_wgts_index_reversed(wgts: &mut [W], snvs: &[SnvId]) -> (Vec< let snv_wgt = wgt.kind().snv_index().clone(); // check alleles match - // otherwise alleles do not match if &snv_wgt == &snv_in { wgt.set_snv_index(Some(mi)); // rev or not // here since putting inside of if raises error - let is_rev = snv_wgt.is_rev(&snv_in); + let is_rev = snv_wgt.is_rev(&snv_in, use_snv_pos); if is_rev { log::debug!( - "Alleles are reversed in wgt and fplink {:?}, {:?}", + "Alleles are reversed in wgt and fplink: {:?}, {:?}", &snv_wgt, &snv_in ); @@ -367,14 
+417,28 @@ fn set_wgts_index_reversed(wgts: &mut [W], snvs: &[SnvId]) -> (Vec< is_reversed[mi] = is_rev; //is_reversed[mi] = false; } else { + // alleles do not match log::info!( - "Ignore SNV: alleles do not match in wgt and fplink {:?}, {:?}", + "Alleles do not match in wgt and fplink: {:?}, {:?}", snv_wgt, &snv_in ); + if allow_nonexist_snv { + wgt.set_snv_index(None); + } else { + panic!("Alleles do not match in wgt and fplink. Use --allow-nonexist-snv."); + } } } else { - wgt.set_snv_index(None); + log::info!( + "SNV in wgt is not in fplink: {:?}", + wgt.kind().snv_index().clone() + ); + if allow_nonexist_snv { + wgt.set_snv_index(None); + } else { + panic!("SNV in wgt is not in fplink. Use --allow-nonexist-snv. --use-snv-pos might help as well."); + } } } @@ -386,6 +450,7 @@ fn set_wgts_index_reversed(wgts: &mut [W], snvs: &[SnvId]) -> (Vec< (is_reversed, used_snvs) } +/// TODO: merge /// old use above /// /// use in score calculation @@ -402,6 +467,7 @@ fn create_wgt_to_genotype_index_sametime( fin: &Path, gfmt: GenotFormat, wgts: &mut [W], + use_snv_pos: bool, ) -> (Vec, Vec) { let m_in: usize = io_genot::compute_num_snv(fin, gfmt).unwrap(); let snvs_in = io_genot::load_snvs(fin, gfmt); @@ -409,10 +475,10 @@ fn create_wgt_to_genotype_index_sametime( let mut use_snvs = vec![false; m_in]; // map: sid_in -> genotype index (m) - let mut sid_in_to_m_in = HashMap::with_capacity(m_in); + let mut vid_in_to_m_in = HashMap::with_capacity(m_in); for (si, s) in snvs_in.iter().enumerate() { - sid_in_to_m_in.insert(s.to_sid(), si); + vid_in_to_m_in.insert(s.vid(use_snv_pos), si); //sid_in_to_m_in.insert(s.sida(), si); } @@ -421,8 +487,8 @@ fn create_wgt_to_genotype_index_sametime( for wgt in wgts.iter() { // if not snv, skip if wgt.kind().is_snv() { - let sid = wgt.kind().snv_index().to_sid(); - if let Some(v) = sid_in_to_m_in.get(&sid) { + let sid = wgt.kind().snv_index().vid(use_snv_pos); + if let Some(v) = vid_in_to_m_in.get(&sid) { use_snvs[*v] = true; } } @@ -440,10 
+506,10 @@ fn create_wgt_to_genotype_index_sametime( // is_reversed will be overwritten many times but should be all right if all of the same in wgt for wgt in wgts.iter_mut() { if wgt.kind().is_snv() { - let sid = wgt.kind().snv_index().to_sid(); + let sid = wgt.kind().snv_index().vid(use_snv_pos); //log::debug!("sida {}", sida); //log::debug!("m_in {:?}", sida_in_to_m_in.get(sida)); - if let Some(v) = sid_in_to_m_in.get(&sid) { + if let Some(v) = vid_in_to_m_in.get(&sid) { let mi = m_in_to_m[v]; // clone is necessary @@ -458,7 +524,7 @@ fn create_wgt_to_genotype_index_sametime( // rev or not // here since putting inside of if raises error - let is_rev = snv_wgt.is_rev(&snv_in); + let is_rev = snv_wgt.is_rev(&snv_in, use_snv_pos); if is_rev { log::debug!( diff --git a/projects_rust/genetics/src/dataset/samples.rs b/projects_rust/genetics/src/dataset/samples.rs index 958b2c3..6fbe80a 100644 --- a/projects_rust/genetics/src/dataset/samples.rs +++ b/projects_rust/genetics/src/dataset/samples.rs @@ -30,7 +30,7 @@ type Name = String; #[derive(Clone)] pub struct Samples { // change to Labels for several phenotypes - phe: Phe, + phe: Option, names: Option>, // move to Dataset? 
-> put here for a while because covs also needs names // also, Samples only can be input of fn @@ -43,12 +43,12 @@ pub struct Samples { impl Samples { pub fn new( - ys: &[bool], + ys: Option<&[bool]>, names: Option>, covs: Option, samples_n: usize, ) -> Samples { - let phe = Phe::new(ys); + let phe = ys.map(|x| Phe::new(x)); Samples { phe, names, @@ -59,15 +59,29 @@ impl Samples { pub fn new_data(ys: &[bool], names: Option>, samples_n: usize) -> Samples { let phe = Phe::new(ys); Samples { - phe, + phe: Some(phe), names, covs: None, samples_n, } } + + /// for score + pub fn new_nophe(names: Option>, covs: Option, samples_n: usize) -> Samples { + //let phe = Phe::new_empty(samples_n); + Samples { + //phe, + phe: None, + names, + covs, + samples_n, + } + } + pub fn new_empty() -> Samples { Samples { - phe: Phe::new_empty(0), + phe: None, + //phe: Phe::new_empty(0), names: None, covs: None, samples_n: 0, @@ -91,11 +105,30 @@ impl Samples { pub fn samples_n(&self) -> usize { self.samples_n } - pub fn phe(&self) -> &Phe { - &self.phe + + // TMP: merge to phe_unwrap(); + //pub fn phe(&self) -> &Phe { + // self.phe_unwrap() + // //self.phe.as_ref().unwrap() + // //&self.phe + //} + + pub fn phe(&self) -> Option<&Phe> { + self.phe.as_ref() + //self.phe.as_ref().unwrap() + //&self.phe } - pub fn ys(&self) -> &[B8] { - self.phe.phe_inner().inner() + + pub fn phe_unwrap(&self) -> &Phe { + self.phe().unwrap() + //self.phe.as_ref().unwrap() + //&self.phe + } + + pub fn ys_unwrap(&self) -> &[B8] { + self.phe_unwrap().phe_inner().inner() + //self.phe_unwrap().phe_inner().inner() + //self.phe.phe_inner().inner() //&self.phe } /* pub fn ys_f64(&self) -> Vec { @@ -223,6 +256,9 @@ pub fn create_sample_id_to_n( let mut sample_id_to_n: HashMap = HashMap::new(); for n_in_i in 0..samples_in.len() { + if sample_id_to_n.contains_key(&samples_in[n_in_i]) { + panic!("Sample id is duplicated: {}", samples_in[n_in_i]) + } sample_id_to_n.insert(samples_in[n_in_i].clone(), n_in_i); } diff --git 
a/projects_rust/genetics/src/dataset/samples/covs.rs b/projects_rust/genetics/src/dataset/samples/covs.rs index 0c4e29c..1e642de 100644 --- a/projects_rust/genetics/src/dataset/samples/covs.rs +++ b/projects_rust/genetics/src/dataset/samples/covs.rs @@ -374,7 +374,8 @@ impl Covs { .unwrap() .iter() .position(|x| x.name() == name) - .unwrap(); + .unwrap_or_else(|| panic!("name is not in Covs: {}", name)); + //.unwrap(); &self.vals().unwrap()[index] } @@ -451,7 +452,7 @@ fn parse_cov_name_in_cols(cov_name: &str, cols: &[String]) -> Vec { fn index_in_col(x: &str, cols: &[String]) -> usize { cols.iter() .position(|y| y == x) - .expect(&format!("Cov_name {} is not in the column of fin_phe", x)) + .unwrap_or_else(|| panic!("Cov_name {} is not in the column of fin_phe", x)) } let mut cov_name_in_col: Vec = vec![]; diff --git a/projects_rust/genetics/src/dataset/snvs.rs b/projects_rust/genetics/src/dataset/snvs.rs index 6bcd290..dcc1c01 100644 --- a/projects_rust/genetics/src/dataset/snvs.rs +++ b/projects_rust/genetics/src/dataset/snvs.rs @@ -2,8 +2,8 @@ //! It is important that all vector values can be extract without constructing. ex. mafs: Vec not Vec //! 
Must use fn to access data so that it is easy to use Trait -use crate::{SnvId, GenotFormat}; use crate::{io_genot, snv}; +use crate::{GenotFormat, SnvId}; use std::path::Path; #[derive(Clone)] @@ -117,6 +117,10 @@ impl Snvs { pub fn snv_indexs(&self) -> &[SnvId] { &self.snv_indexs } + + pub fn snv_indexs_mut(&mut self) -> &mut [SnvId] { + &mut self.snv_indexs + } } #[cfg(test)] diff --git a/projects_rust/genetics/src/genot/base_genot.rs b/projects_rust/genetics/src/genot/base_genot.rs index 9fee483..16be745 100644 --- a/projects_rust/genetics/src/genot/base_genot.rs +++ b/projects_rust/genetics/src/genot/base_genot.rs @@ -13,8 +13,8 @@ use super::genot_iterator::{GenotCountIter, GenotIter, GenotIterMut}; use super::{GenotMut, GenotSnvMut, GenotSnvRef}; use crate::samples::prelude::*; -use cmatrix::prelude::*; use cmatrix; +use cmatrix::prelude::*; /* // 8 x 1 bit @@ -175,7 +175,11 @@ where let c1 = p0 & (!p1); let cm = (!p0) & p1; - (cmatrix::popcnt(c2), cmatrix::popcnt(c1), cmatrix::popcnt(cm)) + ( + cmatrix::popcnt(c2), + cmatrix::popcnt(c1), + cmatrix::popcnt(cm), + ) //(pop(d2), pop(n2), pop(d1), pop(n1), pop(dm), pop(nm)) } @@ -212,7 +216,8 @@ where //println!("in table afr last: {} sec", start_time.elapsed().as_micros()); - let maf = ((c2 * 2 + c1) as f64) / (cnomissing as f64); + let maf = ((c2 * 2 + c1) as f64) / ((cnomissing * 2) as f64); + assert!((0.0 <= maf) & (maf <= 1.0)); maf } @@ -404,7 +409,7 @@ where self.genot_inner_mut().set_unchecked_v(val, ni); } - /// Only use this knowing original bit is false + /// Use this only when knowing original bit is false /// for plink bed /// code(count); b0, b1 /// 00(2); 1,1 @@ -433,4 +438,63 @@ where fn as_genot_snv_mut_snv(&mut self) -> GenotSnvMut { GenotSnvMut::new(self.genot_inner_mut().as_cvec_mut_v()) } + + fn fill_missing_mode(&mut self) { + // count 0,1,2 + let mut counts_allele = vec![0usize; 4]; + + let n = self.n(); + for ni in 0..n { + counts_allele[self.get_val_unchecked(ni) as usize] += 1; + } + + 
let mut mode: usize = 4; + let mut mode_counts = 0; + for i in 0..=2 { + if counts_allele[i] > mode_counts { + mode_counts = counts_allele[i]; + mode = i; + } + } + let mode = mode as u8; + assert_ne!(mode, 4); + + for ni in 0..n { + if self.get_val_unchecked(ni) == 3 { + self.set_unchecked(mode, ni); + } + } + // check all are non-missing? + // -> performance... + } + + fn fill_missing_mode_maf(&mut self, maf: f64) { + let mode: u8 = if maf < 1.0 / 3.0f64 { + 0 + } else if maf > 2.0 / 3.0f64 { + 2 + } else { + 1 + }; + + let n = self.n(); + + for ni in 0..n { + if self.get_val_unchecked(ni) == 3 { + self.set_unchecked(mode, ni); + } + } + // check all are non-missing? + // -> performance... + } + + // {0:2, 1:1, 2:0, 3:3} + const REV_AR: [u8; 4] = [2, 1, 0, 3]; + fn reverse_allele(&mut self) { + let n = self.n(); + for ni in 0..n { + let val = self.get_val_unchecked(ni); + self.set_unchecked(Self::REV_AR[val as usize], ni); + } + } } diff --git a/projects_rust/genetics/src/genot/genot_struct.rs b/projects_rust/genetics/src/genot/genot_struct.rs index 968c17b..049d730 100644 --- a/projects_rust/genetics/src/genot/genot_struct.rs +++ b/projects_rust/genetics/src/genot/genot_struct.rs @@ -196,14 +196,7 @@ mod tests { let t8 = g.stat_contingency_table_nosimd(&phe); - assert_eq!(t8.0, 2); - assert_eq!(t8.1, 1); - assert_eq!(t8.2, 1); - assert_eq!(t8.3, 0); - assert_eq!(t8.4, 0); - assert_eq!(t8.5, 2); - assert_eq!(t8.6, 2); - assert_eq!(t8.7, 1); + assert_eq!(t8, (2usize, 1, 1, 0, 0, 2, 2, 1)); } #[test] @@ -218,15 +211,6 @@ mod tests { let t8_nosimd = g.stat_contingency_table_nosimd(&phe); assert_eq!(t8, t8_nosimd); - - //assert_eq!(t8.0, 2); - //assert_eq!(t8.1, 1); - //assert_eq!(t8.2, 1); - //assert_eq!(t8.3, 0); - //assert_eq!(t8.4, 0); - //assert_eq!(t8.5, 2); - //assert_eq!(t8.6, 2); - //assert_eq!(t8.7, 1); } fn is_eq_f64(v: f64, w: f64, e: f64) -> bool { @@ -240,14 +224,14 @@ mod tests { let maf = g.maf(); - // (2*3+1)/6 - assert!(is_eq_f64(maf, 
7.0f64/6.0f64,1e-7) ) + // (2*3+1)/(6*2) + assert!(is_eq_f64(maf, 7.0f64 / 12.0f64, 1e-7)) + //wrong + // (2*3+1)/6 + //assert!(is_eq_f64(maf, 7.0f64 / 6.0f64, 1e-7)) } - - - #[test] fn test_set_bed_code_init_unchecked() { let mut g = Genot::new_zeros(2, 3); @@ -284,4 +268,40 @@ mod tests { gref.set_bed_code_unchecked(1, 1); assert_eq!(gref.get_val(1), 3); } + + #[test] + fn test_fill_missing_mode() { + let vec = vec![0, 1, 1, 2, 1, 3]; + let vec_exp = vec![0u8, 1, 1, 2, 1, 1]; + let mut g = GenotSnv::new(&vec); + + g.as_genot_snv_mut_snv().fill_missing_mode(); + + assert_eq!(&g.vals(), &vec_exp); + } + + #[test] + fn test_fill_missing_mode2() { + // when missing is the mode + let vec = vec![0, 1, 1, 3, 3, 3]; + let vec_exp = vec![0, 1, 1, 1, 1, 1]; + let mut g = GenotSnv::new(&vec); + + g.as_genot_snv_mut_snv().fill_missing_mode(); + + assert_eq!(&g.vals(), &vec_exp); + } + + //TODO: fill_missing_mode_maf() + + #[test] + fn test_reverse_allele() { + let vec = vec![0, 1, 1, 2, 1, 3]; + let vec_exp = vec![2u8, 1, 1, 0, 1, 3]; + let mut g = GenotSnv::new(&vec); + + g.as_genot_snv_mut_snv().reverse_allele(); + + assert_eq!(&g.vals(), &vec_exp); + } } diff --git a/projects_rust/genetics/src/lib.rs b/projects_rust/genetics/src/lib.rs index df12913..cc4d8ab 100644 --- a/projects_rust/genetics/src/lib.rs +++ b/projects_rust/genetics/src/lib.rs @@ -1,5 +1,7 @@ // change arg name from v to ar, and for v,i +use std::collections::HashMap; + pub mod alloc; pub mod cov; pub mod dataset; @@ -8,7 +10,7 @@ pub mod genot_index; pub mod pgs; pub mod regression; pub mod sample; -mod score; +pub mod score; pub mod snv; pub mod sum_stat; pub mod textfile; @@ -27,6 +29,7 @@ mod wgts; // only make pub of Wgt pub use cov::{CovKind, Var}; pub use dataset::io_genot; +pub use dataset::io_genot::load; pub use dataset::samples; pub use dataset::samples::{CovId, Covs, Samples}; pub use dataset::snvs; @@ -34,6 +37,7 @@ pub use dataset::snvs::Snvs; pub use dataset::Dataset; pub use 
genot::genot_struct; pub use genot::genot_struct::{Genot, B8, B8_2, THRESHOLD_SNV}; +pub use genot::prelude::*; pub use io_genot::GenotFormat; pub use snv::{Chrom, SnvId}; use std::path::Path; @@ -47,29 +51,101 @@ extern crate assert_float_eq; // TODO: better fn fscore_from_fwgt(dscore: &Path, fwgt: &Path) -> PathBuf { - let fname_score = fwgt.file_name().unwrap().to_str().unwrap().to_owned() + ".score"; + let fname_score = fwgt.file_stem().unwrap().to_str().unwrap().to_owned() + ".score"; + //let fname_score = fwgt.file_name().unwrap().to_str().unwrap().to_owned() + ".score"; + let fout_score = dscore.join(fname_score); + fout_score +} + +fn fscore_concat(dscore: &Path, para: &str, fwgt: &Path) -> PathBuf { + // fwgt to get method name + // ex. 'clump_p-0.1_n-100' -> 'clump_p-0.1_n.score' + + let method = fwgt + .file_stem() + .unwrap() + .to_str() + .unwrap() + .split(&("_".to_string() + para + "-")) + .collect::>()[0] + .to_string(); + + // [method]_n.score + //let method = fwgt + // .file_name() + // .unwrap() + // .to_str() + // .unwrap() + // .split("_") + // .collect::>()[0] + // .to_string(); + let fname_score = method + "_" + para + ".score"; let fout_score = dscore.join(fname_score); fout_score } +fn para_from_fwgt(fwgt: &Path, para: &str) -> String { + let para = fwgt + .file_stem() // exclude .wgt here + .unwrap() + .to_str() + .unwrap() + .split(&("_".to_string() + para + "-")) + .collect::>()[1] + .to_string(); + para +} + +fn is_fwgt_concat(fwgt: &Path, concat_para: &str) -> bool { + let is_concat = fwgt + .file_stem() + .unwrap() + .to_str() + .unwrap() + .contains(&("_".to_string() + concat_para + "-")); + + if is_concat { + // check if file stem name end with _(para)-* + if fwgt + .file_stem() + .unwrap() + .to_str() + .unwrap() + .split(&("_".to_string() + concat_para + "-")) + .collect::>() + .last() + .unwrap() + .contains("_") + { + panic!("File stem name should end with _(para)-*. 
"); + } + }; + is_concat +} + pub fn run_score( dout_score: &Path, fin: &Path, gfmt: GenotFormat, fin_phe: Option<&Path>, - phe_name: Option<&str>, + //phe_name: Option<&str>, cov_name: Option<&str>, dout_wgt: Option<&Path>, fout_wgt: Option<&Path>, //fin_cov: Option<&Path>, fin_sample: Option<&Path>, + concat: Option<&str>, + no_concat: Option<&str>, is_resume: bool, + allow_nonexist_snv: bool, + use_snv_pos: bool, + is_nonadd: bool, ) { io_genot::check_valid_fin(fin, gfmt); //let n_in: usize = io_genot::compute_num_sample(fin, gfmt).unwrap(); - let (_, use_samples) = sample::make_use_samples(fin_sample, fin, gfmt); - let samples_id = io_genot::load_samples_id(fin, gfmt, Some(&use_samples)); + //let (_, use_samples) = sample::make_use_samples(fin_sample, fin, gfmt); + //let samples_id = io_genot::load_samples_id(fin, gfmt, Some(&use_samples)); if let Some(dout_wgt) = dout_wgt { let files_wgt_ori = wgt::io::get_files_wgt(dout_wgt); @@ -78,20 +154,61 @@ pub fn run_score( panic!("No wgt files exist."); } - let files_wgt = if is_resume { - // exclude fwgt if .score already exist - let mut v = Vec::new(); - for fwgt in files_wgt_ori { - let fscore = fscore_from_fwgt(dout_score, &fwgt); - if !textfile::exist_file(&fscore) { - v.push(fwgt); - } + let files_wgt = if let Some(concat_para) = concat { + if is_resume { + log::debug!("--is-resume but --concat is set. Ignore --is-resume."); } - v - } else { + //files_wgt_ori + // only use files with concat_para files_wgt_ori + .iter() + .filter(|fwgt| is_fwgt_concat(fwgt, concat_para)) + .map(|x| x.clone()) + .collect::>() + } else if let Some(concat_para) = no_concat { + if is_resume { + log::debug!("--is-resume but --no-concat is set. 
Ignore --is-resume."); + } + // only use files without concat_para + files_wgt_ori + .iter() + .filter(|fwgt| !is_fwgt_concat(fwgt, concat_para)) + .map(|x| x.clone()) + .collect::>() + } else { + // concat.is_none() and no_concat.is_none() + if is_resume { + // exclude fwgt if .score already exist + let mut v = Vec::new(); + for fwgt in files_wgt_ori { + let fscore = fscore_from_fwgt(dout_score, &fwgt); + if !textfile::exist_file(&fscore) { + v.push(fwgt); + } + } + v + } else { + files_wgt_ori + } }; + //let files_wgt = if is_resume & concat.is_none() { + // // exclude fwgt if .score already exist + // let mut v = Vec::new(); + // for fwgt in files_wgt_ori { + // let fscore = fscore_from_fwgt(dout_score, &fwgt); + // if !textfile::exist_file(&fscore) { + // v.push(fwgt); + // } + // } + // v + //} else { + // if is_resume & concat.is_some() { + // log::debug!("--is-resume but --concat is set. Ignore --is-resume."); + // } + // files_wgt_ori + //}; + log::debug!("wgt files: {:?}", files_wgt); if files_wgt.len() == 0 { @@ -101,8 +218,9 @@ pub fn run_score( let mut wgts_vec: Vec = files_wgt .iter() - .map(|file_wgt| Wgts::new_from_file(file_wgt)) + .map(|file_wgt| Wgts::new_from_file(file_wgt, is_nonadd)) .collect(); + //.map(|file_wgt| Wgts::new_from_file(file_wgt, use_snv_pos, is_nonadd)) let phe_buf = fin_phe.map(|x| crate::textfile::read_file_to_end(x, None).unwrap()); let sample_buf = fin_sample.map(|x| crate::textfile::read_file_to_end(x, None).unwrap()); @@ -112,24 +230,122 @@ pub fn run_score( gfmt, phe_buf.as_deref(), //fin_phe, - phe_name, + //phe_name, cov_name, sample_buf.as_deref(), //fin_sample, &mut wgts_vec, + allow_nonexist_snv, + use_snv_pos, ); + //if let Some(concat_para) = concat { + // for fwgt in files_wgt.iter() { + // if !(fwgt + // .file_stem() + // .unwrap() + // .to_str() + // .unwrap() + // .contains(&("_".to_string() + concat_para + "-"))) + // { + // panic!("Parameter not in file name. 
Avoid using --concat."); + // } + + // if fwgt + // .file_stem() + // .unwrap() + // .to_str() + // .unwrap() + // .split(&("_".to_string() + concat_para + "-")) + // .collect::>() + // .last() + // .unwrap() + // .contains("_") + // { + // panic!("File stem name should end with _(para)-*. "); + // } + // } + //} + + let samples_id = dataset.samples().names(); + let mut score_paras: Vec> = vec![]; for (wgts, file_wgt) in wgts_vec.iter().zip(files_wgt.iter()) { log::debug!("file_wgt: {:?}", file_wgt); - let fout_score = fscore_from_fwgt(dout_score, file_wgt); - //let fname_score = file_wgt.file_name().unwrap().to_str().unwrap().to_owned() + ".score"; - //let fout_score = dout_score.join(fname_score); - score::score(&fout_score, &wgts, &dataset, &samples_id); + //score::score(&fout_score, &wgts, &dataset, &samples_id); + let score_v = score::score_nowrite(&wgts, &dataset); + + if concat.is_some() { + score_paras.push(score_v); + } else { + // --no-concat or no args + let fout_score = fscore_from_fwgt(dout_score, file_wgt); + score::write_scores_nopheno(&fout_score, &score_v, samples_id) + } + } + + if let Some(concat_para) = concat { + // grouping scores with the same fout_concat + // ex. 
['clump_p-0.1_n-100', 'clump_p-0.1_n-200'] -> 'clump_p-0.1_n.score' + // ['clump_p-0.2_n-100', 'clump_p-0.2_n-200'] -> 'clump_p-0.2_n.score' + + let fouts_concat: Vec = files_wgt + .iter() + .map(|x| fscore_concat(dout_score, concat_para, x)) + .collect(); + //let fout_concat = fscore_concat(dout_score, concat_para, &files_wgt[0]); + //log::debug!("write to: {:?}", fout_concat); + + let paras: Vec = files_wgt + .iter() + .map(|x| para_from_fwgt(x, concat_para)) + .collect(); + + let foutd_paras: HashMap> = fouts_concat + .iter() + .zip(paras.iter()) + .fold(HashMap::new(), |mut paras_each, (fout, para)| { + paras_each + .entry(fout.clone()) + .or_insert(vec![]) + .push(para.clone()); + paras_each + }); + + let foutd_scores: HashMap>> = fouts_concat + .iter() + .zip(score_paras.iter()) + .fold(HashMap::new(), |mut scores_each, (fout, score)| { + scores_each + .entry(fout.clone()) + .or_insert(vec![]) + .push(score.clone()); + scores_each + }); + + for (fout_concat, paras) in foutd_paras.iter() { + let score_para = foutd_scores.get(fout_concat).unwrap(); + + score::write_scores_paras_nopheno( + &fout_concat, + &score_para, + concat_para, + ¶s, + samples_id, + ) + } + //score::write_scores_paras_nopheno( + // &fout_concat, + // &score_paras, + // concat_para, + // ¶s, + // samples_id, + //) } } else if let Some(file_wgt) = fout_wgt { wgt::io::check_file_wgt_exist(&file_wgt); - let wgts = Wgts::new_from_file(&file_wgt); + let wgts = Wgts::new_from_file(&file_wgt, is_nonadd); + //let wgts = Wgts::new_from_file(&file_wgt, use_snv_pos, is_nonadd); let mut wgts_vec = vec![wgts]; let phe_buf = fin_phe.map(|x| crate::textfile::read_file_to_end(x, None).unwrap()); @@ -139,18 +355,62 @@ pub fn run_score( gfmt, phe_buf.as_deref(), //fin_phe, - phe_name, + //phe_name, cov_name, sample_buf.as_deref(), //fin_sample, &mut wgts_vec, + allow_nonexist_snv, + use_snv_pos, ); let fout_score = fscore_from_fwgt(dout_score, file_wgt); //let fname_score = 
file_wgt.file_name().unwrap().to_str().unwrap().to_owned() + ".score"; //let fout_score = dout_score.join(fname_score); - score::score(&fout_score, &wgts_vec[0], &dataset, &samples_id); + //score::score(&fout_score, &wgts_vec[0], &dataset, &samples_id); + let score_v = score::score_nowrite(&wgts_vec[0], &dataset); + let samples_id = dataset.samples().names(); + score::write_scores_nopheno(&fout_score, &score_v, samples_id) } else { panic!("sth wrong.") } } + +pub fn test_pgenlib(fin: &Path, gfmt: GenotFormat, _fin_sample: Option<&Path>) -> Genot { + let snvs_in = io_genot::load_snvs(fin, gfmt); + + for (si, snv) in snvs_in.iter().enumerate() { + if si < 10 { + println!("snv {:?}", snv); + //log::debug!("snv {:?}", snv); + } + } + + let m_in: usize = io_genot::compute_num_snv(fin, gfmt).unwrap(); + log::debug!("m_in: {}", m_in); + let n_in: usize = io_genot::compute_num_sample(fin, gfmt).unwrap(); + log::debug!("n_in: {}", n_in); + + let use_snvs = if m_in < 100 { + //let use_snvs = vec![true; m_in]; + vec![true; m_in] + } else { + //let mut u_ = vec![true; 100]; + //u_.extend(vec![false; m_in - 100]); + let mut u_ = vec![false; m_in - 100]; + u_.extend(vec![true; 100]); + u_ + }; + + println!("to generate"); + let g = load::generate_genot(fin, gfmt, m_in, n_in, Some(&use_snvs), None, false); + println!("generated"); + + for (si, snv) in g.iter_snv().enumerate() { + if si < 10 { + //log::debug!("genot {:?}", &snv.vals()[..10]); + println!("genot {:?}", &snv.vals()[..10]); + } + } + g +} diff --git a/projects_rust/genetics/src/regression.rs b/projects_rust/genetics/src/regression.rs index 101a8d4..75df61c 100644 --- a/projects_rust/genetics/src/regression.rs +++ b/projects_rust/genetics/src/regression.rs @@ -33,103 +33,133 @@ use crate::{wgt::Coef, Wgt}; //mod logistic; - //smartcore v3 or mysmartcore pub fn logistic_regression_covs(samples: &Samples) -> Vec { - let covs_val = samples.covs(); - // TODO: return None - if let None = covs_val { - return vec![]; - } - - 
let covs_val = covs_val.unwrap(); - let v = covs_val.vals_row_major_norm(); - let ys = samples.phe().inner_i32(); - - let cov_n = covs_val.covs_n(); - //let cov_n=v.len(); - log::debug!("cov_n {}", cov_n); - log::debug!("ys true count {}", samples.phe().count()); - log::debug!("ys false count {}", samples.phe().count_false()); - - log::debug!("norm v"); - log::debug!("norm mt {}, {}", v[0][0], v[0][1]); - - // dense_matrix from 2d array - // https://docs.rs/smartcore/latest/src/smartcore/linalg/naive/dense_matrix.rs.html#248 - //let (v, row_n, col_n) = covs_val.vals_row_major_vec(); - //log::debug!("row, col {}, {}",row_n,col_n); - let mt = DenseMatrix::from_2d_vec(&v); - //let mt = DenseMatrix::from_2d_vec(&covs_val.vals_row_major()); - // default: alpha=0.0 - // confirmed to output the same answer as sklearn - let lr = LogisticRegression::fit(&mt, &ys, Default::default()).unwrap(); - log::debug!("logreg"); - log::debug!("intercept {:?}", lr.intercept()); - log::debug!("coef {:?}", lr.coefficients()); - - //revert normalization - let means = samples.covs().unwrap().means(); - let stds = samples.covs().unwrap().stds(); - - let coefs_norm = lr.coefficients().iter().map(|x| *x).collect::>(); - assert_eq!(coefs_norm.len(), covs_val.covs_n()); + //let covs_val = samples.covs(); + //// TODO: return None + //if let None = covs_val { + // return vec![]; + //} + if let Some(covs) = samples.covs() { + // load normed vals + let v = covs.vals_row_major_norm(); + let ys = samples.phe_unwrap().inner_i32(); + + let cov_n = covs.covs_n(); + //let cov_n=v.len(); + log::debug!("cov_n {}", cov_n); + log::debug!("ys true count {}", samples.phe_unwrap().count()); + log::debug!("ys false count {}", samples.phe_unwrap().count_false()); + + log::debug!("norm v"); + log::debug!("norm mt {}, {}", v[0][0], v[0][1]); + + // dense_matrix from 2d array + // https://docs.rs/smartcore/latest/src/smartcore/linalg/naive/dense_matrix.rs.html#248 + //let (v, row_n, col_n) = 
covs_val.vals_row_major_vec(); + //log::debug!("row, col {}, {}",row_n,col_n); + let mt = DenseMatrix::from_2d_vec(&v); + //let mt = DenseMatrix::from_2d_vec(&covs_val.vals_row_major()); + // default: alpha=0.0 + // confirmed to output the same result as sklearn + let lr = LogisticRegression::fit(&mt, &ys, Default::default()).unwrap(); + log::debug!("logreg"); + log::debug!("intercept {:?}", lr.intercept()); + log::debug!("coef {:?}", lr.coefficients()); + + //revert normalization + let means = samples.covs().unwrap().means(); + let stds = samples.covs().unwrap().stds(); + + let coefs_norm = lr.coefficients().iter().map(|x| *x).collect::>(); + assert_eq!(coefs_norm.len(), covs.covs_n()); + + // c=c' / std + let coefs = coefs_norm + .iter() + .zip(stds.iter()) + .map(|(c, s)| c / s) + .collect::>(); + //let coef=coefs_norm.iter().zip(means.iter()).zip(stds.iter()).map(|((c,m),s)| ) + + let intercept_norm = lr.intercept().iter().map(|x| *x).collect::>(); + log::debug!("intercept_norm {:?}", intercept_norm); + assert_eq!(intercept_norm.len(), 1); + let intercept_norm = intercept_norm[0]; + if intercept_norm.is_nan() { + panic!("Intercept is NaN."); + } - // c=c' / std - let coefs = coefs_norm - .iter() - .zip(stds.iter()) - .map(|(c, s)| c / s) - .collect::>(); - //let coef=coefs_norm.iter().zip(means.iter()).zip(stds.iter()).map(|((c,m),s)| ) + // i = i' - sum( c' * m / std) = i' - sum( c * m ) + let intercept = intercept_norm + + coefs + .iter() + .zip(means.iter()) + .map(|(c, m)| -c * m) + .sum::(); + + let mut wgts_cov: Vec = Vec::new(); + + //let mut intercept=vec![0.0f64;1]; + //let mut intercept=Vec::with_capacity(1); + //lr.intercept().copy_col_as_vec(0, &mut intercept); + //let intercept = lr.intercept().clone().to_row_vector()[0]; + + //let intercept=lr.intercept().col_iter().collect::>(); + //let intercept = lr.intercept().iter().map(|x| *x).collect::>(); + //log::debug!("intercept {:?}", intercept); + //assert_eq!(intercept.len(), 1); + //let intercept = 
intercept[0]; + if intercept.is_nan() { + panic!("Intercept is NaN."); + } + let cov_id = CovId::new_const(); + let wgt_const = Wgt::construct_cov(cov_id, Coef::Linear(intercept)); + wgts_cov.push(wgt_const); + + let cov_indexs = covs.cov_indexs().unwrap(); + for wgt_i in 0..covs.covs_n() { + let coef = coefs[wgt_i]; + if coef.is_nan() { + panic!("coef is NaN."); + } + let cov_id = CovId::new_cov(cov_indexs[wgt_i].name().to_owned()); + let wgt_cov = Wgt::construct_cov(cov_id, Coef::Linear(coef)); + wgts_cov.push(wgt_cov); + } - let intercept_norm = lr.intercept().iter().map(|x| *x).collect::>(); - log::debug!("intercept_norm {:?}", intercept_norm); - assert_eq!(intercept_norm.len(), 1); - let intercept_norm = intercept_norm[0]; - if intercept_norm.is_nan() { - panic!("Intercept is NaN."); - } + wgts_cov + } else { + // return vec![]; + // regress on const - // i = i' - sum( c' * m / std) = i' - sum( c * m ) - let intercept = intercept_norm - + coefs - .iter() - .zip(means.iter()) - .map(|(c, m)| -c * m) - .sum::(); + log::debug!("Regression on const value only."); - let mut wgts_cov: Vec = Vec::new(); + log::debug!("ys true count {}", samples.phe_unwrap().count()); + log::debug!("ys false count {}", samples.phe_unwrap().count_false()); - //let mut intercept=vec![0.0f64;1]; - //let mut intercept=Vec::with_capacity(1); - //lr.intercept().copy_col_as_vec(0, &mut intercept); - //let intercept = lr.intercept().clone().to_row_vector()[0]; + let y1_n = samples.phe_unwrap().count() as f64; + let y0_n = samples.phe_unwrap().count_false() as f64; + // TODO: test + let intercept = (y1_n / y0_n).ln(); - //let intercept=lr.intercept().col_iter().collect::>(); - //let intercept = lr.intercept().iter().map(|x| *x).collect::>(); - //log::debug!("intercept {:?}", intercept); - //assert_eq!(intercept.len(), 1); - //let intercept = intercept[0]; - if intercept.is_nan() { - panic!("Intercept is NaN."); - } - let cov_id = CovId::new_const(); - let wgt_const = Wgt::construct_cov(cov_id, 
Coef::Linear(intercept)); - wgts_cov.push(wgt_const); + // cannot run on smartcore since automatically adds const values + //let ys = samples.phe().inner_i32(); + //let sample_n = samples.samples_n(); + //let v = vec![vec![1.0f64; sample_n]]; + //let mt = DenseMatrix::from_2d_vec(&v); + //let lr = LogisticRegression::fit(&mt, &ys, Default::default()).unwrap(); - let cov_indexs = covs_val.cov_indexs().unwrap(); - for wgt_i in 0..covs_val.covs_n() { - let coef = coefs[wgt_i]; - if coef.is_nan() { - panic!("coef is NaN."); + let mut wgts_cov: Vec = Vec::new(); + if intercept.is_nan() { + panic!("Intercept is NaN."); } - let cov_id = CovId::new_cov(cov_indexs[wgt_i].name().to_owned()); - let wgt_cov = Wgt::construct_cov(cov_id, Coef::Linear(coef)); - wgts_cov.push(wgt_cov); - } + let cov_id = CovId::new_const(); + let wgt_const = Wgt::construct_cov(cov_id, Coef::Linear(intercept)); + wgts_cov.push(wgt_const); - wgts_cov + wgts_cov + } } /* // smartcore v0.2.0 @@ -388,11 +418,10 @@ pub fn logistic_regression_covs(samples: &Samples) -> Vec { wgts_cov } */ - #[cfg(test)] mod tests { //use super::*; -/* + /* #[test] fn simple_example_1() { let log_reg = LogisticRegression::default(); diff --git a/projects_rust/genetics/src/sample/io.rs b/projects_rust/genetics/src/sample/io.rs index ab31c2f..354b302 100644 --- a/projects_rust/genetics/src/sample/io.rs +++ b/projects_rust/genetics/src/sample/io.rs @@ -228,7 +228,6 @@ pub fn vals_align_id( let n = sample_id_to_n.len(); // TODO: better way? 
let mut vals_align: Vec = vec![String::from(""); n]; - //let mut vals_align: Vec = vec![String::from(""); n]; for (n_in_i, val) in vals.iter().enumerate() { let sample_id = ids[n_in_i].clone(); @@ -243,15 +242,23 @@ pub fn vals_align_id( // panic if any value is not assigned if vals_align.iter().any(|v| *v == "") { for n_in_i in 0..vals.len() { - let sample_id = ids[n_in_i].clone(); - //let sample_id = samples::sample_id(ids.0[n_in_i].clone(), &ids.1[n_in_i]); - - if let Some(ni) = sample_id_to_n.get(&sample_id) { - if vals_align[*ni] == "" { - panic!("Some sample is not found in fin_phe: {}.", sample_id); - } + if vals_align[n_in_i] == "" { + let sample_id = &ids[n_in_i]; + panic!("sample in genotype file is not in cov file: {}", sample_id); } } + + // this cannot raise error when value is nan and the sample is not in sample_id. + //for n_in_i in 0..vals.len() { + // let sample_id = ids[n_in_i].clone(); + // //let sample_id = samples::sample_id(ids.0[n_in_i].clone(), &ids.1[n_in_i]); + + // if let Some(ni) = sample_id_to_n.get(&sample_id) { + // if vals_align[*ni] == "" { + // panic!("Some sample is not found in fin_phe: {}.", sample_id); + // } + // } + //} } vals_align @@ -274,4 +281,17 @@ mod tests { let vals = vals_align_id(&vals, &ids, &id_to_n); assert_eq!(vals, vec!["3", "1"]); } + + #[test] + #[should_panic] + fn test_vals_align_id2() { + let vals: Vec = ["1", "2", "3"].iter().map(|x| x.to_string()).collect(); + let ids: Vec = ["id1", "id2", "id3"] + .iter() + .map(|x| x.to_string()) + .collect(); + let id_to_n: HashMap = HashMap::from([("id4".to_string(), 0)]); + + let vals = vals_align_id(&vals, &ids, &id_to_n); + } } diff --git a/projects_rust/genetics/src/score.rs b/projects_rust/genetics/src/score.rs index 5b00029..7d8668e 100644 --- a/projects_rust/genetics/src/score.rs +++ b/projects_rust/genetics/src/score.rs @@ -4,6 +4,9 @@ use crate::Covs; use crate::Dataset; use crate::Wgt; use crate::Wgts; +//use core::panic; +use std::hash::Hash; +use 
std::collections::HashSet; //use rayon::iter::ParallelBridge; //use rayon::prelude::*; use std::fs::File; @@ -12,6 +15,27 @@ use std::path::Path; //use crate::samples::phe::Phe; use crate::samples::prelude::*; +// (s0, s1, s2) +fn scores_add_coef(scores: &mut [f64], score_wgt: (f64, f64, f64), genot_mi: &GenotSnvRef) { + let (s0, s1, s2) = score_wgt; + scores + .iter_mut() + .zip(genot_mi.iter()) + .for_each(|(score, val)| { + let score_add = match val { + 0 => s0, + 1 => s1, + 2 => s2, + // panic for NA + //3 => sm, + _ => { + panic!("Wrong genotype. Possibly NA for linear model.") + } + }; + *score += score_add; + }) +} + // DO NOT use rayon here // since very slow since n is small pub fn add_score(scores: &mut [f64], wgt: &Wgt, genot: &Genot, covs: Option<&Covs>) { @@ -20,57 +44,69 @@ pub fn add_score(scores: &mut [f64], wgt: &Wgt, genot: &Genot, covs: Option<&Cov WgtKind::Snv(_, _, mi) => { //log::debug!("mi {}", mi.unwrap()); // This should never panic. - let genot_mi = genot.to_genot_snv(mi.unwrap()); - match wgt.wgt().model().coef() { - Coef::Linear(alpha_ti) => { - let s2 = alpha_ti * 2.0; - let s1 = alpha_ti; - let s0 = 0.0; - // not checked but this should work - scores - .iter_mut() - .zip(genot_mi.iter()) - .for_each(|(score, val)| { - let score_add = match val { - 2 => s2, - 1 => s1, - 0 => s0, - // panic for NA - //3 => sm, - _ => panic!("Wrong genotype. Possibly NA for linear model."), - }; - *score += score_add; - }) - //scores - // .iter_mut() - // .zip(genot_mi.iter()) - // .par_bridge() - // .for_each(|(score, val)| { - // let score_add = match val { - // 2 => s2, - // 1 => s1, - // 0 => s0, - // // panic for NA - // //3 => sm, - // _ => panic!("Wrong genotype. 
Possibly NA for linear model."), - // }; - // *score += score_add; - // }) - - //scores.par_iter_mut().enumerate().for_each(|(ni, score)| { - // let score_add = match genot_mi.get_val_unchecked(ni) { - // 2 => s2, - // 1 => s1, - // 0 => s0, - // // panic for NA - // //3 => sm, - // _ => panic!("Wrong code. Possibly NA for linear model."), - // }; - // *score += score_add; - //}) + match mi { + None => return, + Some(mi) => { + let genot_mi = genot.to_genot_snv(*mi); + match wgt.wgt().model().coef() { + Coef::Linear(alpha_ti) => { + let s2 = alpha_ti * 2.0; + let s1 = alpha_ti; + let s0 = 0.0; + scores_add_coef(scores, (s0, s1, s2), &genot_mi); + // not checked but this should work + //scores + // .iter_mut() + // .zip(genot_mi.iter()) + // .for_each(|(score, val)| { + // let score_add = match val { + // 2 => s2, + // 1 => s1, + // 0 => s0, + // // panic for NA + // //3 => sm, + // _ => { + // panic!("Wrong genotype. Possibly NA for linear model.") + // } + // }; + // *score += score_add; + // }) + } + Coef::Score3(score_wgt) => { + //let s2 = alpha_ti * 2.0; + //let s1 = alpha_ti; + //let s0 = 0.0; + scores_add_coef(scores, score_wgt, &genot_mi); + } + _ => unimplemented!(), + } } - _ => unimplemented!(), } + + //let genot_mi = genot.to_genot_snv(mi.unwrap()); + //match wgt.wgt().model().coef() { + // Coef::Linear(alpha_ti) => { + // let s2 = alpha_ti * 2.0; + // let s1 = alpha_ti; + // let s0 = 0.0; + // // not checked but this should work + // scores + // .iter_mut() + // .zip(genot_mi.iter()) + // .for_each(|(score, val)| { + // let score_add = match val { + // 2 => s2, + // 1 => s1, + // 0 => s0, + // // panic for NA + // //3 => sm, + // _ => panic!("Wrong genotype. 
Possibly NA for linear model."), + // }; + // *score += score_add; + // }) + // } + // _ => unimplemented!(), + //} } WgtKind::Cov(_) => { let cov_name = wgt.wgt().kind().cov().name(); @@ -135,10 +171,11 @@ pub fn add_score(scores: &mut [f64], wgt: &Wgt, genot: &Genot, covs: Option<&Cov } } +#[allow(dead_code)] pub fn score(fout_score: &Path, wgts: &Wgts, dataset: &Dataset, samples_id: &[String]) { let n = dataset.genot().n(); let genot = dataset.genot(); - let phe = dataset.samples().phe(); + let phe = dataset.samples().phe_unwrap(); let mut scores = vec![0.0f64; n]; for (wgt_i, wgt) in wgts.wgts().iter().enumerate() { @@ -153,6 +190,24 @@ pub fn score(fout_score: &Path, wgts: &Wgts, dataset: &Dataset, samples_id: &[St //io::write_scores(&fout_iteration, &scores, phe, samples_id); } +pub fn score_nowrite(wgts: &Wgts, dataset: &Dataset) -> Vec { + let n = dataset.genot().n(); + let genot = dataset.genot(); + //let phe = dataset.samples().phe(); + + let mut scores = vec![0.0f64; n]; + for (wgt_i, wgt) in wgts.wgts().iter().enumerate() { + if (wgt_i > 0) & (wgt_i % 100_000 == 0) { + log::debug!("{:?}-th SNV", wgt_i); + } + + add_score(&mut scores, wgt, genot, None); + } + + //write_scores(&fout_score, &scores, phe, samples_id); + scores +} + /* pub fn score(fout_score: &Path, wgts: &Wgts, dataset: &Dataset, samples_id: &[(String, String)]) { let n = dataset.genot().n(); let genot = dataset.genot(); @@ -172,7 +227,8 @@ pub fn score(fout_score: &Path, wgts: &Wgts, dataset: &Dataset, samples_id: &[St } */ //fn write_scores(fout: &Path, scores: &[f64], phe: &Phe, samples_id: &[(String, String)]) { -fn write_scores(fout: &Path, scores: &[f64], phe: &Phe, samples_id: &[String]) { +#[allow(dead_code)] +pub fn write_scores(fout: &Path, scores: &[f64], phe: &Phe, samples_id: &[String]) { let file = match File::create(&fout) { Ok(file) => file, Err(_) => panic!( @@ -206,3 +262,109 @@ fn write_scores(fout: &Path, scores: &[f64], phe: &Phe, samples_id: &[String]) { // -> 
.push_str seems fastest // -> could be because use with_capacity beforehand } + +pub fn write_scores_nopheno(fout: &Path, scores: &[f64], samples_id: &[String]) { + let file = match File::create(&fout) { + Ok(file) => file, + Err(_) => panic!( + "Cannot create file, possibly directory does not exist: {:?}", + &fout + ), + }; + + let mut writer = BufWriter::new(file); + // assume word count of one line is 30 + // no problem when it exceeds + let capacity = 30 * scores.len(); + let mut str = String::with_capacity(capacity); + //str.push_str("fid\tiid\tphe\trs\n"); + //str.push_str("fid\tiid\tscore\n"); + str.push_str("id\tscore\n"); + + for ni in 0..scores.len() { + str.push_str(&samples_id[ni]); + //str.push_str("\t"); + //str.push_str(&samples_id[ni]); + //str.push_str("\t"); + //str.push_str(&(phe.get_unchecked(ni) as u8).to_string()); + str.push_str("\t"); + str.push_str(&format!("{:.5}\n", scores[ni])); + } + + writer.write(str.as_bytes()).unwrap(); + + //for ni in + // use .concat(), .join()? 
+ // https://users.rust-lang.org/t/fast-string-concatenation/4425/5 + // -> .push_str seems fastest + // -> could be because use with_capacity beforehand +} + +fn is_unique(iter: T) -> bool +where + T: IntoIterator, + T::Item: Eq + Hash, +{ + let mut uniq = HashSet::new(); + iter.into_iter().all(move |x| uniq.insert(x)) +} + +pub fn write_scores_paras_nopheno( + fout: &Path, + score_paras: &[Vec], + concat_para: &str, + paras: &[String], + samples_id: &[String], +) { + if score_paras.len() != paras.len() { + panic!("score_paras.len() != paras.len()"); + } + + if !is_unique(paras) { + panic!("paras should be unique."); + } + + let file = match File::create(&fout) { + Ok(file) => file, + Err(_) => panic!( + "Cannot create file, possibly directory does not exist: {:?}", + &fout + ), + }; + + let sample_n = score_paras[0].len(); + + let mut writer = BufWriter::new(file); + // assume word count of one line is 30 + // no problem when it exceeds + let capacity = 30 * score_paras.len() * sample_n; + let mut str = String::with_capacity(capacity); + + let header: String = "id\t".to_string() + + ¶s + .iter() + .map(|x| "score_".to_string() + concat_para + "-" + x) + .collect::>() + .join("\t") + + "\n"; + + str.push_str(&header); + //str.push_str("id\tscore\n"); + + for ni in 0..sample_n { + str.push_str(&samples_id[ni]); + //str.push_str("\t"); + for scores in score_paras { + str.push_str(&format!("\t{:.5}", scores[ni])); + } + str.push_str("\n"); + } + + writer.write(str.as_bytes()).unwrap(); + + //for ni in + // use .concat(), .join()? 
+ // https://users.rust-lang.org/t/fast-string-concatenation/4425/5 + // -> .push_str seems fastest + // -> could be because use with_capacity beforehand +} diff --git a/projects_rust/genetics/src/snv/snv_index.rs b/projects_rust/genetics/src/snv/snv_index.rs index 869370c..611ae75 100644 --- a/projects_rust/genetics/src/snv/snv_index.rs +++ b/projects_rust/genetics/src/snv/snv_index.rs @@ -11,13 +11,17 @@ type Alleles = (String, String); /// all programs uses minor/major but input plink might have different one #[derive(Clone, Hash, Debug, Default)] pub struct SnvId { + // TODO: rs->id rs: String, chrom: Chrom, pos: usize, alleles: Alleles, // only for one letter alleles_flip: Alleles, + //alleles_ref_alt: Alleles, sida: String, + sid: String, + //vid: String, } // how to implement? @@ -42,6 +46,8 @@ impl SnvId { pos: &str, a1: String, a2: String, + // this is ok but need changes all over the codes; instead use in vid() + //use_snv_pos: bool, ) -> SnvId { let mut snv = SnvId { rs, @@ -50,10 +56,14 @@ impl SnvId { alleles: (a1, a2), alleles_flip: ("".to_owned(), "".to_owned()), sida: "".to_string(), + sid: "".to_string(), + //vid: "".to_string(), }; //snv.check_alleles(); snv.set_alleles_flip(); snv.set_sida(); + snv.set_sid(); + //snv.set_vid(use_snv_pos); snv } @@ -67,6 +77,7 @@ impl SnvId { alleles: ("".to_owned(), "".to_owned()), alleles_flip: ("".to_owned(), "".to_owned()), sida: "".to_string(), + sid: "".to_string(), }; //snv.check_alleles(); //snv.set_alleles_revcomp(); @@ -110,6 +121,17 @@ impl SnvId { + &self.a2(); } + fn set_sid(&mut self) { + self.sid = self.chrom.to_string() + ":" + &self.pos.to_string(); + } + + pub fn reverse_alleles(&mut self) { + // update alleles, alleles_flip, sida + self.alleles = (self.alleles.1.clone(), self.alleles.0.clone()); + self.set_alleles_flip(); + self.set_sida(); + } + pub fn rs(&self) -> &str { &self.rs } @@ -126,19 +148,31 @@ impl SnvId { &self.sida } - pub fn alleles(&self) -> (&str, &str) { + fn _sid(&self) -> 
&str { + &self.sid + } + + pub fn vid(&self, use_snv_pos: bool) -> &str { + if use_snv_pos { + &self.sid + } else { + &self.rs + } + } + + fn alleles(&self) -> (&str, &str) { (&self.alleles.0, &self.alleles.1) } - pub fn alleles_rev(&self) -> (&str, &str) { + fn alleles_rev(&self) -> (&str, &str) { (&self.alleles.1, &self.alleles.0) } - pub fn alleles_flip(&self) -> (&str, &str) { + fn alleles_flip(&self) -> (&str, &str) { (&self.alleles_flip.0, &self.alleles_flip.1) } - pub fn alleles_rev_flip(&self) -> (&str, &str) { + fn alleles_rev_flip(&self) -> (&str, &str) { (&self.alleles_flip.1, &self.alleles_flip.0) } @@ -149,17 +183,17 @@ impl SnvId { &self.alleles.1 } // flip - pub fn a1f(&self) -> &str { + fn _a1f(&self) -> &str { &self.alleles_flip.0 } // flip - pub fn a2f(&self) -> &str { + fn _a2f(&self) -> &str { &self.alleles_flip.1 } - pub fn to_sid(&self) -> String { - self.chrom.to_string() + ":" + &self.pos.to_string() - } + //pub fn to_sid(&self) -> String { + // self.chrom.to_string() + ":" + &self.pos.to_string() + //} //pub fn to_sida_rev(&self) -> String { // self.chrom.to_string() + ":" + &self.pos.to_string() + ":" + &self.a2() + ":" + &self.a1() @@ -197,12 +231,12 @@ impl SnvId { // } //} - pub fn is_one_letter(&self) -> bool { + fn is_one_letter(&self) -> bool { (self.a1().len() == 1) && (self.a2().len() == 1) } // list all candidates - pub fn flip_or_rev(&self) -> Option<((&str, &str), (&str, &str), (&str, &str), (&str, &str))> { + fn flip_or_rev(&self) -> Option<((&str, &str), (&str, &str), (&str, &str), (&str, &str))> { if !self.is_one_letter() { return None; } @@ -215,8 +249,9 @@ impl SnvId { } // to reverse genotype - pub fn is_rev(&self, snv: &SnvId) -> bool { - if self.to_sid() != snv.to_sid() { + pub fn is_rev(&self, snv: &SnvId, use_snv_pos: bool) -> bool { + //if self.sid() != snv.sid() { + if self.vid(use_snv_pos) != snv.vid(use_snv_pos) { return false; } if self.is_one_letter() { @@ -408,7 +443,7 @@ mod tests { "A".to_owned(), 
"C".to_owned(), ); - assert!(!snv_index1.is_rev(&snv_index2)); + assert!(!snv_index1.is_rev(&snv_index2, false)); let snv_index2 = SnvId::construct_snv_index( "rs1".to_owned(), @@ -417,7 +452,7 @@ mod tests { "T".to_owned(), "G".to_owned(), ); - assert!(!snv_index1.is_rev(&snv_index2)); + assert!(!snv_index1.is_rev(&snv_index2, false)); let snv_index2 = SnvId::construct_snv_index( "rs1".to_owned(), @@ -426,7 +461,7 @@ mod tests { "C".to_owned(), "A".to_owned(), ); - assert!(snv_index1.is_rev(&snv_index2)); + assert!(snv_index1.is_rev(&snv_index2, false)); let snv_index2 = SnvId::construct_snv_index( "rs1".to_owned(), @@ -435,7 +470,7 @@ mod tests { "G".to_owned(), "T".to_owned(), ); - assert!(snv_index1.is_rev(&snv_index2)); + assert!(snv_index1.is_rev(&snv_index2, false)); // alleles do not match let snv_index2 = SnvId::construct_snv_index( @@ -445,7 +480,7 @@ mod tests { "A".to_owned(), "G".to_owned(), ); - assert!(!snv_index1.is_rev(&snv_index2)); + assert!(!snv_index1.is_rev(&snv_index2, false)); // snv does not match let snv_index2 = SnvId::construct_snv_index( @@ -455,7 +490,7 @@ mod tests { "C".to_owned(), "A".to_owned(), ); - assert!(!snv_index1.is_rev(&snv_index2)); + assert!(!snv_index1.is_rev(&snv_index2, false)); } /// test of Display diff --git a/projects_rust/genetics/src/textfile/text.rs b/projects_rust/genetics/src/textfile/text.rs index 615abda..b9114a2 100644 --- a/projects_rust/genetics/src/textfile/text.rs +++ b/projects_rust/genetics/src/textfile/text.rs @@ -316,6 +316,7 @@ pub fn load_table_header_buf(buf: R) -> Vec { pub fn coli_of_header_buf(buf: &[u8], col: &str) -> Option { let header = load_table_header_buf(buf); + log::info!("header: {:?}",header); for (coli, col_header) in header.iter().enumerate() { if col == col_header { diff --git a/projects_rust/genetics/src/wgt/coef.rs b/projects_rust/genetics/src/wgt/coef.rs index 88f1a8e..fc322fa 100644 --- a/projects_rust/genetics/src/wgt/coef.rs +++ b/projects_rust/genetics/src/wgt/coef.rs @@ 
-53,7 +53,16 @@ impl Coef { } } + pub fn new_score3(scores: (f64, f64, f64)) -> Self { + Coef::Score3(scores) + } + + pub fn new_score4(scores: (f64, f64, f64, f64)) -> Self { + Coef::Score4(scores) + } + // better imple + // use coef_lr() pub fn apply_lr(self, lr: f64) -> Self { match self { Coef::Linear(x) => Coef::Linear(lr * x), diff --git a/projects_rust/genetics/src/wgt/io.rs b/projects_rust/genetics/src/wgt/io.rs index 61fdfb9..722125e 100644 --- a/projects_rust/genetics/src/wgt/io.rs +++ b/projects_rust/genetics/src/wgt/io.rs @@ -4,11 +4,14 @@ use std::collections::HashMap; use std::fs; use std::path::{Path, PathBuf}; -pub fn read_dir>(path: P) -> Vec { +//pub fn read_dir>(path: P) -> Vec { +pub fn read_dir(path: &Path) -> Vec { //log::info!("read_dir {:?}", fs::read_dir(&path).unwrap().filter_map(|entry| Some(entry.file_name().to_string_lossy().into_owned()))); + //fs::read_dir(path) + //.unwrap() fs::read_dir(path) - .unwrap() + .unwrap_or_else(|_| panic!("Directory does not exist: {}.", path.to_str().unwrap())) .filter_map(|entry| { let entry = entry.unwrap(); @@ -46,7 +49,8 @@ pub fn check_file_wgt_exist(fwgt: &Path) { } } -pub fn load_wgts_file(fwgt: &Path) -> Vec { +//pub fn load_wgts_file(fwgt: &Path, use_snv_pos: bool, is_nonadd: bool) -> Vec { +pub fn load_wgts_file(fwgt: &Path, is_nonadd: bool) -> Vec { let mut wgts: Vec = Vec::new(); // also load header @@ -62,22 +66,46 @@ pub fn load_wgts_file(fwgt: &Path) -> Vec { for col_i in 0..col_n { let col = columns[col_i].clone(); - let col_name = match col.as_str() { - "vid" | "sid" | "rs" | "sida" => "var", - "chrom" | "chr" => "chrom", - "pos" | "position" => "pos", - "A1" => "a1", - "A2" => "a2", - "wgt" => "alpha", - z => z, // unuse col - //_ => panic!("unknown column.") + // TODO: use hashmap + let col_name: Option = match col.as_str() { + "vid" | "sid" | "rs" | "sida" | "id" => Some("var".to_string()), + "chrom" | "chr" => Some("chrom".to_string()), + "pos" | "position" => 
Some("pos".to_string()), + "a1" | "A1" => Some("a1".to_string()), + "a2" | "A2" => Some("a2".to_string()), + "score0" => Some("score0".to_string()), + "score1" => Some("score1".to_string()), + "score2" => Some("score2".to_string()), + "alpha" | "wgt" => Some("alpha".to_string()), + _ => None, + //z => z, // unuse col + //_ => panic!("unknown column.") + }; + + if let Some(col_name) = col_name { + if col_to_i.contains_key(&col_name) { + panic!("Duplicated columns in wgt file.") + } else { + col_to_i.insert(col_name, col_i); + } } - .to_string(); - - col_to_i.insert(col_name, col_i); } log::debug!("col_to_i {:?}", col_to_i); + // check all variant cols are in wgt + let cols_wgt = if is_nonadd { + vec![ + "var", "chrom", "pos", "a1", "a2", "score0", "score1", "score2", + ] + } else { + vec!["var", "chrom", "pos", "a1", "a2", "alpha"] + }; + for col_wgt in cols_wgt.iter() { + if !col_to_i.contains_key(*col_wgt) { + panic!("Required column not in wgt: {}", col_wgt); + } + } + for wgt_i in 1..vss[0].len() { // for snv let snv = SnvId::construct_snv_index( @@ -88,7 +116,16 @@ pub fn load_wgts_file(fwgt: &Path) -> Vec { vss[col_to_i["a2"]][wgt_i].clone(), ); - let coef = Coef::Linear(vss[col_to_i["alpha"]][wgt_i].parse::().unwrap()); + let coef = if is_nonadd { + let scores = ( + vss[col_to_i["score0"]][wgt_i].parse::().unwrap(), + vss[col_to_i["score1"]][wgt_i].parse::().unwrap(), + vss[col_to_i["score2"]][wgt_i].parse::().unwrap(), + ); + Coef::Score3(scores) + } else { + Coef::Linear(vss[col_to_i["alpha"]][wgt_i].parse::().unwrap()) + }; let model = Model::new_coef(coef); let wgt = Wgt::construct_wgt(WgtKind::Snv(snv, None, None), model); diff --git a/projects_rust/genetics/src/wgts.rs b/projects_rust/genetics/src/wgts.rs index 18b48b4..95a1350 100644 --- a/projects_rust/genetics/src/wgts.rs +++ b/projects_rust/genetics/src/wgts.rs @@ -1,5 +1,4 @@ - -use super::{wgt,Wgt}; +use super::{wgt, Wgt}; use std::path::Path; #[derive(Debug, Clone)] @@ -10,14 +9,14 @@ pub struct 
Wgts { } impl Wgts { - pub fn new_from_file(fwgt: &Path) -> Self { - let wgts: Vec = wgt::io::load_wgts_file(fwgt); + //pub fn new_from_file(fwgt: &Path, use_snv_pos: bool,is_nonadd:bool) -> Self { + pub fn new_from_file(fwgt: &Path, is_nonadd: bool) -> Self { + let wgts: Vec = wgt::io::load_wgts_file(fwgt, is_nonadd); + //let wgts: Vec = wgt::io::load_wgts_file(fwgt,use_snv_pos,is_nonadd); // TODO: check colums match io::wgt_columns() - Wgts { - wgts, - } + Wgts { wgts } } pub fn wgts_mut(&mut self) -> &mut [Wgt] { @@ -27,4 +26,4 @@ impl Wgts { pub fn wgts(&self) -> &[Wgt] { &self.wgts } -} \ No newline at end of file +} From 9facc6b788b9f8c11e088af4334657cadafb5216 Mon Sep 17 00:00:00 2001 From: rickyota <22293266+rickyota@users.noreply.github.com> Date: Sun, 8 Oct 2023 13:35:36 +0900 Subject: [PATCH 4/6] add: pgenlib --- lib/pgenlib_rust/build.rs | 5 + lib/pgenlib_rust/src/pgenlib/CMakeLists.txt | 1 + lib/pgenlib_rust/src/pgenlib/main.cpp | 13 +- lib/pgenlib_rust/src/pgenlib/pgenlibr.cpp | 5 +- .../src/pgenlib/pgenlibr_wrapc.cpp | 379 +++++++++++++++--- lib/pgenlib_rust/src/pgenlib/pgenlibr_wrapc.h | 32 +- projects_rust/Cargo.lock | 221 +++++++++- .../boosting/src/bin/boosting_res.rs | 2 - 8 files changed, 581 insertions(+), 77 deletions(-) diff --git a/lib/pgenlib_rust/build.rs b/lib/pgenlib_rust/build.rs index 0e87866..3a515b2 100644 --- a/lib/pgenlib_rust/build.rs +++ b/lib/pgenlib_rust/build.rs @@ -23,8 +23,13 @@ fn main() { // should be same as project name in CMakeLists.txt println!("cargo:rustc-link-lib=static=pgenlib"); + // [ref](https://stackoverflow.com/questions/50642574/how-can-i-specify-linker-flags-arguments-in-a-build-script) println!("cargo:rustc-link-arg=-fopenmp"); + println!("cargo:rustc-link-lib=gomp"); + // added 230823 + // [ref](https://github.com/rust-or/highs-sys/blob/master/build.rs) + //println!("cargo:rustc-link-lib=dylib=gomp"); diff --git a/lib/pgenlib_rust/src/pgenlib/CMakeLists.txt b/lib/pgenlib_rust/src/pgenlib/CMakeLists.txt 
index 64ac580..f07b9c2 100644 --- a/lib/pgenlib_rust/src/pgenlib/CMakeLists.txt +++ b/lib/pgenlib_rust/src/pgenlib/CMakeLists.txt @@ -1,5 +1,6 @@ # [ref](https://www.hiroom2.com/2016/09/07/makefile%E3%82%92cmakelists-txt%E3%81%AB%E7%BD%AE%E3%81%8D%E6%8F%9B%E3%81%88%E3%82%8B/) +# [musl-gcc](https://github.com/actions-rs/toolchain/issues/102#issuecomment-1132620977) cmake_minimum_required(VERSION 3.10) diff --git a/lib/pgenlib_rust/src/pgenlib/main.cpp b/lib/pgenlib_rust/src/pgenlib/main.cpp index 26e2f7f..ed6509f 100644 --- a/lib/pgenlib_rust/src/pgenlib/main.cpp +++ b/lib/pgenlib_rust/src/pgenlib/main.cpp @@ -1,21 +1,20 @@ #include -//#include "pgenlibr_wrapc.cpp" -//#include "pgenlibr_wrapc.h" -//#include "wrapper.h" +// #include "pgenlibr_wrapc.cpp" +// #include "pgenlibr_wrapc.h" +// #include "wrapper.h" -//#include "stdio.h" +// #include "stdio.h" int main(int argc, char *argv[]) { - //printf("cgd %d \n", gcd(6, 9)); + // printf("cgd %d \n", gcd(6, 9)); /* PgenReader *pread = pgenreader_create(); pgenreader_get_a(pread); */ - - const char* x="abc"; + const char *x = "abc"; std::string str(x); printf("cgd %d \n", 3); diff --git a/lib/pgenlib_rust/src/pgenlib/pgenlibr.cpp b/lib/pgenlib_rust/src/pgenlib/pgenlibr.cpp index 55f2045..1204b16 100644 --- a/lib/pgenlib_rust/src/pgenlib/pgenlibr.cpp +++ b/lib/pgenlib_rust/src/pgenlib/pgenlibr.cpp @@ -76,7 +76,6 @@ void PgenReader::Load(std::string filename, uint32_t cur_sample_ct, std::vector< exit(-1); } - plink2::PreinitPgfi(_info_ptr); uint32_t cur_variant_ct = UINT32_MAX; const char *fname = filename.c_str(); @@ -177,6 +176,7 @@ void PgenReader::Load(std::string filename, uint32_t cur_sample_ct, std::vector< if(!plink2::PgrGetFreadBuf(_state_ptr[i])) { plink2::aligned_free(pgr_alloc); } + // Reducing nthr might help. 
sprintf(errstr_buf, "PgrInit() error %d", static_cast(reterr)); fprintf(stderr, "%s\n", errstr_buf); exit(-1); @@ -365,6 +365,7 @@ void PgenReader::Read(double *buf, size_t const &n, int const &thr, int variant_ fprintf(stderr, "%s\n", errstr_buf); exit(-1); } + uint32_t dosage_ct; plink2::PglErr reterr = PgrGet1D(_subset_include_vec[thr], _subset_index[thr], _subset_size[thr], variant_idx, allele_idx, _state_ptr[thr], _pgv[thr]->genovec, _pgv[thr]->dosage_present, _pgv[thr]->dosage_main, &dosage_ct); @@ -374,6 +375,8 @@ void PgenReader::Read(double *buf, size_t const &n, int const &thr, int variant_ fprintf(stderr, "%s\n", errstr_buf); exit(-1); } + + // this part raise error on omp; -> due to shared buf plink2::Dosage16ToDoubles(kGenoRDoublePairs, _pgv[thr]->genovec, _pgv[thr]->dosage_present, _pgv[thr]->dosage_main, _subset_size[thr], dosage_ct, buf); } diff --git a/lib/pgenlib_rust/src/pgenlib/pgenlibr_wrapc.cpp b/lib/pgenlib_rust/src/pgenlib/pgenlibr_wrapc.cpp index 2e016f3..a86a630 100644 --- a/lib/pgenlib_rust/src/pgenlib/pgenlibr_wrapc.cpp +++ b/lib/pgenlib_rust/src/pgenlib/pgenlibr_wrapc.cpp @@ -37,7 +37,9 @@ MyClass::~MyClass() {} MyClass *myclass_create() { return new MyClass(); } int myclass_get_abc(MyClass *m) { return m->get_abc(); } void myclass_delete(MyClass *m) {} - +/* +// size_t pgenreader_load_whole(double *genot, const char *filename, int snv_n_in, int sample_n_in, int *sample_subset, int sample_subset_n, int nthr) +// { int pgenreader_load_whole(double *genot, const char *filename, int snv_n_in, int sample_n_in, int *sample_subset, int sample_subset_n, int nthr) { PgenReader *pread = new PgenReader(); std::string fname(filename); @@ -48,26 +50,41 @@ int pgenreader_load_whole(double *genot, const char *filename, int snv_n_in, int for(int &d : sample_subset_1based_v) { d += 1; } - // printf("sample_subset_1based %d\n", sample_subset_1based_v[0]); pread->Load(fname, sample_n_in, sample_subset_1based_v, nthr); - // printf("outside of openmp\n"); 
- int num_threads = 1; - int thread_num = 0; - int mi; + size_t mi; +#ifdef _OPENMP + omp_set_num_threads(nthr); +#endif + +#ifdef _OPENMP + size_t const thrn = omp_get_max_threads(); +#else + size_t const thrn = 1; +#endif + double **buf_thrs = static_cast(malloc(sizeof(double *) * thrn)); + for(size_t thri = 0; thri < thrn; thri++) { + double *buf_thr = static_cast(malloc(sizeof(double) * sample_subset_n)); + buf_thrs[thri] = buf_thr; + } + #ifdef _OPENMP - //omp_set_num_threads(nthr); - //omp_get_num_threads(); - //printf("#threads %d\n", omp_get_num_threads()); #pragma omp parallel for private(mi) #endif for(mi = 0; mi < snv_n_in; mi++) { #ifdef _OPENMP - thread_num = omp_get_thread_num(); - //printf("thread# %d\n", thread_num); + size_t thri = omp_get_thread_num(); +#else + size_t thri = 0; #endif - pread->Read(genot + mi * sample_subset_n, sample_subset_n, thread_num, mi, 1); + + double *buf_thr = buf_thrs[thri]; + + pread->Read(buf_thr, sample_subset_n, thri, mi, 1); + std::copy(buf_thr, buf_thr + sample_subset_n, genot + mi * sample_subset_n); + + // pread->Read(genot + mi * sample_subset_n, sample_subset_n, thread_num, mi, 1); } // run in parallel @@ -75,10 +92,13 @@ int pgenreader_load_whole(double *genot, const char *filename, int snv_n_in, int // pread->Read(genot, sample_n, 0, 0, 1); // pread->ReadHardcalls(genot, sample_n, 0, 1, 1); return nthr; -} - -int pgenreader_load_snvs(double *genot, const char *filename, int snv_start, int snv_end, int sample_n_in, int *sample_subset, int sample_subset_n, - int nthr) { +} */ +/* +// TODO: use_snvs; this can speed up a lot by omitting Read() +// size_t pgenreader_load_snvs(double *genot, const char *filename, size_t snv_start, size_t snv_end, size_t sample_n_in, int *sample_subset, +// size_t sample_subset_n, size_t nthr) { +int pgenreader_load_snvs(double *genot, const char *filename, int snv_start, int snv_end, int sample_n_in, int const *const sample_subset, + int sample_subset_n, int nthr) { PgenReader 
*pread = new PgenReader(); std::string fname(filename); @@ -94,51 +114,304 @@ int pgenreader_load_snvs(double *genot, const char *filename, int snv_start, int pread->Load(fname, sample_n_in, sample_subset_1based_v, nthr); - /* // no error - int thread_num = 0; - int mi; - for(mi = 0; mi < snv_end - snv_start; mi++) { - pread->Read(genot + mi * sample_subset_n, sample_subset_n, thread_num, mi + snv_start, 1); - } */ + size_t mi; +#ifdef _OPENMP + // disable dynamic + // https://stackoverflow.com/questions/11095309/openmp-set-num-threads-is-not-working + // omp_set_dynamic(0); + omp_set_num_threads(nthr); +#endif - // still error even for nthread=1 - // int num_threads = 1; - int mi; - // TODO: create Vec> for buf -//#ifdef _OPENMP -// omp_set_dynamic(0); -// printf("cpp nthr %d\n", nthr); -// omp_set_num_threads(nthr); -// printf("omp #threads %d\n", omp_get_num_threads()); -//#pragma omp parallel for private(mi) -//#endif +#ifdef _OPENMP + size_t const thrn = omp_get_max_threads(); +#else + size_t const thrn = 1; +#endif + double **buf_thrs = static_cast(malloc(sizeof(double *) * thrn)); + for(size_t thri = 0; thri < thrn; thri++) { + double *buf_thr = static_cast(malloc(sizeof(double) * sample_subset_n)); + buf_thrs[thri] = buf_thr; + } + +#ifdef _OPENMP +#pragma omp parallel for private(mi) +#endif for(mi = 0; mi < snv_end - snv_start; mi++) { - int thread_num = 0; -//#ifdef _OPENMP -// thread_num = omp_get_thread_num(); -// if(mi % 10000 == 0) { -// printf("omp thread# %d\n", thread_num); -// } -//#endif - pread->Read(genot + mi * sample_subset_n, sample_subset_n, thread_num, mi + snv_start, 1); +#ifdef _OPENMP + size_t thri = omp_get_thread_num(); +#else + size_t thri = 0; +#endif + + double *buf_thr = buf_thrs[thri]; // should use ReadHardcalls? 
- // load buf to genot + pread->Read(buf_thr, sample_subset_n, thri, mi + snv_start, 1); + std::copy(buf_thr, buf_thr + sample_subset_n, genot + mi * sample_subset_n); + + // pread->Read(genot + mi * sample_subset_n, sample_subset_n, thread_num, mi + snv_start, 1); + } + + return nthr; +} */ + +// genot double* : #snvs(true in use_snvs) * sample_subset_n +int pgenreader_load_snvs_extract(int8_t *genot, const char *filename, int snv_start, int snv_end, bool const *const use_snvs, int sample_n_in, + int const *const sample_subset, int sample_subset_n, int nthr) { + + PgenReader *pread = new PgenReader(); + std::string fname(filename); + + // might have better way + std::vector sample_subset_1based_v(sample_subset_n); + std::copy(sample_subset, sample_subset + sample_subset_n, sample_subset_1based_v.begin()); + for(int &d : sample_subset_1based_v) { + d += 1; + } + + size_t const snv_len = snv_end - snv_start; + + // TODO: use map + // std::unordered_map snv_index; + size_t *snv_index = static_cast(malloc(sizeof(size_t) * snv_len)); + size_t use_index = 0; + for(size_t mi = 0; mi < snv_len; mi++) { + if(use_snvs[mi]) { + snv_index[mi] = use_index; + use_index++; + } else { + // otherwise not initialized + snv_index[mi] = SIZE_MAX; + } + } + + //printf("use snvs %zu\n", use_index); + + //printf("cpp nthr %d\n", nthr); + pread->Load(fname, sample_n_in, sample_subset_1based_v, nthr); + + size_t mi; +#ifdef _OPENMP + // disable dynamic + // https://stackoverflow.com/questions/11095309/openmp-set-num-threads-is-not-working + // omp_set_dynamic(0); + omp_set_num_threads(nthr); +#endif + +#ifdef _OPENMP + size_t const thrn = omp_get_max_threads(); +#else + size_t const thrn = 1; +#endif + double **buf_thrs = static_cast(malloc(sizeof(double *) * thrn)); + for(size_t thri = 0; thri < thrn; thri++) { + buf_thrs[thri] = static_cast(malloc(sizeof(double) * sample_subset_n)); } - // this is wrong way - /* int mi; - #ifdef _OPENMP - printf("in openmp\n"); - #pragma omp parallel 
for private(mi) - #endif - for(mi = 0; mi < snv_end - snv_start; mi++) { - pread->Read(genot + mi * sample_subset_n, sample_subset_n, 0, mi + snv_start, 1); - } */ +#ifdef _OPENMP +#pragma omp parallel for private(mi) +#endif + for(mi = 0; mi < snv_len; mi++) { + // for(mi = 0; mi < snv_end - snv_start; mi++) { + + if(!use_snvs[mi]) { + continue; + } + +#ifdef _OPENMP + size_t const thri = omp_get_thread_num(); +#else + size_t const thri = 0; +#endif + + double *buf_thr = buf_thrs[thri]; + // should use ReadHardcalls? + pread->Read(buf_thr, sample_subset_n, thri, mi + snv_start, 1); + + size_t const use_index = snv_index[mi]; + // printf("mi, use_index %zu, %zu\n", mi, use_index); + + for(size_t samplei = 0; samplei < sample_subset_n; samplei++) { + genot[use_index * sample_subset_n + samplei] = (int8_t)buf_thr[samplei]; + } + + // pread->Read(genot + mi * sample_subset_n, sample_subset_n, thread_num, mi + snv_start, 1); + } return nthr; } -int pgenreader_load_snv(double *genot, const char *filename, int snv_i, int sample_n_in, int *sample_subset, int sample_subset_n, int nthr) { +// Problem was due to int overflow +// mi * sample_n was overflowed +// int pgenreader_load_snvs_debug(double *genot, const char *filename, int snv_start, int snv_end, int sample_n_in, int *sample_subset, int +// sample_subset_n, +// int nthr) { +// PgenReader *pread = new PgenReader(); +// std::string fname(filename); +// +// // might have better way +// std::vector sample_subset_1based_v(sample_subset_n); +// std::copy(sample_subset, sample_subset + sample_subset_n, sample_subset_1based_v.begin()); +// for(int &d : sample_subset_1based_v) { +// d += 1; +// } +// // printf("sample_subset_1based %d\n", sample_subset_1based_v[0]); +// +// printf("cpp nthr %d\n", nthr); +// pread->Load(fname, sample_n_in, sample_subset_1based_v, nthr); +// +// /* // no error +// int thread_num = 0; +// int mi; +// for(mi = 0; mi < snv_end - snv_start; mi++) { +// pread->Read(genot + mi * sample_subset_n, 
sample_subset_n, thread_num, mi + snv_start, 1); +// } */ +// +// // still error even for nthread=1 +// // int num_threads = 1; +// size_t mi; +// // TODO: create Vec> for buf +// // omp_set_dynamic(0); +// printf("bfr in openmp %d\n", nthr); +// #ifdef _OPENMP +// // disable dynamic +// // https://stackoverflow.com/questions/11095309/openmp-set-num-threads-is-not-working +// omp_set_dynamic(0); +// printf("in openmp %d\n", nthr); +// printf("cpp nthr %d\n", nthr); +// // value of OMP_NUM_THREADS=8 +// printf("bfr set omp max #threads %d\n", omp_get_max_threads()); +// // printf("bfr set omp #threads %d\n", omp_get_num_threads()); +// omp_set_num_threads(nthr); +// printf("omp max #threads %d\n", omp_get_max_threads()); +// // always 1 since outside of parallel?? +// // printf("omp #threads %d\n", omp_get_num_threads()); +// #endif +// +// // THIS WORKED!! +// // int mtmp; +// // #ifdef _OPENMP +// // #pragma omp parallel for +// // #endif +// // for(mtmp = 0; mtmp < 2; mtmp++) { +// // int thread_num = omp_get_thread_num(); +// // printf("mtmp omp thread# %d / %d\n", thread_num, omp_get_num_threads()); +// // } +// +// // ok +// // int mitest; +// // #ifdef _OPENMP +// // #pragma omp parallel for +// // #endif +// // for(mitest = 0; mitest < snv_end - snv_start; mitest++) { +// // int thread_num = omp_get_thread_num(); +// // printf("mitest omp thread# %d / %d\n", thread_num, omp_get_num_threads()); +// // } +// +// // #ifdef _OPENMP +// // size_t const thrn = omp_get_max_threads(); +// // size_t const thrn = omp_get_num_threads(); +// // #else +// // size_t const thrn = 1; +// // #endif +// size_t thrn = nthr; +// //printf("num thrn %d\n", omp_get_num_threads()); +// //printf("max thrn %d\n", omp_get_max_threads()); +// //printf("thrn %d\n", thrn); +// // correct? 
+// double **buf_thrs = static_cast(malloc(sizeof(double *) * thrn)); +// for(size_t thri = 0; thri < thrn; thri++) { +// double *buf_thr = static_cast(malloc(sizeof(double) * sample_subset_n)); +// buf_thrs[thri] = buf_thr; +// //printf("buf_thr buf_thr %d, %f\n", thri, buf_thr[0]); +// } +// +// // this is ok! +//// int mitest; +////#ifdef _OPENMP +////#pragma omp parallel for private(mitest) +////#endif +//// for(mitest = 0; mitest < 100; mitest++) { +//// int thread_num = omp_get_thread_num(); +//// printf("mitest omp thread# %d / %d\n", thread_num, omp_get_num_threads()); +//// // ok +//// // double *buf_thr = static_cast(malloc(sizeof(double) * sample_subset_n)); +//// // ng +//// // size_t thri = omp_get_thread_num(); +//// // double *buf_thr = buf_thrs[thri]; +//// // printf("mitest buf_thr %f\n", buf_thr[0]); +//// // ok ; why ? +//// // std::copy(buf_thr, buf_thr + sample_subset_n, genot + mitest * sample_subset_n); +//// } +//// +//// printf("mi max %d\n", snv_end - snv_start); +// +//// this section makes abortion when #pragma is on +// #ifdef _OPENMP +// #pragma omp parallel for private(mi) +// #endif +// for(mi = 0; mi < snv_end - snv_start; mi++) { +// int thread_num = 0; +// #ifdef _OPENMP +// thread_num = omp_get_thread_num(); +// // if(mi % 10000 == 0) { +// //if(mi < 20) { +// // printf("omp thread# %d / %d\n", thread_num, omp_get_num_threads()); +// //} +// // printf("omp thread# %d / %d\n", thread_num, omp_get_num_threads()); +// #endif +// // printf("args %d, %d, %d, %d\n", sample_subset_n, thread_num, mi + snv_start, 1); +// +// // ok; no omp error +// // double *buf_thr = static_cast(malloc(sizeof(double) * sample_subset_n)); +// // ok; no omp error +// size_t thri = omp_get_thread_num(); +// double *buf_thr = buf_thrs[thri]; +// pread->Read(buf_thr, sample_subset_n, thread_num, mi + snv_start, 1); +// +// // pread->Read(genot + mi * sample_subset_n, sample_subset_n, thread_num, mi + snv_start, 1); +// +// // ng +// std::copy(buf_thr, 
buf_thr + sample_subset_n, genot + mi * sample_subset_n); +// // if(mi < 20) { +// //if(thri == 1) { +// // printf("thri buf_thr %d, %f\n", thri, buf_thr[0]); +// //} +// //} +// +// // ok +// // if(thri == 0) { +// // printf("thr0 copying mi %d\n", mi); +// // std::copy(buf_thr, buf_thr + sample_subset_n, genot + mi * sample_subset_n); +// //} +// +// // ng -> due to int overflow +// // mi, sample_subset_n was int, so overflowed +// //if(thri == 1) { +// // printf("thr1 copying mi %d\n", mi); +// // // std::copy(buf_thr, buf_thr + sample_subset_n, genot + mi * sample_subset_n); +// // printf("index %zu", mi * sample_subset_n); +// // // printf("access %d", genot[mi * sample_subset_n]); +// //} +// +// // ng +// // for(size_t samplei = 0; samplei < sample_subset_n; samplei++) { +// // genot[mi * sample_subset_n + samplei] = buf_thr[samplei]; +// //} +// +// // if(thri == 0) { +// // printf("thr0 for copying mi %d\n", mi); +// // for(size_t samplei = 0; samplei < sample_subset_n; samplei++) { +// // genot[mi * sample_subset_n + samplei] = buf_thr[samplei]; +// // } +// // } +// +// // should use ReadHardcalls? 
+// } +// +// return nthr; +// } + +/* int pgenreader_load_snv(double *genot, const char *filename, int snv_i, int sample_n_in, int *sample_subset, int sample_subset_n, int nthr) { PgenReader *pread = new PgenReader(); std::string fname(filename); @@ -155,7 +428,7 @@ int pgenreader_load_snv(double *genot, const char *filename, int snv_i, int samp pread->Read(genot, sample_subset_n, 0, snv_i, 1); return nthr; -} +} */ int pgenreader_get_a() { PgenReader *pread = new PgenReader(); diff --git a/lib/pgenlib_rust/src/pgenlib/pgenlibr_wrapc.h b/lib/pgenlib_rust/src/pgenlib/pgenlibr_wrapc.h index ee8b0c6..fd83f83 100644 --- a/lib/pgenlib_rust/src/pgenlib/pgenlibr_wrapc.h +++ b/lib/pgenlib_rust/src/pgenlib/pgenlibr_wrapc.h @@ -2,9 +2,12 @@ // https://akitsu-sanae.hatenablog.com/entry/2016/12/21/010321 // https://nachtimwald.com/2017/08/18/wrapping-c-objects-in-() -//#pragma once -//#include "pgenlibr.h" -//#include +// #pragma once +// #include "pgenlibr.h" +// #include + +// for int8_t +#include void foo(); int gcd(int a, int b); @@ -30,9 +33,25 @@ int myclass_get_abc(MyClass *m); int pgenreader_get_a(); -int pgenreader_load_whole(double *genot, const char *filename,int snv_n, int sample_n, int *sample_subset, int sample_subset_n, int nthr); -int pgenreader_load_snvs(double *genot, const char *filename,int snv_start,int snv_end, int sample_n, int *sample_subset, int sample_subset_n, int nthr); -int pgenreader_load_snv(double *genot, const char *filename,int snv_i, int sample_n, int *sample_subset, int sample_subset_n, int nthr); +// TODO: how to use size_t?? +// std::size_t pgenreader_load_snvs_extract(double *genot, const char *filename,std::size_t snv_start,std::size_t snv_end, std::size_t sample_n, int +// *sample_subset, std::size_t sample_subset_n, std::size_t nthr); + +// only make this since debugging in C is troublesome +// create pgenreader_load_whole() etc. 
in rust +// signed char is i32 in rust +//int pgenreader_load_snvs_extract(signed int *genot, const char *filename, int snv_start, int snv_end, bool const *const use_snvs, int sample_n_in, +int pgenreader_load_snvs_extract(int8_t *genot, const char *filename, int snv_start, int snv_end, bool const *const use_snvs, int sample_n_in, + int const *const sample_subset, int sample_subset_n, int nthr); + +// int pgenreader_load_whole(double *genot, const char *filename, int snv_n, int sample_n_in, int *sample_subset, int sample_subset_n, int nthr); +// int pgenreader_load_snvs(double *genot, const char *filename, int snv_start, int snv_end, int sample_n, int const *const sample_subset, +// int sample_subset_n, int nthr); +// int pgenreader_load_snv(double *genot, const char *filename, int snv_i, int sample_n_in, int *sample_subset, int sample_subset_n, int nthr); + +/* int pgenreader_load_whole(double *genot, const char *filename,int snv_n, int sample_n, int *sample_subset, int sample_subset_n, int nthr); +int pgenreader_load_snvs(double *genot, const char *filename,int snv_start,int snv_end, int sample_n, int *sample_subset, int sample_subset_n, int +nthr); int pgenreader_load_snv(double *genot, const char *filename,int snv_i, int sample_n, int *sample_subset, int sample_subset_n, int nthr); */ /* PgenReader *pgenreader_create(); void pgenreader_delete(PgenReader *pread); @@ -41,7 +60,6 @@ int pgenreader_int_abc(PgenReader *pread, int a); int pgenreader_get_a(PgenReader *pread); */ /* void pgenreader_loadallsample(PgenReader *pread, const char *fname); */ - /* #ifdef __cplusplus extern "C" { diff --git a/projects_rust/Cargo.lock b/projects_rust/Cargo.lock index d988fa7..d379274 100644 --- a/projects_rust/Cargo.lock +++ b/projects_rust/Cargo.lock @@ -2,6 +2,17 @@ # It is not intended for manual editing. 
version = 3 +[[package]] +name = "ahash" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +dependencies = [ + "getrandom", + "once_cell", + "version_check", +] + [[package]] name = "aho-corasick" version = "1.0.1" @@ -111,7 +122,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn", + "syn 2.0.18", "which", ] @@ -229,7 +240,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.18", ] [[package]] @@ -348,7 +359,7 @@ dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "memoffset", + "memoffset 0.8.0", "scopeguard", ] @@ -454,6 +465,15 @@ dependencies = [ "zstd", ] +[[package]] +name = "genetics_for_py" +version = "0.1.0" +dependencies = [ + "genetics", + "numpy", + "pyo3", +] + [[package]] name = "getrandom" version = "0.2.10" @@ -513,6 +533,12 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +[[package]] +name = "indoc" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa799dd5ed20a7e349f3b4639aa80d74549c81716d9ec4f994c9b5815598306" + [[package]] name = "io-lifetimes" version = "1.0.11" @@ -603,6 +629,16 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +[[package]] +name = "lock_api" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +dependencies = [ + "autocfg", + "scopeguard", +] + [[package]] name = "log" version = "0.4.18" @@ -631,6 +667,15 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +[[package]] +name = 
"memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +dependencies = [ + "autocfg", +] + [[package]] name = "memoffset" version = "0.8.0" @@ -775,6 +820,21 @@ dependencies = [ "libc", ] +[[package]] +name = "numpy" +version = "0.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a462c1af5ba1fddec1488c4646993a23ae7931f9e170ccba23e9c7c834277797" +dependencies = [ + "ahash", + "libc", + "ndarray", + "num-complex", + "num-integer", + "num-traits", + "pyo3", +] + [[package]] name = "once_cell" version = "1.17.2" @@ -787,6 +847,29 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -808,6 +891,18 @@ version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +[[package]] +name = "playground" +version = "0.1.0" +dependencies = [ + "criterion", + "env_logger", + "log", + "rayon", + "sysinfo", + "zstd", +] + [[package]] name = "plotters" version = "0.3.4" @@ -849,7 +944,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"43ded2b5b204571f065ab8540367d738dfe1b3606ab9eb669dcfb5e7a3a07501" dependencies = [ "proc-macro2", - "syn", + "syn 2.0.18", ] [[package]] @@ -861,6 +956,66 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "pyo3" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "268be0c73583c183f2b14052337465768c07726936a260f480f0857cb95ba543" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset 0.6.5", + "parking_lot", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28fcd1e73f06ec85bf3280c48c67e731d8290ad3d730f8be9dc07946923005c8" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f6cb136e222e49115b3c51c32792886defbfb0adead26a688142b346a0b9ffc" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94144a1266e236b1c932682136dc35a9dee8d3589728f68130c7c3861ef96b28" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8df9be978a2d2f0cdebabb03206ed73b11314701a5bfe71b0d753b81997777f" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "quote" version = "1.0.28" @@ -928,6 +1083,15 @@ dependencies = [ "num_cpus", ] +[[package]] +name = "redox_syscall" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags", +] + [[package]] name = "regex" 
version = "1.8.3" @@ -1013,7 +1177,7 @@ checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.18", ] [[package]] @@ -1033,12 +1197,29 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" +[[package]] +name = "smallvec" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" + [[package]] name = "strsim" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.18" @@ -1065,6 +1246,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "target-lexicon" +version = "0.12.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d0e916b1148c8e263850e1ebcbd046f333e0683c724876bb0da63ea4373dc8a" + [[package]] name = "termcolor" version = "1.2.0" @@ -1074,6 +1261,14 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "test_pgenlib" +version = "0.1.0" +dependencies = [ + "bindgen", + "pgenlib_rust", +] + [[package]] name = "textwrap" version = "0.11.0" @@ -1105,12 +1300,24 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" +[[package]] +name = "unindent" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1766d682d402817b5ac4490b3c3002d91dfa0d22812f341609f97b08757359c" + [[package]] 
name = "utf8parse" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + [[package]] name = "walkdir" version = "2.3.3" @@ -1148,7 +1355,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 2.0.18", "wasm-bindgen-shared", ] @@ -1170,7 +1377,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.18", "wasm-bindgen-backend", "wasm-bindgen-shared", ] diff --git a/projects_rust/boosting/src/bin/boosting_res.rs b/projects_rust/boosting/src/bin/boosting_res.rs index da3497f..100c509 100644 --- a/projects_rust/boosting/src/bin/boosting_res.rs +++ b/projects_rust/boosting/src/bin/boosting_res.rs @@ -29,8 +29,6 @@ // samples indicate both fid and iid // TODO: trimming sample weights: only use samples with large weights on choosing SNVs: Friedman, J., Hastie, T. and Tibshirani, R. (2000) ‘Additive logistic regression: a statistical view of boosting (With discussion and a rejoinder by the authors)’, Annals of statistics, 28(2), pp. 337–407. doi:10.1214/aos/1016218223. 
// split main for genoboost_pruning, ssgenoboost -// TODO: use ref/alt not A1/A2 and fix lims2 -// TODO: format of use_snvs, use_sample should be the same as plink --extract use clap::{ArgGroup, Args, Parser, Subcommand, ValueEnum}; //use crate::boosting::{BoostMethod, BoostParam, IterationNumber}; From 83951d2749a5784f5d996d96fafcb85c1e1c4d1e Mon Sep 17 00:00:00 2001 From: rickyota <22293266+rickyota@users.noreply.github.com> Date: Sun, 8 Oct 2023 13:41:05 +0900 Subject: [PATCH 5/6] add: bash --- genoboost.cv.sh | 2 +- genoboost.docker.sh | 3 +- genoboost.sh | 2 +- projects_rust/Cargo.lock | 221 ++------------------------------------- 4 files changed, 11 insertions(+), 217 deletions(-) diff --git a/genoboost.cv.sh b/genoboost.cv.sh index d307636..5a73b9b 100644 --- a/genoboost.cv.sh +++ b/genoboost.cv.sh @@ -24,7 +24,7 @@ cp ./projects_rust/target/release/genoboost ./genoboost --file-phe "$file_cov" \ --cov age,sex \ --cross-validation 5 \ - --major_a2_train + --major-a2-train # score ./genoboost score \ diff --git a/genoboost.docker.sh b/genoboost.docker.sh index b236f51..dea0076 100644 --- a/genoboost.docker.sh +++ b/genoboost.docker.sh @@ -22,7 +22,8 @@ function genoboost-docker() { --dir "$dir_wgt" \ --file-genot "$file_plink" \ --file-phe "$file_cov" \ - --cov age,sex + --cov age,sex \ + --major-a2-train # score ./genoboost-docker score \ diff --git a/genoboost.sh b/genoboost.sh index 84fa5f9..8e06401 100644 --- a/genoboost.sh +++ b/genoboost.sh @@ -26,7 +26,7 @@ cp ./projects_rust/target/release/genoboost ./genoboost --file-genot "$file_plink" \ --file-phe "$file_cov" \ --cov age,sex \ - --major_a2_train + --major-a2-train # score ./genoboost score \ diff --git a/projects_rust/Cargo.lock b/projects_rust/Cargo.lock index d379274..d988fa7 100644 --- a/projects_rust/Cargo.lock +++ b/projects_rust/Cargo.lock @@ -2,17 +2,6 @@ # It is not intended for manual editing. 
version = 3 -[[package]] -name = "ahash" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" -dependencies = [ - "getrandom", - "once_cell", - "version_check", -] - [[package]] name = "aho-corasick" version = "1.0.1" @@ -122,7 +111,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.18", + "syn", "which", ] @@ -240,7 +229,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.18", + "syn", ] [[package]] @@ -359,7 +348,7 @@ dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "memoffset 0.8.0", + "memoffset", "scopeguard", ] @@ -465,15 +454,6 @@ dependencies = [ "zstd", ] -[[package]] -name = "genetics_for_py" -version = "0.1.0" -dependencies = [ - "genetics", - "numpy", - "pyo3", -] - [[package]] name = "getrandom" version = "0.2.10" @@ -533,12 +513,6 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" -[[package]] -name = "indoc" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa799dd5ed20a7e349f3b4639aa80d74549c81716d9ec4f994c9b5815598306" - [[package]] name = "io-lifetimes" version = "1.0.11" @@ -629,16 +603,6 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" -[[package]] -name = "lock_api" -version = "0.4.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" -dependencies = [ - "autocfg", - "scopeguard", -] - [[package]] name = "log" version = "0.4.18" @@ -667,15 +631,6 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" -[[package]] -name = 
"memoffset" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" -dependencies = [ - "autocfg", -] - [[package]] name = "memoffset" version = "0.8.0" @@ -820,21 +775,6 @@ dependencies = [ "libc", ] -[[package]] -name = "numpy" -version = "0.17.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a462c1af5ba1fddec1488c4646993a23ae7931f9e170ccba23e9c7c834277797" -dependencies = [ - "ahash", - "libc", - "ndarray", - "num-complex", - "num-integer", - "num-traits", - "pyo3", -] - [[package]] name = "once_cell" version = "1.17.2" @@ -847,29 +787,6 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" -[[package]] -name = "parking_lot" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-targets", -] - [[package]] name = "peeking_take_while" version = "0.1.2" @@ -891,18 +808,6 @@ version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" -[[package]] -name = "playground" -version = "0.1.0" -dependencies = [ - "criterion", - "env_logger", - "log", - "rayon", - "sysinfo", - "zstd", -] - [[package]] name = "plotters" version = "0.3.4" @@ -944,7 +849,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"43ded2b5b204571f065ab8540367d738dfe1b3606ab9eb669dcfb5e7a3a07501" dependencies = [ "proc-macro2", - "syn 2.0.18", + "syn", ] [[package]] @@ -956,66 +861,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "pyo3" -version = "0.17.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "268be0c73583c183f2b14052337465768c07726936a260f480f0857cb95ba543" -dependencies = [ - "cfg-if", - "indoc", - "libc", - "memoffset 0.6.5", - "parking_lot", - "pyo3-build-config", - "pyo3-ffi", - "pyo3-macros", - "unindent", -] - -[[package]] -name = "pyo3-build-config" -version = "0.17.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28fcd1e73f06ec85bf3280c48c67e731d8290ad3d730f8be9dc07946923005c8" -dependencies = [ - "once_cell", - "target-lexicon", -] - -[[package]] -name = "pyo3-ffi" -version = "0.17.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f6cb136e222e49115b3c51c32792886defbfb0adead26a688142b346a0b9ffc" -dependencies = [ - "libc", - "pyo3-build-config", -] - -[[package]] -name = "pyo3-macros" -version = "0.17.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94144a1266e236b1c932682136dc35a9dee8d3589728f68130c7c3861ef96b28" -dependencies = [ - "proc-macro2", - "pyo3-macros-backend", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "pyo3-macros-backend" -version = "0.17.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8df9be978a2d2f0cdebabb03206ed73b11314701a5bfe71b0d753b81997777f" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "quote" version = "1.0.28" @@ -1083,15 +928,6 @@ dependencies = [ "num_cpus", ] -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags", -] - [[package]] name = "regex" 
version = "1.8.3" @@ -1177,7 +1013,7 @@ checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn", ] [[package]] @@ -1197,29 +1033,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" -[[package]] -name = "smallvec" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" - [[package]] name = "strsim" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - [[package]] name = "syn" version = "2.0.18" @@ -1246,12 +1065,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "target-lexicon" -version = "0.12.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d0e916b1148c8e263850e1ebcbd046f333e0683c724876bb0da63ea4373dc8a" - [[package]] name = "termcolor" version = "1.2.0" @@ -1261,14 +1074,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "test_pgenlib" -version = "0.1.0" -dependencies = [ - "bindgen", - "pgenlib_rust", -] - [[package]] name = "textwrap" version = "0.11.0" @@ -1300,24 +1105,12 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" -[[package]] -name = "unindent" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1766d682d402817b5ac4490b3c3002d91dfa0d22812f341609f97b08757359c" - [[package]] 
name = "utf8parse" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" -[[package]] -name = "version_check" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" - [[package]] name = "walkdir" version = "2.3.3" @@ -1355,7 +1148,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.18", + "syn", "wasm-bindgen-shared", ] @@ -1377,7 +1170,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] From de7d9ecf3233e350f307f5415c6643af49cec894 Mon Sep 17 00:00:00 2001 From: rickyota <22293266+rickyota@users.noreply.github.com> Date: Sun, 8 Oct 2023 13:52:07 +0900 Subject: [PATCH 6/6] add: README --- README.md | 73 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 16fd7c9..e4d89fe 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# GenoBoost v1.0.0 +# GenoBoost v1.0.2 [![GenoBoost](https://github.com/rickyota/genoboost/actions/workflows/genoboost.yml/badge.svg)](https://github.com/rickyota/genoboost/actions/workflows/genoboost.yml) [![Release](https://github.com/rickyota/genoboost/actions/workflows/publish.yml/badge.svg)](https://github.com/rickyota/genoboost/actions/workflows/publish.yml) @@ -13,7 +13,8 @@ $ genoboost train \ --dir ./result \ --file-genot ./example/genot \ --file-phe ./example/genot.cov \ - --cov age,sex + --cov age,sex \ + --major-a2-train ``` ## Table of Contents @@ -21,26 +22,26 @@ $ genoboost train \ - [Getting Started](#started) - [Introduction](#introduction) - [Users' Guide](#user-guide) - - [Installation](#install) - - [Plink1 Input](#install-plink1) - - [Plink2 
Input](#install-plink2) - - [Advaned install](#install-advanced) - - [Train GenoBoost Model](#train) - - [Simplest Usage](#train-simple) - - [Without Validation](#train-train-only) - - [Input Plink2](#train-plink2) - - [Cross-validation](#train-cv) - - [Options for Training](#train-option) - - [Calculate Sample Scores](#score) - - [Simplest Usage](#score-simple) - - [Without Validation](#score-train-only) - - [Input Plink2](#score-plink2) - - [Cross-validation](#score-cv) - - [Options for Score](#score-option) + - [Installation](#install) + - [Plink1 Input](#install-plink1) + - [Plink2 Input](#install-plink2) + - [Advanced install](#install-advanced) + - [Train GenoBoost Model](#train) + - [Simplest Usage](#train-simple) + - [Without Validation](#train-train-only) + - [Input Plink2](#train-plink2) + - [Cross-validation](#train-cv) + - [Options for Training](#train-option) + - [Calculate Sample Scores](#score) + - [Simplest Usage](#score-simple) + - [Without Validation](#score-train-only) + - [Input Plink2](#score-plink2) + - [Cross-validation](#score-cv) + - [Options for Score](#score-option) - [Advanced Guide](#advanced-guide) - - [Installation](#advanced-install) - - [Docker](#docker) - - [Singularity](#singularity) + - [Installation](#advanced-install) + - [Docker](#docker) + - [Singularity](#singularity) ## Introduction @@ -82,7 +83,6 @@ cp ./projects_rust/target/release/genoboost ./genoboost and you can use `genoboost` program. This should take less than 5 minutes. - #### Advanced Install See [Advanced Guide](#advanced-guide) for docker or singularity users. @@ -93,7 +93,6 @@ GenoBoost returns the SNV weights file with $s_0, s_1, s_2$ for each SNV in one - #### Simplest Usage You can run GenoBoost at least with plink1 genotype files and, in most cases, a covariates file. @@ -105,13 +104,15 @@ See `./example/` for reference of file format. For example, the covariates file With the minimum options, GenoBoost produces SNV weights list with the best parameter. 
SNV weights list is computed from randomly extracted training samples, and the best parameter is determined in the remaining validation samples. Write the column name to be used in covariates file after `--cov`. +It is important that the major allele is set to a2 by `--major-a2-train` since $s_2$ is winsorized. This option is unnecessary if the major allele is already set as the reference allele in the genotype file. ```bash $ genoboost train \ --dir ./result \ --file-genot ./example/genot \ --file-phe ./example/genot.cov \ - --cov age,sex + --cov age,sex \ + --major-a2-train ``` #### Without Validation @@ -124,6 +125,7 @@ $ genoboost train \ --file-genot ./example/genot \ --file-phe ./example/genot.cov \ --cov age,sex \ + --major-a2-train \ --train-only \ --iter-snv 10000 ``` @@ -143,7 +145,8 @@ $ genoboost train \ --genot-format plink2-vzs \ --file-phe ./example/genot2.phe \ --phe-name PHENO1 \ - --cov age,sex + --cov age,sex \ + --major-a2-train ``` #### Cross-validation @@ -156,6 +159,7 @@ $ genoboost train \ --file-genot ./example/genot \ --file-phe ./example/genot.cov \ --cov age,sex \ + --major-a2-train \ --cross-validation 5 \ --seed 51 ``` @@ -180,6 +184,8 @@ $ genoboost train \ `--file-snv [FILE]`: Snv file for training. One line for one SNV id. +`--major-a2-train`: Set major allele as a2 in training dataset. + `--iter-snv [NUMBER]`, `--iter [NUMBER]` : Maximum number of SNVs or iterations for training. `--learning-rates [NUMBERS]`: Learning rates in space-delimited format. Default value is `"0.5 0.2 0.1 0.05"`. @@ -226,7 +232,6 @@ $ genoboost score \ --iters "10 20 50" ``` - #### Input Plink2 Use `--genot-format`, `--file-phe` etc. for plink2 as shown in [training phase](#train-plink2). @@ -253,20 +258,19 @@ $ genoboost score \ --cross-validation 5 ``` - #### Options for Score `--dir ` : Directory to output score files. -`--dir-wgt [DIR]` : Same directory specified on training. 
-`--file-wgt [FILE]` : Use this specific SNV weight file. +`--file-wgt [FILE]` : Use this specific SNV weight file. `--file-genot `: Prefix of a plink1 or plink2 file (.bed, .fam, .bim or .pgen, .psam, .pvar/.pvar.zst should exist). `--genot-format [FORMAT]`: {`plink`, `plink2`, `plink2-vzs`}. Genotype format. Default is `plink`. -`--file-phe [FILE]`: Covariates file. +`--file-phe [FILE]`: Covariates file. `--cov [NAMES]`: Covariates names in comma-delimited format. ex. `age,sex,PC1-PC10`. @@ -286,10 +290,10 @@ $ genoboost score \ `--verbose`: Let GenoBoost speak more! - ## Advanced Guide ### Advanced Installation + Using docker or singularity is recommended. #### Docker @@ -301,7 +305,8 @@ $ docker run -it rickyota/genoboost:latest \ --dir ./result \ --file-genot ./example/genot \ --file-phe ./example/genot.cov \ - --cov age,sex + --cov age,sex \ + --major-a2-train ``` #### Singularity @@ -313,13 +318,13 @@ $ singularity run genoboost.sif \ --dir ./result \ --file-genot ./example/genot \ --file-phe ./example/genot.cov \ - --cov age,sex + --cov age,sex \ + --major-a2-train ``` ### Computational Time For ~216 thousands training samples and ~1.1 million SNVs for 10,000 unique SNVs, GenoBoost would take 10 hours. - [release]: https://github.com/rickyota/genoboost/releases [rust-install]: https://www.rust-lang.org/tools/install