From 8f96632dd7d95f9caaebb28b89b903cbdff1530d Mon Sep 17 00:00:00 2001
From: sktron <simo.kitanovski@uni-due.de>
Date: Sat, 23 Mar 2024 22:44:09 +0100
Subject: [PATCH] minor bugfixes

---
 DESCRIPTION               |   5 +-
 inst/stan/dgu.stan        |  28 +++++-----
 inst/stan/dgu_rep.stan    |  32 +++++------
 inst/stan/gu.stan         |  24 ++++----
 inst/stan/gu_rep.stan     |  26 ++++-----
 man/DGU.Rd                |   6 +-
 man/LOO.Rd                |   5 +-
 vignettes/User_Manual.Rmd | 114 +++++++++++---------------------------
 8 files changed, 94 insertions(+), 146 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 4562060..a508a3f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: IgGeneUsage
 Type: Package
 Title: Differential gene usage in immune repertoires
-Version: 1.17.14
+Version: 1.17.15
 Authors@R: 
     person(given = "Simo",  
            family = "Kitanovski",
@@ -33,8 +33,7 @@ Suggests:
     ggplot2,
     ggforce,
     gridExtra,
-    ggrepel,
-    MASS
+    ggrepel
 LinkingTo: 
     BH (>= 1.66.0),
     Rcpp (>= 0.12.0),
diff --git a/inst/stan/dgu.stan b/inst/stan/dgu.stan
index 7b589a5..efc97bf 100644
--- a/inst/stan/dgu.stan
+++ b/inst/stan/dgu.stan
@@ -33,16 +33,16 @@ data {
   int<lower=0> N_gene;                     // gene
   int<lower=0> N_individual;               // number of individuals
   int<lower=0> N_condition;                // number of conditions
-  array[N_individual] int N;                   // number of tries (repertoire size)
-  array[N_gene, N_individual] int Y;           // number of heads for each coin
-  array[N_individual] int condition_id;    // id of conditions
-  array[N_individual] int individual_id;       // id of replicate
+  array [N_individual] int N;                   // number of tries (repertoire size)
+  array [N_gene, N_individual] int Y;           // number of heads for each coin
+  array [N_individual] int condition_id;    // id of conditions
+  array [N_individual] int individual_id;       // id of replicate
 }
 
 transformed data {
   // convert int N -> real N fo convenient division
   // in generated quantities block
-  real Nr [N_individual];
+  array [N_individual] real Nr;
   Nr = N;
 }
 
@@ -55,14 +55,14 @@ parameters {
   vector <lower=0> [N_condition] sigma_condition;
   vector <lower=0> [N_condition] sigma_individual;
   
-  array[N_individual] vector [N_gene] z_beta_individual;
-  array[N_condition] vector [N_gene] z_beta_condition;
+  array [N_individual] vector [N_gene] z_beta_individual;
+  array [N_condition] vector [N_gene] z_beta_condition;
 }
 
 transformed parameters {
-  array[N_individual] vector <lower=0, upper=1> [N_gene] theta;
-  array[N_individual] vector [N_gene] beta_individual;
-  array[N_condition] vector [N_gene] beta_condition;
+  array [N_individual] vector <lower=0, upper=1> [N_gene] theta;
+  array [N_individual] vector [N_gene] beta_individual;
+  array [N_condition] vector [N_gene] beta_condition;
   
   for(i in 1:N_condition) {
     beta_condition[i] = 0 + sigma_condition[i] * z_beta_condition[i];
@@ -98,16 +98,16 @@ model {
 
 generated quantities {
   // PPC: count usage (repertoire-level)
-  int Yhat_rep [N_gene, N_individual];
+  array [N_gene, N_individual] int Yhat_rep;
   
   // PPC: proportion usage (repertoire-level)
-  real Yhat_rep_prop [N_gene, N_individual];
+  array [N_gene, N_individual] real Yhat_rep_prop;
   
   // PPC: proportion usage at a gene level in condition
-  vector [N_gene] Yhat_condition_prop [N_condition];
+  array [N_condition] vector [N_gene] Yhat_condition_prop;
   
   // LOG-LIK
-  vector [N_gene] log_lik [N_individual];
+  array [N_individual] vector [N_gene] log_lik;
   
   // DGU matrix
   matrix [N_gene, N_condition*(N_condition-1)/2] dgu;
diff --git a/inst/stan/dgu_rep.stan b/inst/stan/dgu_rep.stan
index dc2a72e..d49bf50 100644
--- a/inst/stan/dgu_rep.stan
+++ b/inst/stan/dgu_rep.stan
@@ -32,16 +32,16 @@ data {
   int<lower=0> N_gene;                     // gene
   int<lower=0> N_individual;               // number of individuals
   int<lower=0> N_condition;                // number of conditions
-  array[N_sample] int N;                   // number of tries (repertoire size)
-  array[N_gene, N_sample] int Y;           // number of heads for each coin
-  array[N_individual] int condition_id;    // id of conditions
-  array[N_sample] int individual_id;       // id of replicate
+  array [N_sample] int N;                   // number of tries (repertoire size)
+  array [N_gene, N_sample] int Y;           // number of heads for each coin
+  array [N_individual] int condition_id;    // id of conditions
+  array [N_sample] int individual_id;       // id of replicate
 }
 
 transformed data {
   // convert int N -> real N fo convenient division
   // in generated quantities block
-  real Nr [N_sample];
+  array [N_sample] real Nr;
   Nr = N;
 }
 
@@ -55,17 +55,17 @@ parameters {
   vector <lower=0> [N_condition] sigma_individual;
   real <lower=0> sigma_replicate;
   
-  array[N_sample] vector [N_gene] z_beta_sample;
-  array[N_individual] vector [N_gene] z_beta_individual;
-  array[N_condition] vector [N_gene] z_beta_condition;
+  array [N_sample] vector [N_gene] z_beta_sample;
+  array [N_individual] vector [N_gene] z_beta_individual;
+  array [N_condition] vector [N_gene] z_beta_condition;
 }
 
 
 transformed parameters {
-  array[N_sample] vector <lower=0, upper=1> [N_gene] theta;
-  array[N_sample] vector [N_gene] beta_sample;
-  array[N_condition] vector [N_gene] beta_condition;
-  array[N_individual] vector [N_gene] beta_individual;
+  array [N_sample] vector <lower=0, upper=1> [N_gene] theta;
+  array [N_sample] vector [N_gene] beta_sample;
+  array [N_condition] vector [N_gene] beta_condition;
+  array [N_individual] vector [N_gene] beta_individual;
   
   for(i in 1:N_condition) {
     beta_condition[i] = 0 + sigma_condition[i] * z_beta_condition[i];
@@ -109,16 +109,16 @@ model {
 
 generated quantities {
   // PPC: count usage (repertoire-level)
-  int Yhat_rep [N_gene, N_sample];
+  array [N_gene, N_sample] int Yhat_rep;
 
   // PPC: proportion usage (repertoire-level)
-  real Yhat_rep_prop [N_gene, N_sample];
+  array [N_gene, N_sample] real Yhat_rep_prop;
 
   // PPC: proportion usage at a gene level in condition
-  vector [N_gene] Yhat_condition_prop [N_condition];
+  array [N_condition] vector [N_gene] Yhat_condition_prop;
 
   // LOG-LIK
-  vector [N_gene] log_lik [N_sample];
+  array [N_sample] vector [N_gene] log_lik;
 
   // DGU matrix
   matrix [N_gene, N_condition*(N_condition-1)/2] dgu;
diff --git a/inst/stan/gu.stan b/inst/stan/gu.stan
index f409ca7..dd08124 100644
--- a/inst/stan/gu.stan
+++ b/inst/stan/gu.stan
@@ -33,16 +33,16 @@ data {
   int<lower=0> N_gene;                     // gene
   int<lower=0> N_individual;               // number of individuals
   int<lower=0> N_condition;                // number of conditions
-  array[N_individual] int N;               // number of tries (repertoire size)
-  array[N_gene, N_individual] int Y;       // number of heads for each coin
-  array[N_individual] int condition_id;    // id of conditions
-  array[N_individual] int individual_id;   // id of replicate
+  array [N_individual] int N;               // number of tries (repertoire size)
+  array [N_gene, N_individual] int Y;       // number of heads for each coin
+  array [N_individual] int condition_id;    // id of conditions
+  array [N_individual] int individual_id;   // id of replicate
 }
 
 transformed data {
   // convert int N -> real N fo convenient division
   // in generated quantities block
-  real Nr [N_individual];
+  array [N_individual] real Nr;
   Nr = N;
 }
 
@@ -50,14 +50,14 @@ parameters {
   real <lower=0> phi;
   real <lower=0, upper=1> kappa;
   real <lower=0> sigma_individual;
-  array[N_individual] vector [N_gene] z_beta_individual;
+  array [N_individual] vector [N_gene] z_beta_individual;
   vector [N_gene] beta_condition;
 }
 
 transformed parameters {
-  array[N_individual] vector <lower=0, upper=1> [N_gene] theta;
-  array[N_individual] vector [N_gene] beta_sample;
-  array[N_individual] vector [N_gene] beta_individual;
+  array [N_individual] vector <lower=0, upper=1> [N_gene] theta;
+  array [N_individual] vector [N_gene] beta_sample;
+  array [N_individual] vector [N_gene] beta_individual;
   
   for(i in 1:N_individual) {
     beta_individual[i] = beta_condition + sigma_individual * z_beta_individual[i];
@@ -85,16 +85,16 @@ model {
 
 generated quantities {
   // PPC: count usage (repertoire-level)
-  int Yhat_rep [N_gene, N_individual];
+  array [N_gene, N_individual] int Yhat_rep;
   
   // PPC: proportion usage (repertoire-level)
-  real Yhat_rep_prop [N_gene, N_individual];
+  array [N_gene, N_individual] real Yhat_rep_prop;
   
   // PPC: proportion usage at a gene level in condition
   vector [N_gene] Yhat_condition_prop;
   
   // LOG-LIK
-  vector [N_gene] log_lik [N_individual];
+  array [N_individual] vector [N_gene] log_lik;
 
   //TODO: speedup, run in C++ not big factor on performance
   for(j in 1:N_gene) {
diff --git a/inst/stan/gu_rep.stan b/inst/stan/gu_rep.stan
index 4dca227..b99d5dc 100644
--- a/inst/stan/gu_rep.stan
+++ b/inst/stan/gu_rep.stan
@@ -32,16 +32,16 @@ data {
   int<lower=0> N_gene;                     // gene
   int<lower=0> N_individual;               // number of individuals
   int<lower=0> N_condition;                // number of conditions
-  array[N_sample] int N;                   // number of tries (repertoire size)
-  array[N_gene, N_sample] int Y;           // number of heads for each coin
-  array[N_individual] int condition_id;    // id of conditions
-  array[N_sample] int individual_id;       // id of replicate
+  array [N_sample] int N;                   // number of tries (repertoire size)
+  array [N_gene, N_sample] int Y;           // number of heads for each coin
+  array [N_individual] int condition_id;    // id of conditions
+  array [N_sample] int individual_id;       // id of replicate
 }
 
 transformed data {
   // convert int N -> real N fo convenient division
   // in generated quantities block
-  real Nr [N_sample];
+  array [N_sample] real Nr;
   Nr = N;
 }
 
@@ -52,17 +52,17 @@ parameters {
   real <lower=0> sigma_individual;
   real <lower=0> sigma_replicate;
   
-  array[N_sample] vector [N_gene] z_beta_sample;
-  array[N_individual] vector [N_gene] z_beta_individual;
+  array [N_sample] vector [N_gene] z_beta_sample;
+  array [N_individual] vector [N_gene] z_beta_individual;
   
   vector [N_gene] beta_condition;
 }
 
 
 transformed parameters {
-  array[N_sample] vector <lower=0, upper=1> [N_gene] theta;
-  array[N_sample] vector [N_gene] beta_sample;
-  array[N_individual] vector [N_gene] beta_individual;
+  array [N_sample] vector <lower=0, upper=1> [N_gene] theta;
+  array [N_sample] vector [N_gene] beta_sample;
+  array [N_individual] vector [N_gene] beta_individual;
   
   for(i in 1:N_individual) {
     beta_individual[i] = beta_condition + sigma_individual * z_beta_individual[i];
@@ -98,16 +98,16 @@ model {
 
 generated quantities {
   // PPC: count usage (repertoire-level)
-  int Yhat_rep [N_gene, N_sample];
+  array [N_gene, N_sample] int Yhat_rep;
   
   // PPC: proportion usage (repertoire-level)
-  real Yhat_rep_prop [N_gene, N_sample];
+  array [N_gene, N_sample] real Yhat_rep_prop;
   
   // PPC: proportion usage at a gene level in condition
   vector [N_gene] Yhat_condition_prop;
   
   // LOG-LIK
-  vector [N_gene] log_lik [N_sample];
+  array [N_sample] vector [N_gene] log_lik;
 
   //TODO: speedup, run in C++ not big factor on performance
   for(j in 1:N_gene) {
diff --git a/man/DGU.Rd b/man/DGU.Rd
index 883e7bb..e035aa5 100644
--- a/man/DGU.Rd
+++ b/man/DGU.Rd
@@ -22,15 +22,15 @@ DGU(ud,
 }
 
 \arguments{
-\item{ud}{Data.frame with 4 columns: 
+\item{ud}{Data.frame with 4 or 5 columns: 
 \itemize{
 \item 'individual_id' = character, name of the donor (e.g. Pt1)
 \item 'condition' = character, name of biological conditions (e.g. tumor)
 \item 'gene_name' = character, Ig gene name (e.g. IGHV1-69)
 \item 'gene_usage_count' = number, frequency (=usage) of rearrangements from 
        individual_id x condition x gene_name
-\item [optional] 'repertoire' = character or number, if more than one 
-      repertoire (biological replicates) is available per individual 
+\item [optional] 'replicate' = character or number. Replicate id, if more than 
+      one repertoire (biological replicates) is available per individual 
 }
 ud can also be be a SummarizedExperiment object. See examplary data 
 'data(Ig_SE)' for more information.
diff --git a/man/LOO.Rd b/man/LOO.Rd
index e4b0c1d..f4823b0 100644
--- a/man/LOO.Rd
+++ b/man/LOO.Rd
@@ -50,9 +50,8 @@ LOO(ud,
 \item 'gene_name' = character, Ig gene name (e.g. IGHV1-69)
 \item 'gene_usage_count' = number, frequency (=usage) of rearrangements from 
        individual_id x condition x gene_name
-
-\item [optional] 'repertoire' = character or number, if more than one 
-      repertoire (biological replicates) is available per individual 
+\item [optional] 'replicate' = character or number. Replicate id, if more than 
+      one repertoire (biological replicates) is available per individual 
 }
 ud can also be be a SummarizedExperiment object. See dataset
 'data(Ig_SE)' for more information.
diff --git a/vignettes/User_Manual.Rmd b/vignettes/User_Manual.Rmd
index a75ee9d..39261b7 100644
--- a/vignettes/User_Manual.Rmd
+++ b/vignettes/User_Manual.Rmd
@@ -28,92 +28,51 @@ require(ggforce)
 require(gridExtra)
 require(ggrepel)
 require(reshape2)
-require(MASS)
 ```
 
 
 
 # Introduction
-Decoding the properties of immune repertoires is key in understanding the 
-response of adaptive immunity to challenges such as viral infection. One 
-important property is biases in immunoglobulin (Ig) gene usage between 
-biological conditions (e.g. healthy vs tumor). Yet, most analyses for 
-differential gene usage (DGU) are performed qualitatively, or with 
-inadequate statistical methods. Here we introduce `r Biocpkg("IgGeneUsage")`,
-a computational tool for DGU analysis.
+Decoding the properties of immune receptor repertoires (IRRs) is key to 
+understanding how our adaptive immune system responds to challenges, such 
+as viral infection or cancer. One important quantitative property of IRRs 
+is their immunoglobulin (Ig) gene usage, i.e. how often are the differnt 
+Igs that make up the immune receptors used in a given IRR. Furthermore, we
+may ask: is there differential gene usage (DGU) between IRRs from different
+biological conditions (e.g. healthy vs tumor). 
+
+Both of these questions can be answered quantitatively by are answered by 
+`r Biocpkg("IgGeneUsage")`.
 
 
 # Input
 The main input of `r Biocpkg("IgGeneUsage")` is a data.frame that has the 
 following columns:
 
-  1. individual_id: name of the repertoire (e.g. Patient-1)
-  2. condition: name of the condition to which each repertoire 
+  1. **individual_id**: name of the repertoire (e.g. Patient-1)
+  2. **condition**: name of the condition to which each repertoire 
   belongs (healthy, tumor_A, tumor_B, ...)
-  3. gene_name: gene name (e.g. IGHV1-10 or family TRVB1)
-  4. gene_usage_count: numeric (count) of usage related in individual x gene x 
-     condition specified in columns 1-3
-  5. [optional] replicates: character/numeric if multiple biological replicates
-  are available for each individual
-
-The sum of all gene usage counts (column 4) for a given repertoire is equal to
-the repertoire size (number of cells in the repertoire).
-
+  3. **gene_name**: gene name (e.g. IGHV1-10 or family TRVB1)
+  4. **gene_usage_count**: numeric (count) of usage related in individual x 
+     gene x condition specified in columns 1-3
+  5. [optional] **repertoire**: character/numeric identifier that tags the
+     different biological replicates if they are available for a specific
+     individual
 
 # Model
-`r Biocpkg("IgGeneUsage")` transforms the provided input in the following 
-way. Given $R$ repertoires, each having $G$ genes, `r Biocpkg("IgGeneUsage")` 
+`r Biocpkg("IgGeneUsage")` transforms the input data in the following 
+way. 
+
+First, given $R$ repertoires, each having $G$ genes, `r Biocpkg("IgGeneUsage")` 
 generates a gene usage matrix $Y^{R \times G}$. Row sums in $Y$ define the 
 total usage in each repertoire ($N$). 
 
-For the analysis of DGU between biological conditions, we designed the 
-following Bayesian model ($M$) for zero-inflated beta-binomial regression. 
-This model can fit over-dispersed gene usage data. The immune repertoire data 
-is also not exhaustive, i.e. some Ig genes that are systematically rearranged 
-at low probability might not be sampled, and certain Ig genes are not encoded 
-(or dysfunctional) in some individuals. The zero-inflated component of our 
-model accounts for this.
-
-\begin{align}
-p(Y_{ijk} \mid M) = \begin{cases} 
-\kappa_{j} + (1 - \kappa) \operatorname{BB}\left(0 \mid N_{ijk}, \theta_{ijk}, 
-\phi \right), & \text{if $Y_{ijk}$ = 0} \\
-(1 - \kappa_{j}) \operatorname{BB}\left(Y_{ijk} \mid N_{ijk}, \theta_{ijk}, 
-\phi \right), & \text{if $Y_{ijk}$ > 0}
-\end{cases}\\
-\theta_{ij}=\operatorname{logit^{-1}}\left(\alpha_{j}+\beta_{ijk}\right)\\
-\beta_{ijk}\sim\operatorname{Normal}\left(\gamma_{jk},\sigma_{k}\right)\\
-\gamma_{jk}\sim\operatorname{Normal}\left(0,\tau_{k}\right)\\
-\alpha_{j}\sim\operatorname{Normal}\left(\mu_{\alpha},\sigma_{\alpha}\right)\\
-\mu_{\alpha} \sim \operatorname{Normal}\left(0, 5\right)\\
-\sigma_{k}, \tau_{k}, \sigma_{\alpha} \sim 
-\operatorname{Cauchy^{+}}\left(0, 1\right) \\
-\phi \sim \operatorname{Exponential}\left(\tau\right) \\
-\tau \sim \operatorname{Gamma}\left(3, 0.1\right) \\
-\kappa_{j} \sim \operatorname{Beta}\left({0.1, 1.0\right)
-\end{align}
-
-Model $M$ legend:
-
-  * $i$ and $j$: index of different repertoires and genes, respectively 
-  * $\kappa$: zero-inflation probability
-  * $\theta$: probability of gene usage
-  * $\phi$: dispersion
-  * $\alpha$: intercept/baseline gene usage
-  * $\beta$: slope/within-repertoire DGU coefficient
-  * $\gamma$, $\gamma_{\sigma}$: slope/gene-specific DGU coefficient; 
-  standard deviation
-  * $\hat{\gamma}$, $\hat{\gamma}_{\sigma}$: mean and standard deviation of 
-  the population of gene-specific DGU coefficients
-  * $\hat{\alpha}$, $\hat{\alpha}_{\sigma}$: mean and standard deviation of 
-  the population of gene-specific baseline usages
-  * $\operatorname{BB}$: beta-binomial probability mass function (pmf)
-  * $\operatorname{Normal}$: normal probability density function (pdf)
-  * $\operatorname{Cauchy^{+}}$: half-Cauchy pdf
-  * $\operatorname{Exponential}$: exponential pdf
-  * $\operatorname{Gamma}$: gamma pdf
-  * $\operatorname{Beta}$: beta pdf
-  * $\operatorname{logit^{-1}}$: inverse logistic function
+Second, for the analysis of DGU between biological conditions, we use a 
+Bayesian model ($M$) for zero-inflated beta-binomial regression. Empirically,
+we know that Ig gene usage data can be noisy also not exhaustive, i.e. some 
+Ig genes that are systematically rearranged at low probability might not be 
+sampled, and certain Ig genes are not encoded (or dysfunctional) in some 
+individuals. $M$ can fit over-dispersed and zero-inflated Ig gene usage data.
 
 In the output of `r Biocpkg("IgGeneUsage")`, we report the mean effect 
 size (es or $\gamma$) and its 95% highest density interval (HDI). Genes with 
@@ -480,22 +439,13 @@ ggplot(data = L_gu)+
   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))
 ```
 
-# Multidimensional scaling (MDS) analaysis based 
+# Hierarchical clustering analaysis 
 
-```{r, fig.width=5, fig.height=4}
+```{r, fig.width=6, fig.height=4}
 # x <- M$theta
-x <- acast(individual_id~gene_name, 
-           data = M$theta, value.var = "theta_mean")
-
-nmds <- isoMDS(dist(x), k=2)
-
-x <- data.frame(x = nmds$points[,1], y = nmds$points[,2], id = row.names(x))
+x <- acast(individual_id~gene_name, data = M$theta, value.var = "theta_mean")
 
-ggplot(data = x)+
-  geom_point(aes(x = x, y = y), shape = 21, size = 2)+
-  geom_text_repel(aes(x = x, y = y, label = id), 
-                  size = 3.5, min.segment.length = 0)+
-  theme_bw()
+plot(hclust(dist(x, method = "euclidean"), method = "average"))
 ```