Merge branch 'master' into dev

jlab · Nov 18, 2024 · 09383f4 · 09383f4
2 parents 671e61d + 66840ed
commit 09383f4
Show file tree

Hide file tree

Showing 7 changed files with 295 additions and 6 deletions.
diff --git a/.github/workflows/github_tests.yml b/.github/workflows/github_tests.yml
@@ -31,7 +31,7 @@ jobs:
 
     - name: Run tests with pytest
       run: |
-          $CONDA/bin/pytest tests --doctest-modules --cov=src/marbel --cov-report=xml
+          $CONDA/bin/pytest tests --ignore ./tests/test_functional.py --doctest-modules --cov=src/marbel --cov-report=xml
 
     - name: Convert coverage to lcov format
       run: |
@@ -42,3 +42,57 @@ jobs:
 #      with:
 #        github-token: ${{ secrets.GITHUB_TOKEN }}
 #        path-to-lcov: "coverage.lcov"
+  simulate-data:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v2
+      with:
+          lfs: true
+    - name: Checkout LFS objects
+      run: git lfs checkout
+    - name: Set up Python
+      uses: actions/setup-python@v2
+    - name: Create conda env
+      run: |
+        conda create -n marbel python=3.10 r-base
+    - name: Install the package
+      run: |
+        pip install -e .
+    - name: Simulate reads
+      run: |
+        marbel --n-species 10 --n-orthogroups 200 --n-samples 5 10 --library-size 10000 --outdir run_spec10_orth200_samp5-10/
+    - name: Archive simulation results
+      uses: actions/upload-artifact@v4
+      with:
+        name: simulated data
+        path: run_spec10_orth200_samp5-10
+
+
+  tests-functional:
+    needs: simulate-data
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -l {0}
+
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v2
+      with:
+          lfs: true
+    - name: Create conda test-env
+      run: |
+        conda create -n marbel-tests scikit-bio
+    - name: Download simulated data
+      uses: actions/download-artifact@v4
+      with:
+        name: simulated data
+        path: run_spec10_orth200_samp5-10/
+    - name: Execute functional tests
+      run: |
+        conda init
+        eval "$(conda shell.bash hook)"
+        conda activate marbel-tests
+        python tests/test_functional.py run_spec10_orth200_samp5-10/
diff --git a/src/marbel/data/orthologues_processed_combined_all.parquet b/src/marbel/data/orthologues_processed_combined_all.parquet
diff --git a/src/marbel/data_generations.py b/src/marbel/data_generations.py
@@ -550,6 +550,8 @@ def draw_dge_factors(dge_ratio, number_of_selected_genes):
     Returns:
         numpy.ndarray: The differentialy expressed factors
     """
+    dge_ratio = dge_ratio / 2
+
     if number_of_selected_genes == 0:
         return np.array([])
 

diff --git a/src/marbel/meta_tran_sim.py b/src/marbel/meta_tran_sim.py
@@ -76,6 +76,7 @@ def bar_next(bar):
     bar.next()
     print()
 
+
 @app.command()
 def main(n_species: Annotated[int, typer.Option(callback=species_callback,
                                                 help="Number of species to be drawn for the metatranscriptomic in silico dataset")] = 20,
@@ -94,7 +95,8 @@ def main(n_species: Annotated[int, typer.Option(callback=species_callback,
                                                           + "with a more diverse phylogenetic distance.")] = None,
          min_identity: Annotated[float, typer.Option(help="Minimum mean sequence identity score for an orthologous groups."
                                                           + "Specify for more ")] = None,
-         dge_ratio: Annotated[float, typer.Option(callback=dge_ratio_callback, help="Ratio of up and down regulated genes. Must be between 0 and 1")] = 0.1,
+         dge_ratio: Annotated[float, typer.Option(callback=dge_ratio_callback, help="Ratio of up and down regulated genes. Must be between 0 and 1."
+                                                  "This is a random drawing process from normal distribution, so the actual ratio might vary.")] = 0.1,
          seed: Annotated[int, typer.Option(help="Seed for the sampling. Set for reproducibility")] = None,
          error_model: Annotated[ErrorModel, typer.Option(help="Sequencer model for the reads, use basic or perfect (no errors) for custom read length")] = ErrorModel.HiSeq,
          compressed: Annotated[bool, typer.Option(help="Compress the output fastq files")] = True,
@@ -113,7 +115,6 @@ def main(n_species: Annotated[int, typer.Option(callback=species_callback,
     number_of_orthogous_groups = n_orthogroups
     number_of_species = n_species
     number_of_sample = n_samples
-    dge_ratio = dge_ratio / 2
     # maybe change to synthetic species later on, for now just use the available species
     # generate some plots so the user can see the distribution
 

diff --git a/tests/env_functional.yaml b/tests/env_functional.yaml
@@ -0,0 +1,8 @@
+name: marbel-tests
+channels:
+  - bioconda
+  - conda-forge
+  - defaults
+dependencies:
+  - scikit-bio
+  - pandas
diff --git a/tests/test_data_generations.py b/tests/test_data_generations.py
@@ -133,7 +133,7 @@ def test_draw_dge_factors_output_shape(dge_ratio, num_genes):
     assert result.shape[0] == num_genes
 
 
-@pytest.mark.parametrize("dge_ratio", [-1, -0.01, 0.5, 0.75, 1.0])
+@pytest.mark.parametrize("dge_ratio", [-1, -0.01, 1.0, 1.5, 2.0])
 def test_draw_dge_factors_ratio_outdie_threshold_raises_value_error(dge_ratio):
     with pytest.raises(ValueError):
         draw_dge_factors(dge_ratio, 100)

diff --git a/tests/test_functional.py b/tests/test_functional.py
@@ -0,0 +1,224 @@
+import inspect
+import sys
+import pandas as pd
+from os.path import join, basename
+import skbio
+import gzip
+
+
+def read_fastqGZ(fp_fastq):
+    def _push(sequences, header, sequence, qualities):
+        sequences.append({'header': header,
+                          'sequence': sequence,
+                          'qualities': qualities})
+    OFFSET = 33
+    sequences = []
+    with gzip.open(fp_fastq, 'rb') as f:
+        header = None
+        sequence = ""
+        qualities = []
+        for i, l in enumerate(f.readlines()):
+            line = l.decode("utf-8")
+            if (i % 4 == 0) and (line.startswith('@')):
+                if header is not None:
+                    _push(sequences, header, sequence, qualities)
+                header = line[1:].strip()
+            elif (i % 4 == 1):
+                sequence = line.strip()
+            elif (i % 4 == 3):
+                qualities = pd.Series(
+                    map(lambda x: ord(x) - OFFSET, line.strip()))
+        if header is not None:
+            _push(sequences, header, sequence, qualities)
+    return sequences
+
+
+def read_parameters(fp_basedir):
+    params = pd.read_csv(
+        join(fp_basedir, 'summary', 'marbel_params.txt'),
+        index_col=0, sep=": ", header=None,
+        names=['parameter', 'value'], engine='python')['value'].to_dict()
+
+    # fix data types
+    params['Number of species'] = int(params['Number of species'])
+    params['Number of orthogroups'] = int(params['Number of orthogroups'])
+    params['Number of samples'] = list(
+        map(int, params['Number of samples']
+            .replace('(', '')
+            .replace(')', '')
+            .replace(' ', '').split(',')))
+    params['Ratio of up and down regulated genes'] = float(
+        params['Ratio of up and down regulated genes'])
+
+    return params
+
+
+def test_gene_summary(genes, params):
+    counts = genes[[s for s in genes.columns if s.startswith('sample_')]]
+
+    assert len(genes.index) == len(set(genes.index)), \
+           "genes are ambiguous"
+
+    assert len(sorted(genes["origin_species"].unique())) == \
+           params['Number of species'], \
+           "number of species does not match"
+
+    genes['speciesID'] = list(map(lambda x: x.split('_')[0], genes.index))
+    assert genes.groupby(['speciesID', 'origin_species']).size().shape[0] == \
+           params['Number of species'], \
+           "gene_name prefix does not match origin_species"
+
+    genes['cdsID'] = list(map(lambda x: x.split('_')[-1], genes.index))
+    assert genes['orthogroup'].unique().shape[0] == \
+           params['Number of orthogroups'], \
+           "number orthogroups does not match"
+
+    assert genes.groupby('orthogroup').size().describe()['max'] > 1, \
+           "orthogroups consists of only 1 CDS"
+    data = (genes.groupby('orthogroup').size() > 1).value_counts()
+    assert data[True] > data[False] * 0.2, \
+           "surprisingly few orthogroups consists of more than one CDS"
+
+    data = (counts.stack() == 0).value_counts()
+    assert data[True] / data.sum() > 0.1, \
+           "surprisingly low sparseness in count matrix!"
+
+#    assert len(set(counts.sum())) == sum(params['Number of samples']), \
+#           "not all samples have different coverage"
+    assert counts.min().min() >= 0, \
+           "negative count values?!"
+
+    # foldchange <=0.5 (down) >=2 (up) (log2 foldchange<= -1 / >=1 )
+    data = genes['fold_change_ratio'].apply(
+        lambda x: 0.5 <= x <= 2).value_counts()
+    if params['Number of orthogroups'] >= 1000:
+        epsilon = (200 / params['Number of orthogroups'])
+    else:
+        epsilon = 0.4
+    # print(params['Ratio of up and down regulated genes'])
+    # print(data[False] / data.sum())
+    # print(data)
+    # print(params['Ratio of up and down regulated genes'] * (1 - epsilon) )
+    # print(params['Ratio of up and down regulated genes'] * (1 + epsilon) )
+    assert params['Ratio of up and down regulated genes'] * (1 - epsilon) <= \
+           (data[False] / data.sum()) <= \
+           params['Ratio of up and down regulated genes'] * (1 + epsilon), \
+           "ratio of UP (or Down) regulated genes does not match parameters"
+
+    print(len(set(list(map(lambda x: x.split('_')[1], genes.index)))),
+          "number CDS?!")
+    print("[OK] '%s' passed" % inspect.currentframe().f_code.co_name)
+
+
+def test_metaT_reference(fp_basedir, genes, params):
+    seqs = dict()
+    for seq in skbio.io.read(join(fp_basedir, 'summary',
+                             'metatranscriptome_reference.fasta'),
+                             format='fasta'):
+        seqs[seq.metadata['id']] = seq
+    assert len(set(seqs.keys()) & set(genes.index)) == genes.shape[0], \
+           "mismatch in gene identifiers between fasta file and summary table!"
+
+    genes['gene_length'] = list(map(lambda x: len(seqs[x]), genes.index))
+    seqLens = pd.Series([len(seq) for seq in seqs.values()], name='gene_length')
+    data = (seqLens >= 300).value_counts()
+    assert data[True] > data[False], \
+           "too few gene lengths are shorter than single Illumina " \
+           "reads of 300bp!"
+
+    # for most orthogroups with more than one member, I expect to have
+    # different gene sequences
+    data = (pd.Series([g['gene_length'].unique().shape[0]
+            for og, g in
+            genes.groupby('orthogroup')
+            if g.shape[0] > 1]) > 1).value_counts()
+    if False in data:
+        assert data[False] < data[True] * 0.1, \
+            "too few orthogroups with more than one member have identical" \
+            " length sequences"
+    print("[OK] '%s' passed" % inspect.currentframe().f_code.co_name)
+
+
+def test_tree(fp_basedir, genes):
+    tree = skbio.tree.TreeNode.read(
+        join(fp_basedir, 'summary', 'species_tree.newick'), format='newick')
+    og_counts = pd.pivot_table(
+        data=genes, index='origin_species', columns='orthogroup',
+        values='read_mean_count').applymap(lambda x: 1 if pd.notnull(x) else 0)
+
+    # TreeNode.read converts underscore to space?!
+    og_counts.index = list(map(lambda x: x.replace('_', ' '), og_counts.index))
+    og_faith = [skbio.diversity.alpha.faith_pd(
+                og_counts.iloc[:, i], otu_ids=og_counts.index, tree=tree,
+                validate=True)
+                for i in range(og_counts.shape[1])]
+    data = (pd.Series(og_faith) / tree.descending_branch_length()).describe()
+
+    assert tree.descending_branch_length() > 0, \
+           "species tree seems to have no branch length"
+    assert data['min'] > 0, \
+           "there seems to be an orthogroup with no member in any species?!"
+    assert data['max'] >= 0.99999, \
+           "I would expect at least one orthogroup to be present in all " \
+           "species, like a housekeeping gene"
+
+    print("[OK] '%s' passed" % inspect.currentframe().f_code.co_name)
+
+
+def test_fastq(fp_basedir, genes):
+    # check fastq files
+    fastqs = []
+    for group in [1, 2]:
+        for sample in range(1, params['Number of samples'][group - 1] + 1):
+            for direction in [1, 2]:
+                fastqs.append(
+                    join(fp_basedir, 'sample_%i_group%i_R%i.fastq.gz' % (
+                        sample, group, direction)))
+
+    phredscores = dict()
+    print('Loading sequencing data: ', end="", file=sys.stderr)
+    for fq in fastqs:
+        print('.', end="", file=sys.stderr)
+        seqs = skbio.io.read(fq, format='fastq', variant='illumina1.8')
+        quals = []
+        for i, seq in enumerate(seqs):
+            x = seq.positional_metadata['quality']
+            x.name = seq.metadata['id']
+            quals.append(x)
+            if i > 1000:
+                break
+        quals = pd.concat(quals, axis=1)
+        phredscores[basename(fq)] = quals
+    print(' done.', file=sys.stderr)
+
+    for fp, quals in phredscores.items():
+        data = quals.mean(axis=1)
+        assert data.iloc[10:20].mean() > data.iloc[-10:].mean(), \
+            "surprisingly, read starts have higher phred scores than ends for" \
+            " sample %s" % fp
+        if '_R1.' in fp:
+            dataR2 = phredscores[fp.replace('_R1.', '_R2.')].mean(axis=1)
+            assert data.mean() > dataR2.mean(), \
+                   "R2 phreds should be worse than R2 for sample %s" % fp
+
+    print("[OK] '%s' passed" % inspect.currentframe().f_code.co_name)
+
+
+if __name__ == "__main__":
+    fp_basedir = sys.argv[1]
+
+    # read used marbel parameters from file
+    params = read_parameters(fp_basedir)
+
+    # read gene summary information from file
+    genes = pd.read_csv(join(fp_basedir, 'summary', 'gene_summary.csv'),
+                        index_col=0)
+
+    # execute tests focussing on file gene_summary.csv
+    test_gene_summary(genes, params)
+
+    test_metaT_reference(fp_basedir, genes, params)
+
+    test_tree(fp_basedir, genes)
+
+    test_fastq(fp_basedir, genes)