Skip to content

Commit

Permalink
Merge branch 'master' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
tensulin committed Nov 18, 2024
2 parents 671e61d + 66840ed commit 09383f4
Show file tree
Hide file tree
Showing 7 changed files with 295 additions and 6 deletions.
56 changes: 55 additions & 1 deletion .github/workflows/github_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
- name: Run tests with pytest
run: |
$CONDA/bin/pytest tests --doctest-modules --cov=src/marbel --cov-report=xml
$CONDA/bin/pytest tests --ignore ./tests/test_functional.py --doctest-modules --cov=src/marbel --cov-report=xml
- name: Convert coverage to lcov format
run: |
Expand All @@ -42,3 +42,57 @@ jobs:
# with:
# github-token: ${{ secrets.GITHUB_TOKEN }}
# path-to-lcov: "coverage.lcov"
simulate-data:
runs-on: ubuntu-latest

steps:
- name: Checkout Repo
uses: actions/checkout@v2
with:
lfs: true
- name: Checkout LFS objects
run: git lfs checkout
- name: Set up Python
uses: actions/setup-python@v2
- name: Create conda env
run: |
conda create -n marbel python=3.10 r-base
- name: Install the package
run: |
pip install -e .
- name: Simulate reads
run: |
marbel --n-species 10 --n-orthogroups 200 --n-samples 5 10 --library-size 10000 --outdir run_spec10_orth200_samp5-10/
- name: Archive simulation results
uses: actions/upload-artifact@v4
with:
name: simulated data
path: run_spec10_orth200_samp5-10


tests-functional:
needs: simulate-data
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}

steps:
- name: Checkout Repo
uses: actions/checkout@v2
with:
lfs: true
- name: Create conda test-env
run: |
conda create -n marbel-tests scikit-bio
- name: Download simulated data
uses: actions/download-artifact@v4
with:
name: simulated data
path: run_spec10_orth200_samp5-10/
- name: Execute functional tests
run: |
conda init
eval "$(conda shell.bash hook)"
conda activate marbel-tests
python tests/test_functional.py run_spec10_orth200_samp5-10/
4 changes: 2 additions & 2 deletions src/marbel/data/orthologues_processed_combined_all.parquet
Git LFS file not shown
2 changes: 2 additions & 0 deletions src/marbel/data_generations.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,8 @@ def draw_dge_factors(dge_ratio, number_of_selected_genes):
Returns:
numpy.ndarray: The differentialy expressed factors
"""
dge_ratio = dge_ratio / 2

if number_of_selected_genes == 0:
return np.array([])

Expand Down
5 changes: 3 additions & 2 deletions src/marbel/meta_tran_sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def bar_next(bar):
bar.next()
print()


@app.command()
def main(n_species: Annotated[int, typer.Option(callback=species_callback,
help="Number of species to be drawn for the metatranscriptomic in silico dataset")] = 20,
Expand All @@ -94,7 +95,8 @@ def main(n_species: Annotated[int, typer.Option(callback=species_callback,
+ "with a more diverse phylogenetic distance.")] = None,
min_identity: Annotated[float, typer.Option(help="Minimum mean sequence identity score for an orthologous groups."
+ "Specify for more ")] = None,
dge_ratio: Annotated[float, typer.Option(callback=dge_ratio_callback, help="Ratio of up and down regulated genes. Must be between 0 and 1")] = 0.1,
dge_ratio: Annotated[float, typer.Option(callback=dge_ratio_callback, help="Ratio of up and down regulated genes. Must be between 0 and 1."
"This is a random drawing process from normal distribution, so the actual ratio might vary.")] = 0.1,
seed: Annotated[int, typer.Option(help="Seed for the sampling. Set for reproducibility")] = None,
error_model: Annotated[ErrorModel, typer.Option(help="Sequencer model for the reads, use basic or perfect (no errors) for custom read length")] = ErrorModel.HiSeq,
compressed: Annotated[bool, typer.Option(help="Compress the output fastq files")] = True,
Expand All @@ -113,7 +115,6 @@ def main(n_species: Annotated[int, typer.Option(callback=species_callback,
number_of_orthogous_groups = n_orthogroups
number_of_species = n_species
number_of_sample = n_samples
dge_ratio = dge_ratio / 2
# maybe change to synthetic species later on, for now just use the available species
# generate some plots so the user can see the distribution

Expand Down
8 changes: 8 additions & 0 deletions tests/env_functional.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
name: marbel-tests
channels:
- bioconda
- conda-forge
- defaults
dependencies:
- scikit-bio
- pandas
2 changes: 1 addition & 1 deletion tests/test_data_generations.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def test_draw_dge_factors_output_shape(dge_ratio, num_genes):
assert result.shape[0] == num_genes


@pytest.mark.parametrize("dge_ratio", [-1, -0.01, 0.5, 0.75, 1.0])
@pytest.mark.parametrize("dge_ratio", [-1, -0.01, 1.0, 1.5, 2.0])
def test_draw_dge_factors_ratio_outdie_threshold_raises_value_error(dge_ratio):
with pytest.raises(ValueError):
draw_dge_factors(dge_ratio, 100)
Expand Down
224 changes: 224 additions & 0 deletions tests/test_functional.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
import inspect
import sys
import pandas as pd
from os.path import join, basename
import skbio
import gzip


def read_fastqGZ(fp_fastq):
def _push(sequences, header, sequence, qualities):
sequences.append({'header': header,
'sequence': sequence,
'qualities': qualities})
OFFSET = 33
sequences = []
with gzip.open(fp_fastq, 'rb') as f:
header = None
sequence = ""
qualities = []
for i, l in enumerate(f.readlines()):
line = l.decode("utf-8")
if (i % 4 == 0) and (line.startswith('@')):
if header is not None:
_push(sequences, header, sequence, qualities)
header = line[1:].strip()
elif (i % 4 == 1):
sequence = line.strip()
elif (i % 4 == 3):
qualities = pd.Series(
map(lambda x: ord(x) - OFFSET, line.strip()))
if header is not None:
_push(sequences, header, sequence, qualities)
return sequences


def read_parameters(fp_basedir):
params = pd.read_csv(
join(fp_basedir, 'summary', 'marbel_params.txt'),
index_col=0, sep=": ", header=None,
names=['parameter', 'value'], engine='python')['value'].to_dict()

# fix data types
params['Number of species'] = int(params['Number of species'])
params['Number of orthogroups'] = int(params['Number of orthogroups'])
params['Number of samples'] = list(
map(int, params['Number of samples']
.replace('(', '')
.replace(')', '')
.replace(' ', '').split(',')))
params['Ratio of up and down regulated genes'] = float(
params['Ratio of up and down regulated genes'])

return params


def test_gene_summary(genes, params):
counts = genes[[s for s in genes.columns if s.startswith('sample_')]]

assert len(genes.index) == len(set(genes.index)), \
"genes are ambiguous"

assert len(sorted(genes["origin_species"].unique())) == \
params['Number of species'], \
"number of species does not match"

genes['speciesID'] = list(map(lambda x: x.split('_')[0], genes.index))
assert genes.groupby(['speciesID', 'origin_species']).size().shape[0] == \
params['Number of species'], \
"gene_name prefix does not match origin_species"

genes['cdsID'] = list(map(lambda x: x.split('_')[-1], genes.index))
assert genes['orthogroup'].unique().shape[0] == \
params['Number of orthogroups'], \
"number orthogroups does not match"

assert genes.groupby('orthogroup').size().describe()['max'] > 1, \
"orthogroups consists of only 1 CDS"
data = (genes.groupby('orthogroup').size() > 1).value_counts()
assert data[True] > data[False] * 0.2, \
"surprisingly few orthogroups consists of more than one CDS"

data = (counts.stack() == 0).value_counts()
assert data[True] / data.sum() > 0.1, \
"surprisingly low sparseness in count matrix!"

# assert len(set(counts.sum())) == sum(params['Number of samples']), \
# "not all samples have different coverage"
assert counts.min().min() >= 0, \
"negative count values?!"

# foldchange <=0.5 (down) >=2 (up) (log2 foldchange<= -1 / >=1 )
data = genes['fold_change_ratio'].apply(
lambda x: 0.5 <= x <= 2).value_counts()
if params['Number of orthogroups'] >= 1000:
epsilon = (200 / params['Number of orthogroups'])
else:
epsilon = 0.4
# print(params['Ratio of up and down regulated genes'])
# print(data[False] / data.sum())
# print(data)
# print(params['Ratio of up and down regulated genes'] * (1 - epsilon) )
# print(params['Ratio of up and down regulated genes'] * (1 + epsilon) )
assert params['Ratio of up and down regulated genes'] * (1 - epsilon) <= \
(data[False] / data.sum()) <= \
params['Ratio of up and down regulated genes'] * (1 + epsilon), \
"ratio of UP (or Down) regulated genes does not match parameters"

print(len(set(list(map(lambda x: x.split('_')[1], genes.index)))),
"number CDS?!")
print("[OK] '%s' passed" % inspect.currentframe().f_code.co_name)


def test_metaT_reference(fp_basedir, genes, params):
seqs = dict()
for seq in skbio.io.read(join(fp_basedir, 'summary',
'metatranscriptome_reference.fasta'),
format='fasta'):
seqs[seq.metadata['id']] = seq
assert len(set(seqs.keys()) & set(genes.index)) == genes.shape[0], \
"mismatch in gene identifiers between fasta file and summary table!"

genes['gene_length'] = list(map(lambda x: len(seqs[x]), genes.index))
seqLens = pd.Series([len(seq) for seq in seqs.values()], name='gene_length')
data = (seqLens >= 300).value_counts()
assert data[True] > data[False], \
"too few gene lengths are shorter than single Illumina " \
"reads of 300bp!"

# for most orthogroups with more than one member, I expect to have
# different gene sequences
data = (pd.Series([g['gene_length'].unique().shape[0]
for og, g in
genes.groupby('orthogroup')
if g.shape[0] > 1]) > 1).value_counts()
if False in data:
assert data[False] < data[True] * 0.1, \
"too few orthogroups with more than one member have identical" \
" length sequences"
print("[OK] '%s' passed" % inspect.currentframe().f_code.co_name)


def test_tree(fp_basedir, genes):
tree = skbio.tree.TreeNode.read(
join(fp_basedir, 'summary', 'species_tree.newick'), format='newick')
og_counts = pd.pivot_table(
data=genes, index='origin_species', columns='orthogroup',
values='read_mean_count').applymap(lambda x: 1 if pd.notnull(x) else 0)

# TreeNode.read converts underscore to space?!
og_counts.index = list(map(lambda x: x.replace('_', ' '), og_counts.index))
og_faith = [skbio.diversity.alpha.faith_pd(
og_counts.iloc[:, i], otu_ids=og_counts.index, tree=tree,
validate=True)
for i in range(og_counts.shape[1])]
data = (pd.Series(og_faith) / tree.descending_branch_length()).describe()

assert tree.descending_branch_length() > 0, \
"species tree seems to have no branch length"
assert data['min'] > 0, \
"there seems to be an orthogroup with no member in any species?!"
assert data['max'] >= 0.99999, \
"I would expect at least one orthogroup to be present in all " \
"species, like a housekeeping gene"

print("[OK] '%s' passed" % inspect.currentframe().f_code.co_name)


def test_fastq(fp_basedir, genes):
# check fastq files
fastqs = []
for group in [1, 2]:
for sample in range(1, params['Number of samples'][group - 1] + 1):
for direction in [1, 2]:
fastqs.append(
join(fp_basedir, 'sample_%i_group%i_R%i.fastq.gz' % (
sample, group, direction)))

phredscores = dict()
print('Loading sequencing data: ', end="", file=sys.stderr)
for fq in fastqs:
print('.', end="", file=sys.stderr)
seqs = skbio.io.read(fq, format='fastq', variant='illumina1.8')
quals = []
for i, seq in enumerate(seqs):
x = seq.positional_metadata['quality']
x.name = seq.metadata['id']
quals.append(x)
if i > 1000:
break
quals = pd.concat(quals, axis=1)
phredscores[basename(fq)] = quals
print(' done.', file=sys.stderr)

for fp, quals in phredscores.items():
data = quals.mean(axis=1)
assert data.iloc[10:20].mean() > data.iloc[-10:].mean(), \
"surprisingly, read starts have higher phred scores than ends for" \
" sample %s" % fp
if '_R1.' in fp:
dataR2 = phredscores[fp.replace('_R1.', '_R2.')].mean(axis=1)
assert data.mean() > dataR2.mean(), \
"R2 phreds should be worse than R2 for sample %s" % fp

print("[OK] '%s' passed" % inspect.currentframe().f_code.co_name)


if __name__ == "__main__":
fp_basedir = sys.argv[1]

# read used marbel parameters from file
params = read_parameters(fp_basedir)

# read gene summary information from file
genes = pd.read_csv(join(fp_basedir, 'summary', 'gene_summary.csv'),
index_col=0)

# execute tests focussing on file gene_summary.csv
test_gene_summary(genes, params)

test_metaT_reference(fp_basedir, genes, params)

test_tree(fp_basedir, genes)

test_fastq(fp_basedir, genes)

0 comments on commit 09383f4

Please sign in to comment.