diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 48817547..d38244d4 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,4 +1,3 @@ - ## Getting Started If you are new to the project a good way to get started is by adding to the documentation, or adding unit tests where @@ -47,7 +46,6 @@ mkdocs build The contents of the user manual can then be viewed by opening the build-docs/index.html in any available web browser (i.e. google-chrome, firefox, etc.) - ## Deploy to PyPi Install deployment dependencies @@ -68,13 +66,11 @@ Use twine to upload twine upload -r pypi dist/* ``` - ## Reporting a Bug Please make sure to search through the issues before reporting a bug to ensure there isn't already an open issue. - ## Conventions ### Linting @@ -82,7 +78,7 @@ already an open issue. Use [black](https://github.com/psf/black) with strings off and line length 100 ```bash -black mavis -S -l 100 +black src/mavis -S -l 100 ``` ### Docstrings @@ -112,7 +108,6 @@ def some_function(some_arg: List[str]) -> None: any column name which may appear in any of the intermediate or final output files must be defined in `mavis.constants.COLUMNS` as well as added to the [columns glossary](../outputs/columns) - ### Tests - all new code must have unit tests in the tests subdirectory diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3421fd4b..83ccec48 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -3,15 +3,20 @@ name: build -on: [push, pull_request] +on: + push: + branches: + - master + - develop + pull_request: jobs: build: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8] - + python-version: ["3.7", "3.8", "3.9", "3.10"] + name: python-${{ matrix.python-version }} steps: - uses: actions/checkout@v2 - name: install machine dependencies @@ -22,18 +27,8 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip 
"setuptools<58" wheel - pip install .[test] - - name: Lint with flake8 - run: | - pip install flake8 - # stop the build if there are Python syntax errors or undefined names - flake8 mavis --count --select=E9,F63,F7,F82 --show-source --statistics - - name: Lint with black - run: | - pip install black - # stop the build if black needs to be run - black mavis -S -l 100 --check + python -m pip install --upgrade pip setuptools + pip install -e .[test] # need editable to make sure the coverage reports correctly - name: install bwa run: | git clone https://github.com/lh3/bwa.git @@ -45,20 +40,24 @@ jobs: run: | wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat chmod a+x blat - - name: run short tests with pytest + - name: set up .pth file run: | - export PATH=$PATH:$(pwd):$(pwd)/bwa - pytest tests -v --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov mavis --cov-report term --cov-report xml --durations=10 - env: - RUN_FULL: 0 - if: github.event_name != 'pull_request' + python tests/setup_subprocess_cov.py - name: run full tests with pytest run: | export PATH=$PATH:$(pwd):$(pwd)/bwa - pytest tests -v --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov mavis --cov-report term --cov-report xml --durations=10 + export COVERAGE_PROCESS_START=$(pwd)/.coveragerc + + pytest tests -v \ + --junitxml=junit/test-results-${{ matrix.python-version }}.xml \ + --cov mavis \ + --cov tools.convert_annotations_format \ + --cov-report term-missing \ + --cov-report xml \ + --durations=10 \ + --cov-branch env: RUN_FULL: 1 - if: github.event_name == 'pull_request' - name: Upload pytest test results uses: actions/upload-artifact@master with: @@ -76,3 +75,34 @@ jobs: name: codecov-umbrella fail_ci_if_error: true if: matrix.python-version == 3.8 + docker: + runs-on: ubuntu-latest + name: docker build + steps: + - uses: actions/checkout@v2 + - name: build the docker container + run: | + docker build --file Dockerfile --tag bcgsc/mavis:latest . 
+ - name: test the help menu + run: | + docker run bcgsc/mavis -h + - name: Set up Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7 + - name: Install workflow dependencies + run: | + python -m pip install --upgrade pip setuptools wheel + pip install mavis_config pandas snakemake + - uses: eWaterCycle/setup-singularity@v6 + with: + singularity-version: 3.6.4 + - name: docker2singularity + run: + docker run --mount type=bind,source=/var/run/docker.sock,target=/var/run/docker.sock --mount type=bind,source="$(pwd)",target=/output --privileged -t --rm singularityware/docker2singularity bcgsc/mavis:latest + - name: Run analysis with snakemake & singularity + run: | + # get the SIMG filename + export SNAKEMAKE_CONTAINER=$(ls *mavis*.simg) + snakemake -j 2 --configfile tests/mini-tutorial.config.json --use-singularity + if: always() diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 35bf9a9d..3b19260f 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -8,10 +8,8 @@ on: types: [created] jobs: - deploy: - + pypi: runs-on: ubuntu-latest - steps: - uses: actions/checkout@v2 - name: Set up Python @@ -30,3 +28,15 @@ jobs: python setup.py sdist bdist_wheel install twine check dist/* twine upload dist/* + docker: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - run: docker login -u $DOCKER_USER -p $DOCKER_PASSWORD + env: + DOCKER_USER: ${{ secrets.DOCKER_USER }} + DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + - run: | + docker build --file Dockerfile --tag bcgsc/mavis:latest --tag bcgsc/mavis:${{ github.event.release.tag_name }} . 
+ - run: docker push bcgsc/mavis:latest + - run: docker push bcgsc/mavis:${{ github.event.release.tag_name }} diff --git a/.github/workflows/quick-tests.yml b/.github/workflows/quick-tests.yml new file mode 100644 index 00000000..a1d333b4 --- /dev/null +++ b/.github/workflows/quick-tests.yml @@ -0,0 +1,53 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: quick-tests + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10"] + name: python-${{ matrix.python-version }} quick + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip setuptools wheel + pip install .[test] + - name: Lint with flake8 + run: | + pip install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 src/mavis --count --select=E9,F63,F7,F82 --show-source --statistics + - name: Lint with black + run: | + pip install black + # stop the build if black needs to be run + black src/mavis -S -l 100 --check + - name: install bwa + run: | + git clone https://github.com/lh3/bwa.git + cd bwa + git checkout v0.7.17 + make + cd .. 
+ - name: install blat + run: | + wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat + chmod a+x blat + - name: run short tests with pytest + run: | + export PATH=$PATH:$(pwd):$(pwd)/bwa + pytest tests -v \ + --junitxml=junit/test-results-${{ matrix.python-version }}.xml \ + --durations=10 + env: + RUN_FULL: 0 diff --git a/.gitignore b/.gitignore index 01fe3a1e..c3cad29a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # python generated files /.eggs /coverage -/venv +/venv* /.coverage *.pyc *__pycache__ @@ -15,6 +15,7 @@ junit .tox *eggs/ .mypy_cache +.snakemake # aligners blat @@ -32,3 +33,12 @@ junit /docs/package/mavis/*/*.md # don't ignore subpackage summary files !/docs/package/mavis/*/index.md +docs/configuration/settings.md + +.snakemake +output_dir* +bin +dag* +tutorial_data +reference_inputs +tmp diff --git a/.readthedocs.yml b/.readthedocs.yml index aa35ae55..12ffe7f3 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -20,4 +20,4 @@ python: - method: pip path: . extra_requirements: - - docs + - doc diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..2d0dae78 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,55 @@ +FROM python:3.7-slim-buster + +WORKDIR /app + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y git wget make gcc libz-dev + +# pysam dependencies +RUN apt-get install -y libncurses5-dev zlib1g-dev libbz2-dev libncursesw5-dev liblzma-dev + +# install BWA +RUN git clone https://github.com/lh3/bwa.git && \ + cd bwa && \ + git checkout v0.7.17 && \ + make && \ + cd .. && \ + mv bwa/bwa /usr/local/bin + +# install minimap2 +RUN git clone https://github.com/lh3/minimap2.git && \ + cd minimap2 && \ + git checkout v2.24 && \ + make && \ + cd .. 
&& \ + mv minimap2/minimap2 /usr/local/bin + +# install blat dependencies +RUN apt-get install -y libcurl4 + +# install blat +RUN wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat && \ + chmod a+x blat && \ + mv blat /usr/local/bin + +# install wtdbg2 +RUN git clone https://github.com/ruanjue/wtdbg2.git && \ + cd wtdbg2 && \ + make && \ + cd .. && \ + mv wtdbg2/wtdbg2 /usr/local/bin + +COPY setup.py setup.py +COPY setup.cfg setup.cfg +COPY MANIFEST.in MANIFEST.in +COPY pyproject.toml pyproject.toml +COPY src src +COPY LICENSE LICENSE +COPY README.md README.md + +# install python package +RUN pip install -U setuptools pip wheel +RUN pip install . +RUN which mavis +ENTRYPOINT [ "mavis" ] diff --git a/MANIFEST.in b/MANIFEST.in index 1e842b2d..16491603 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,10 +1,6 @@ -recursive-include docs * -recursive-include tests *.py -include tests/*/data/* -recursive-include mavis *.py -recursive-include tools *.pl *.py *.pm -recursive-include tab *.py +recursive-include src *.py *.json +include src/mavis/py.typed include README.md include LICENSE -prune docs/build -prune docs/source/auto +prune docs +prune tests diff --git a/README.md b/README.md index c82b6738..b6e6ef45 100644 --- a/README.md +++ b/README.md @@ -4,10 +4,8 @@
- ![PyPi](https://img.shields.io/pypi/v/mavis.svg) ![build](https://github.com/bcgsc/mavis/workflows/build/badge.svg?branch=master) [![codecov](https://codecov.io/gh/bcgsc/mavis/branch/master/graph/badge.svg)](https://codecov.io/gh/bcgsc/mavis) ![ReadTheDocs](https://readthedocs.org/projects/pip/badge/) - ## About [MAVIS](http://mavis.bcgsc.ca) is python command-line tool for the post-processing of structural variant calls. @@ -39,87 +37,28 @@ Common problems and questions are addressed on the [wiki](https://github.com/bcg If you have a question or issue that is not answered there (or already a github issue) please submit a github issue to our [github page](https://github.com/bcgsc/mavis/issues) or contact us by email at [mavis@bcgsc.ca](mailto:mavis@bcgsc.ca) -## Install Instructions - -There are 3 major steps to setting up and installing [MAVIS](http://mavis.bcgsc.ca). If you are a developer contributing to mavis, please see the [instructions for developers page](https://mavis.readthedocs.io/en/latest/development) instead - -### 1. Install Aligner - -In addition to the python package dependencies, [MAVIS](http://mavis.bcgsc.ca) also requires an aligner to be installed. -Currently the only aligners supported are [blat](https://mavis.readthedocs.io/en/latest/glossary/#blat) and [bwa mem](https://mavis.readthedocs.io/en/latest/glossary/#bwa). -For MAVIS to run successfully the aligner must be installed and accessible on the path. -If you have a non-standard install you may find it useful to edit the PATH environment variable. For example - -``` bash -export PATH=/path/to/directory/containing/blat/binary:$PATH -``` - -[blat](http://mavis.bcgsc.ca/docs/latest/glossary.html#term-blat) is the default aligner. To configure MAVIS to use [bwa mem](http://mavis.bcgsc.ca/docs/latest/glossary.html#term-bwa) as a default instead, use the -[MAVIS environment variables](https://mavis.readthedocs.io/en/latest/configuration/settings/). 
Make sure to specify BOTH of the variables below to change the default aligner. - -``` bash -export MAVIS_ALIGNER='bwa mem' -export MAVIS_ALIGNER_REFERENCE=/path/to/mem/fasta/ref/file -``` - -After this has been installed MAVIS itself can be installed through [pip](https://pypi.org/project/mavis/) - -### 2. Install MAVIS - -#### Install using pip - -The easiest way to install [MAVIS](http://mavis.bcgsc.ca) is through the python package manager, pip. If you do not have python3 installed it can be found [here](https://www.python.org/downloads) - -Ensuring you have a recent version of pip and setuptools will improve the install experience. Older versions of pip and setuptools may have issues with obtaining some of the mavis python dependencies - -``` bash -pip install --upgrade pip "setuptools<58" -``` - -or (for Anaconda users) - -``` bash -conda update pip "setuptools<58" -``` - -If this is not a clean/new python install it may be useful to set up mavis in a [virtual python environment](https://docs.python.org/3/tutorial/venv.html) - -Then install mavis itself - -``` bash -pip install mavis -``` - -This will install mavis and its python dependencies. +## Getting Started -#### Install using Buildout +The simplest way to use MAVIS is via Singularity. The MAVIS docker container used +by singularity will take care of installing the aligner as well. -Alternatively you can use the [bootstrap/buildout](http://www.buildout.org/en/latest/) to install mavis into bin/mavis - -``` bash -git clone https://github.com/bcgsc/mavis.git -cd mavis -pip install zc.buildout -python bootstrap.py -bin/buildout +```bash +pip install -U setuptools pip +pip install mavis_config # also installs snakemake ``` -This will install mavis and its python dependencies into eggs inside the cloned mavis directory which can be used by simply running bin/mavis +Now you will run mavis via Snakemake as follows -### 3. 
Build or Download Reference Files - -After [MAVIS](http://mavis.bcgsc.ca) is installed the [reference files](https://mavis.readthedocs.io/en/latest/inputs/reference) must be generated (or downloaded) before it can be run. A simple bash script to download the hg19 reference files and generate a MAVIS environment file is provided under mavis/tools for convenience. - -``` bash -cd /path/to/where/you/want/to/put/the/files -wget https://raw.githubusercontent.com/bcgsc/mavis/master/tools/get_hg19_reference_files.sh -bash get_hg19_reference_files.sh -source reference_inputs/hg19_env.sh +```bash +snakemake \ + -j \ + --configfile \ + --use-singularity \ + -s Snakefile ``` -Once the above 3 steps are complete [MAVIS](http://mavis.bcgsc.ca) is ready to be run. -See the MAVIS [tutorial](https://mavis.readthedocs.io/en/latest/tutorials/mini) to learn about running MAVIS. - +For other installation options which do not use docker/singularity see the comprehensive install +instructions in the [user manual](https://mavis.readthedocs.io/en/latest/install) ## Citation diff --git a/Snakefile b/Snakefile new file mode 100644 index 00000000..c361c753 --- /dev/null +++ b/Snakefile @@ -0,0 +1,236 @@ +from snakemake.exceptions import WorkflowError +import os +import json +from mavis_config import ( + count_total_rows, + get_library_inputs, + get_singularity_bindings, + guess_total_batches, + validate_config, +) +from mavis_config.constants import SUBCOMMAND + +# env variable mainly for CI/CD +CONTAINER = os.environ.get('SNAKEMAKE_CONTAINER', 'docker://bcgsc/mavis:v3.0.0') +MAX_TIME = 57600 +DEFAULT_MEMORY_MB = 16000 + + +if 'output_dir' not in config: + raise WorkflowError('output_dir is a required property of the configfile') + + +def output_dir(*paths): + return os.path.join(config['output_dir'], *paths) + + +INITIALIZED_CONFIG = output_dir('config.json') +LOG_DIR = output_dir('logs') + +# external schedulers will not create the log dir if it does not already exist +if not 
os.path.exists(LOG_DIR): + os.makedirs(LOG_DIR, exist_ok=True) + +try: + validate_config(config, stage=SUBCOMMAND.SETUP) +except Exception as err: + short_msg = ' '.join(str(err).split('\n')[:2]) # these can get super long + raise WorkflowError(short_msg) + +# ADD bindings for singularity +workflow.singularity_args = f'-B {",".join(get_singularity_bindings(config))}' + +libraries = sorted(list(config['libraries'])) +VALIDATE_OUTPUT = output_dir('{library}/validate/batch-{job_id}/validation-passed.tab') +CLUSTER_OUTPUT = output_dir('{library}/cluster/batch-{job_id}.tab') + + +for library in libraries: + if 'total_batches' in config['libraries'][library]: + continue + + # if not input by user, estimate the clusters based on the input files + config['libraries'][library]['total_batches'] = guess_total_batches( + config, get_library_inputs(config, library) + ) + + +libs_args = [] +jobs_args = [] +for library in libraries: + for job_id in range(1, config['libraries'][library]['total_batches'] + 1): + libs_args.append(library) + jobs_args.append(job_id) + + +rule all: + input: output_dir('summary/MAVIS.COMPLETE') + + +rule copy_config: + output: output_dir('config.raw.json') + resources: + time_limit=MAX_TIME, + mem_mb=4000, + cpus=1, + log_dir=LOG_DIR + log: os.path.join(LOG_DIR, 'copy_config.snakemake.log.txt') + run: + with open(output_dir('config.raw.json'), 'w') as fh: + fh.write(json.dumps(config, sort_keys=True, indent=' ')) + +# adds the bam stats and default settings +rule init_config: + input: rules.copy_config.output + output: INITIALIZED_CONFIG + container: CONTAINER + log: os.path.join(LOG_DIR, 'init_config.snakemake.log.txt') + resources: + time_limit=MAX_TIME, + mem_mb=DEFAULT_MEMORY_MB, + cpus=1, + log_dir=LOG_DIR + shell: 'mavis setup --config {input} --outputfile {output}' + + +rule convert: + output: output_dir('converted_outputs/{alias}.tab') + input: rules.init_config.output + log: os.path.join(LOG_DIR, 'convert.snakemake.{alias}.log.txt') + params: 
+ file_type=lambda w: config['convert'][w.alias]['file_type'], + strand_specific=lambda w: config['convert'][w.alias]['strand_specific'], + assume_no_untemplated=lambda w: config['convert'][w.alias]['assume_no_untemplated'], + input_files=lambda w: config['convert'][w.alias]['inputs'] + container: CONTAINER + resources: + time_limit=MAX_TIME, + mem_mb=DEFAULT_MEMORY_MB, + cpus=1, + log_dir=LOG_DIR + shell: + 'mavis convert --file_type {params.file_type}' + + ' --strand_specific {params.strand_specific}' + + ' --assume_no_untemplated {params.assume_no_untemplated}' + + ' --inputs {params.input_files}' + + ' --outputfile {output}' + + ' &> {log}' + + +def get_cluster_inputs(w): + conversions = config['convert'] + inputs = [] + for assignment in config['libraries'][w.library]['assign']: + if assignment in conversions: + inputs.extend(expand(rules.convert.output, alias=assignment)) + else: + inputs.append(assignment) + + return inputs + + +rule cluster: + input: files=get_cluster_inputs, + config=rules.init_config.output + output: directory(output_dir('{library}/cluster')) + log: os.path.join(LOG_DIR, 'snakemake.cluster.{library}.log.txt') + container: CONTAINER + resources: + time_limit=MAX_TIME, + mem_mb=DEFAULT_MEMORY_MB, + cpus=1, + log_dir=LOG_DIR + shell: + 'mavis cluster --config {input.config}' + + ' --library {wildcards.library}' + + ' --inputs {input.files}' + + ' --output {output}' + + ' &> {log}' + + +if not config['skip_stage.validate']: + rule validate: + input: rules.cluster.output + params: + dirname=lambda w: output_dir(f'{w.library}/validate/batch-{w.job_id}'), + inputfile=lambda w: expand(CLUSTER_OUTPUT, library=[w.library], job_id=[w.job_id]) + output: VALIDATE_OUTPUT + log: os.path.join(LOG_DIR, '{library}.validate.snakemake.batch-{job_id}.log.txt') + container: CONTAINER + resources: + time_limit=MAX_TIME, + mem_mb=18000, + cpus=2, + log_dir=LOG_DIR + shell: + 'mavis validate --config {rules.init_config.output}' + + ' --library 
{wildcards.library}' + ' --inputs {params.inputfile}' + ' --output {params.dirname}' + ' &> {log}' + + +def get_annotate_input_file(wildcards): + if not config['skip_stage.validate']: + return expand(rules.validate.output, library=[wildcards.library], job_id=[wildcards.job_id]) + return expand(CLUSTER_OUTPUT, library=[wildcards.library], job_id=[wildcards.job_id]) + + +rule annotate: + input: rules.validate.output if not config['skip_stage.validate'] else rules.cluster.output + output: stamp=output_dir('{library}/annotate/batch-{job_id}/MAVIS.COMPLETE'), + result=output_dir('{library}/annotate/batch-{job_id}/annotations.tab') + params: + inputfile=get_annotate_input_file + log: os.path.join(LOG_DIR, '{library}.annotate.snakemake.batch-{job_id}.log.txt') + container: CONTAINER + resources: + time_limit=MAX_TIME, + mem_mb=DEFAULT_MEMORY_MB, + cpus=2, + log_dir=LOG_DIR + shell: + 'mavis annotate --config {rules.init_config.output}' + + ' --library {wildcards.library}' + + ' --inputs {params.inputfile}' + + ' --output ' + output_dir('{wildcards.library}/annotate/batch-{wildcards.job_id}') + + ' &> {log}' + + +rule pairing: + input: expand(rules.annotate.output.result, zip, library=libs_args, job_id=jobs_args) + output: stamp=output_dir('pairing/MAVIS.COMPLETE'), + result=output_dir('pairing/mavis_paired.tab') + params: + dirname=output_dir('pairing') + log: os.path.join(LOG_DIR, 'snakemake.pairing.log.txt') + container: CONTAINER + resources: + time_limit=MAX_TIME, + mem_mb=DEFAULT_MEMORY_MB, + cpus=1, + log_dir=LOG_DIR + shell: + 'mavis pairing --config {rules.init_config.output}' + + ' --inputs {input}' + + ' --output {params.dirname}' + + ' &> {log}' + + +rule summary: + input: rules.pairing.output.result, + output: output_dir('summary/MAVIS.COMPLETE') + params: + dirname=output_dir('summary') + log: os.path.join(LOG_DIR, 'snakemake.summary.log.txt') + container: CONTAINER + resources: + time_limit=MAX_TIME, + mem_mb=DEFAULT_MEMORY_MB, + cpus=1, + 
log_dir=LOG_DIR + shell: + 'mavis summary --config {rules.init_config.output}' + + ' --inputs {input}' + + ' --output {params.dirname}' + + ' &> {log}' diff --git a/docs/background/.pages b/docs/background/.pages new file mode 100644 index 00000000..278d1410 --- /dev/null +++ b/docs/background/.pages @@ -0,0 +1,3 @@ +nav: + - theory.md + - citations.md diff --git a/docs/background/theory.md b/docs/background/theory.md index ac4ab2a7..d63210a4 100644 --- a/docs/background/theory.md +++ b/docs/background/theory.md @@ -6,26 +6,20 @@ In MAVIS structural variants (SVs) are defined by a pair of breakpoints And a breakpoint is defined by -1. chromosome -2. base-pair range (start, end). This has a length of 1 for exact calls - and more for uncertain/non-specific calls -3. [orientation](../../glossary/#orientation). This is Left or Right - with respect to the positive/forward strand. This defines which - portion of the genome is 'retained' -4. [strand](../../glossary/#strand). (only applicable to - stranded transcriptome libraries) +1. chromosome +2. base-pair range (start, end). This has a length of 1 for exact calls and more for uncertain/non-specific calls +3. [orientation](../../glossary/#orientation). This is Left or Right with respect to the positive/forward strand. This defines which portion of the genome is 'retained' +4. [strand](../../glossary/#strand). (only applicable to stranded transcriptome libraries) So then a breakpoint pair is any two intervals on the reference genome which are adjacent in the mutant genome - - ## Evidence There are many ways that single reads or paired-end reads can act as support for an SV call. -![](../images/read_evidence.svg) +![read evidence](../images/read_evidence.svg) In the figure above the red rectangle represents a deletion structural variant. 
The arrows are types of single or paired-end reads supporting @@ -58,7 +52,7 @@ For a deletion, we expect the flanking reads to be in the normal orientation but that the fragment size should be abnormal (for large deletions). -![](../images/read_pairs_deletion.svg) +![deletion supporting read pairs](../images/read_pairs_deletion.svg) Flanking read pair evidence for a deletion event. the read pairs will have a larger than expected fragment size when mapped to the reference @@ -71,10 +65,9 @@ on the positive strand and the second read in the pair would be on the negative/reverse strand. - #### Insertion -![](../images/read_pairs_insertion.svg) +![insertion supporting read pairs](../images/read_pairs_insertion.svg) Flanking read pair evidence for an insertion event. The read pairs will have a smaller than expected fragment size when mapped to the @@ -87,10 +80,9 @@ on the positive strand and the second read in the pair would be on the negative/reverse strand. - #### Duplication -![](../images/read_pairs_duplication.svg) +![duplication support read pairs](../images/read_pairs_duplication.svg) Flanking read pair evidence for a tandem duplication event. The read pairs will have an abnormal orientation but still the same strands as @@ -99,57 +91,48 @@ strand and have a right orientation. (B2) The second breakpoint will be on the positive strand and have a left orientation. - #### Inversion -![](../images/read_pairs_inversion_LL.svg) +![inversion supporting read pairs](../images/read_pairs_inversion_LL.svg) Flanking read pair evidence for an inversion. Both breakpoints have left orientation. - -![](../images/read_pairs_inversion_RR.svg) +![inversion supporting read pairs](../images/read_pairs_inversion_RR.svg) Flanking read pair evidence for an inversion. Both breakpoints have right orientation. 
- #### Translocation -![](../images/read_pairs_translocation_LR.svg) +![translocation supporting read pairs](../images/read_pairs_translocation_LR.svg) Flanking read pair evidence for a translocation. (B1) the first breakpoint with a left orientation. (B2) the second breakpoint with a right orientation. - -![](../images/read_pairs_translocation_RL.svg) +![translocation supporting read pairs](../images/read_pairs_translocation_RL.svg) Flanking read pair evidence for a translocation. (B1) the first breakpoint with a right orientation. (B2) the second breakpoint with a left orientation. - #### Inverted Translocation -![](../images/read_pairs_translocated_inversion_LL.svg) +![translocation supporting read pairs](../images/read_pairs_translocated_inversion_LL.svg) Flanking read pair evidence for an inverted translocation. Both breakpoints have left orientation. - -![](../images/read_pairs_translocated_inversion_RR.svg) +![translocation supporting read pairs](../images/read_pairs_translocated_inversion_RR.svg) Flanking read pair evidence for an inverted translocation. Both breakpoints have right orientation. - - - ### Compatible Flanking Pairs For insertion and duplication events compatible flanking pairs are @@ -158,7 +141,7 @@ be used as compatible flanking evidence for an insertion (in the same region) and similarly flanking pairs which support an insertion may be compatible flanking evidence for a duplication -![](../images/compatible_flanking_pairs.svg) +![compatible flanking pairs](../images/compatible_flanking_pairs.svg) The event depicted above may be called as either a duplication or an insertion (depending on the input call). If the even were called as a @@ -167,30 +150,24 @@ reads in blue would be given as compatible flanking support. If the event were called as an insertion the reverse would apply. 
- - - ### Calculating the Evidence Window -![](../images/read_pair_definitions.svg) +![read pair defn terms](../images/read_pair_definitions.svg) Basic Terms used in describing read pairs are shown above: fragment size: the distance between the pair; read length: the length of the read; fragment size: the combined length of both reads and the fragment size - We make some base assumptions with regards to paired-end read data: !!! note the distribution of fragment sizes approximately follows a normal distribution - !!! note the most common fragment size is the unmutated 'normal' fragment - With the above assumptions we take the median fragment size to be the expected normal. @@ -217,7 +194,7 @@ stdev = math.sqrt(sum(X) / len(X)) This gives us an idea of when to judge an fragment size as abnormal and where we expect our normal read pairs fragment sizes to fall. -![](../images/fragment_sizes_histogram.svg) +![read pair fragment size histogram](../images/fragment_sizes_histogram.svg) Distribution of fragment sizes (absolute values) of proper read pairs. The black curve representings the fit for a normal distribution using @@ -227,14 +204,13 @@ thick vertical black line is the median and the thin black lines are standard deviations away from the median. - As we can see from the diagram above, removing the outliers reproduces the observed distribution better than using all data points We use this in two ways -1. to find flanking evidence supporting deletions and insertions -2. to estimate the window size for where we will need to read from the +1. to find flanking evidence supporting deletions and insertions +2. to estimate the window size for where we will need to read from the bam when looking for evidence for a given event The @@ -250,8 +226,6 @@ complicated and we must take into account the possible annotations when calculating the evidence window. 
see `mavis.validate.evidence.TranscriptomeEvidence._generate_window` for more - - ### Calling Breakpoints by Flanking Evidence Breakpoints are called by contig, split-read, or flanking pairs @@ -273,8 +247,6 @@ outline, no fill) demonstrates the read length used to narrow the right side bound of the [estimated breakpoint interval. - - ### Determining Flanking support ![flanking support](../images/flanking_pairs_fragment_sizes_deletion.svg) @@ -290,7 +262,6 @@ The shaded portion of the graph represents the range in fragment sizes we expect for flanking pairs supporting the deletion event. - ## Classifying Events The following decision tree is used in classifying events based on their @@ -317,8 +288,6 @@ reverse complement are assembled into contigs using a [DeBruijn graph](../../glossary/#debruijn-graph). For strand specific events, we then attempt to resolve the sequence strand of the contig. - - ## Annotating Events We make the following assumptions when determining the annotations for @@ -328,15 +297,12 @@ each event If both breakpoints are in the same gene, they must also be in the same transcript - !!! note If the breakpoint intervals overlap we do not annotate encompassed genes - !!! note Encompassed and 'nearest' genes are reported without respect to strand - There are specific questions we want annotation to answer. We collect gene level annotations which describes things like what gene is near the breakpoint (useful in the case of a potential promoter swap); what genes @@ -357,8 +323,6 @@ computed. This is translated to a putative amino acid sequence from which protein metrics such as the possible ORFs and domain sequences can be computed. - - ## Predicting Splicing Patterns After the events have been called and an annotation has been attached, @@ -392,28 +356,17 @@ is paired with the 2nd donor site More complex examples are drawn below. 
There are five classifications -(`mavis.constants.SPLICE_TYPE`) for the +([`mavis.constants.SPLICE_TYPE`](../../package/mavis/constants/#class-mavisconstantssplice_type)) for the different splicing patterns: -1. Retained intron - (`mavis.constants.SPLICE_TYPE.RETAIN`{.interpreted-text - role="class"}) -2. Skipped exon (`mavis.constants.SPLICE_TYPE.SKIP`{.interpreted-text - role="attr"}) -3. Multiple retained introns - (`mavis.constants.SPLICE_TYPE.MULTI_RETAIN`{.interpreted-text - role="attr"}) -4. Multiple skipped exons - (`mavis.constants.SPLICE_TYPE.MULTI_SKIP`{.interpreted-text - role="attr"}) -5. Some combination of retained introns and skipped exons - (`mavis.constants.SPLICE_TYPE.COMPLEX`{.interpreted-text - role="attr"}) +1. Retained intron +2. Skipped exon +3. Multiple retained introns +4. Multiple skipped exons +5. Some combination of retained introns and skipped exons ![Splicing scenarios](../images/splicing_model.svg) - - ## Pairing Similar Events After breakpoints have been called and annotated we often need to see if @@ -430,7 +383,6 @@ rise to the following basic cases. breakpoint, or it is the same as the nearest retained donor/acceptor to the breakpoint. - ![exonic splicing](../images/breakpoint_prediction_exonic.svg) (A-D) The breakpoint lands in an exon and the five prime portion of diff --git a/docs/configuration/general.md b/docs/configuration/general.md index 443cedee..dea8801f 100644 --- a/docs/configuration/general.md +++ b/docs/configuration/general.md @@ -1,57 +1,34 @@ # Getting Started -An exhaustive list of the various configurable settings can be found [here](../settings) +An exhaustive list of the various configurable settings can be found [here](../settings). 
Alternatively you can view them through the [online schema explorer](https://json-schema.app/view?url=https://raw.githubusercontent.com/bcgsc/mavis_config/master/src/mavis_config/config.json) ## Pipeline Configuration File -The pipeline can be run in steps or it can be configured using a +The pipeline can be run in steps or it can be configured using a JSON configuration file and setup in a single step. Scripts will be generated -to run all steps following clustering. The configuration file can be -built from scratch or a template can be output as shown below +to run all steps following clustering. -```bash -mavis config --write template.cfg -``` +The config schema is found in the mavis package under `src/mavis/schemas/config.json` -This will create a template config file called template.cfg which can -then be edited by the user. However this will be a simple config with no -library information. To generate a configuration file with the library -information as well as estimates for the fragment size parameters more -inputs are required (see -[generating the config file](../../tutorials/full/#generating-the-config-file) for more information). - -## Environment Variables - -Most of the default settings can be changed by using environment -variables. The value given by the environment variables will be used as -the new default. Config or command-line parameters will still override -these settings. - -All environment variables are prefixed with MAVIS and an underscore. -Otherwise the variable name is the same as that used for the command -line parameter or config setting (uppercased). For example to change the -default minimum mapping quality used during the validate stage - -```bash -export MAVIS_MIN_MAPPING_QUALITY=10 -``` +Top level settings follow the pattern `
.`. The convert and library +sections are nested objects. ## Adjusting the Resource Requirements ### Choosing the Number of Validation/Annotation Jobs MAVIS chooses the number of jobs to split validate/annotate stages into -based on two settings: [max_files](../../configuration/settings/#max_files) and -[min_clusters_per_file](../../configuration/settings/#min-clusters-per-file). +based on two settings: [cluster.max_files](../../configuration/settings/#clustermax_files) and +[cluster.min_clusters_per_file](../../configuration/settings/#clustermin-clusters-per-file). For example, in the following situation say you have: 1000 clusters, -`max_files=10`, and `min_clusters_per_file=10`. Then MAVIS will set up +`cluster.max_files=10`, and `cluster.min_clusters_per_file=10`. Then MAVIS will set up 10 validation jobs each with 100 events. -However, if `min_clusters_per_file=500`, then MAVIS would only set up 2 +However, if `cluster.min_clusters_per_file=500`, then MAVIS would only set up 2 jobs each with 500 events. This is because -[min_clusters_per_file](../../configuration/settings/#min-clusters-per-file) takes precedence -over [max_files](../../configuration/settings/#max_files). +[cluster.min_clusters_per_file](../../configuration/settings/#clustermin-clusters-per-file) takes precedence +over [cluster.max_files](../../configuration/settings/#clustermax_files). Splitting into more jobs will lower the resource requirements per job (see [resource requirements](../performance/)). The memory and time requirements for validation are linear @@ -60,27 +37,8 @@ with respect to the number of events to be validated. ### Uninformative Filter For example, if the user is only interested in events in genes, then the -[uninformative_filter](../../configuration/settings/#uninformative_filter) can be used. This +[cluster.uninformative_filter](../../configuration/settings/#clusteruninformative_filter) can be used. 
This will drop all events that are not within a certain distance -([max_proximity](../../configuration/settings/#max_proximity)) to any annotation in +([cluster.max_proximity](../../configuration/settings/#clustermax_proximity)) to any annotation in the annotations reference file. These events will be dropped prior to the validation stage which results in significant speed up. - -This can be set using the environment variable - -```bash -export MAVIS_UNINFORMATIVE_FILTER=True -``` - -or in the pipeline config file - -```text -[cluster] -uninformative_filter = True -``` - -or as a command line argument to the cluster stage - -```bash -mavis cluster --uninformative_filter True .... -``` diff --git a/docs/configuration/performance.md b/docs/configuration/performance.md index e666c10a..a5cb9e7c 100644 --- a/docs/configuration/performance.md +++ b/docs/configuration/performance.md @@ -12,7 +12,7 @@ cpu requirements depending on what the user is trying to analyze. See ## Validation Resources -![](../images/colo829_tumour_validation_resource_req.png) +![validation resources](../images/colo829_tumour_validation_resource_req.png) Resource Requirements (MAVIS 1.8.0) for each validation job of the COLO829 tumour genome. The BAM file for the tumour genome is 127GB. @@ -21,7 +21,6 @@ structural variant validations per job. The effect of number of events validated on both memory and time is plotted above. - ## Annotation Resources Similar trends were observed for the annotation step (see below) with @@ -29,7 +28,7 @@ regards to time elapsed. However the memory requirements remained more constant which is expected since, unlike validation, anntotation does not read more data in for more events. -![](../images/colo829_tumour_annotation_resource_req.png) +![annotation resources](../images/colo829_tumour_annotation_resource_req.png) Resource Requirements (MAVIS 1.8.0) for each annotation job of the COLO829 tumour genome. 
The events which passed validation (see above) diff --git a/docs/configuration/pipeline.md b/docs/configuration/pipeline.md index 73e2126e..a79e16f2 100644 --- a/docs/configuration/pipeline.md +++ b/docs/configuration/pipeline.md @@ -2,135 +2,32 @@ ## Running MAVIS using a Job Scheduler -The setup step of MAVIS is set up to use a job scheduler on a -compute cluster. will generate submission scripts and a wrapper bash -script for the user to execute on their cluster head node. - -![](../images/pipeline_options.svg) +MAVIS v3 uses [snakemake](https://snakemake.readthedocs.io/en/stable/) to handle job scheduling +and setup The MAVIS pipeline is highly configurable. Some pipeline steps (cluster, validate) are optional and can be automatically skipped. The standard pipeline is far-left. - -The most common use case is -[auto-generating a configuration file](../../tutorials/full/#generating-the-config-file) and then running the pipeline setup step. The pipeline setup -step will run clustering and create scripts for running the other steps. +The most common use case is running the pipeline through snakemake ```bash -mavis config .... -w config.cfg -mavis setup config.cfg -o /path/to/top/output_dir +snakemake -j --configfile -s Snakefile ``` -This will create the build.cfg configuration file, which is used by the -scheduler to submit jobs. To use a particular scheduler you will need to -set the `MAVIS_SCHEDULER` environment variable. After the -build configuration file has been created you can run the mavis schedule -option to submit your jobs +If you are submitting to a cluster, use the [snakemake profiles](https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles) ```bash -ssh cluster_head_node -mavis schedule -o /path/to/output_dir --submit +snakemake -j --configfile --profile -s Snakefile ``` This will submit a series of jobs with dependencies. -![](../images/pipeline_dependency_graph.svg) - -Dependency graph of MAVIS jobs for the standard pipeline setup. 
The -notation on the arrows indicates the SLURM setting on the job to add the -dependency on the previous -job. - - -### Configuring Scheduler Settings - -There are multiple ways to configure the scheduler settings. Some of the -configurable options are listed below - -- [MAVIS_QUEUE](../../configuration/settings/#queue) -- [MAVIS_MEMORY_LIMIT](../../configuration/settings/#memory_limit) -- [MAVIS_TIME_LIMIT](../../configuration/settings/#time_limit) -- [MAVIS_IMPORT_ENV](../../configuration/settings/#import_env) -- [MAVIS_SCHEDULER](../../configuration/settings/#scheduler) - -For example to set the job queue default using an -[environment variable](../../configuration/general/#environment-variables) - -```bash -export MAVIS_QUEUE=QUEUENAME -``` - -Or it can also be added to the config file manually - - [schedule] - queue = QUEUENAME - -### Troubleshooting Dependency Failures - -The most common error to occur when running MAVIS on the cluster is a -memory or time limit exception. These can be detected by running the -schedule step or looking for dependency failures reported on the -cluster. The suffix of the job name will be a number and will correspond -to the suffix of the job directory. +To use the mavis docker container through singularity, instead of installing mavis via pip, add the +[`--use-singularity`](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#running-jobs-in-containers) +flag. ```bash -mavis schedule -o /path/to/output/dir +snakemake -j --configfile --profile --use-singularity -s Snakefile` ``` - -This will report any failed jobs. For example if this were a crash -report for one of the validation jobs we might expect to see something -like below in the schedule output - - [2018-05-31 13:02:06] validate - MV__- () is FAILED - CRASH: - -Any jobs in an error, failed, etc. 
state can be resubmitted by running -mavis schedule with the resubmit flag - -```bash -mavis schedule -o /path/to/output/dir --resubmit -``` - -If a job has failed due to memory or time limits, editing the -`/path/to/output/dir/build.cfg` file can allow the user to change a job -without resetting up and rerunning the other jobs. For example, below is -the configuration for a validation job - - [MV_mock-A47933_batch-D2nTiy9AhGye4UZNapAik6] - stage = validate - job_ident = 1691742 - name = MV_mock-A47933_batch-D2nTiy9AhGye4UZNapAik6 - dependencies = - script = /path/to/output/dir/mock-A47933_diseased_transcriptome/validate/submit.sh - status = FAILED - output_dir = /path/to/output/dir/mock-A47933_diseased_transcriptome/validate/batch-D2nTiy9AhGye4UZNapAik6-{task_ident} - stdout = /path/to/output/dir/mock-A47933_diseased_transcriptome/validate/batch-D2nTiy9AhGye4UZNapAik6-{task_ident}/job-{name}-{job_ident}-{task_ident}.log - created_at = 1527641526 - status_comment = - memory_limit = 18000 - queue = short - time_limit = 57600 - import_env = True - mail_user = - mail_type = NONE - concurrency_limit = None - task_list = 1 - 2 - 3 - -The memory\_limit is in Mb and the time\_limit is in seconds. Editing -the values here will cause the job to be resubmitted with the new -values. - -!!! warning - Incorrectly editing the build.cfg file may have unanticipated results - and require re-setting up MAVIS to fix. Generally the user should ONLY - edit `memory_limit` and `time_limit` values. 
- - If memory errors are frequent then it would be better to adjust the - default values ([trans_validation_memory](../../configuration/settings/#trans_validation_memory), - [validation_memory](../../configuration/settings/#validation_memory), - [time_limit](../../configuration/settings/#time_limit)) diff --git a/docs/configuration/settings.md b/docs/configuration/settings.md deleted file mode 100644 index 1921e93b..00000000 --- a/docs/configuration/settings.md +++ /dev/null @@ -1,1211 +0,0 @@ - - -# Configurable Settings -## aligner - -**type**: `#!python mavis.align.SUPPORTED_ALIGNER` - -**environment variable**: `MAVIS_ALIGNER` - -**default**: `#!python 'blat'` - -**accepted values**: `'bwa mem'`, `'blat'` - - -The aligner to use to map the contigs/reads back to the reference e.g blat or bwa - - -## aligner_reference - -**type**: `#!python filepath` - -**environment variable**: `MAVIS_ALIGNER_REFERENCE` - -**default**: `#!python None` - -Path to the aligner reference file used for aligning the contig sequences - - -## annotation_filters - -**type**: `#!python str` - -**environment variable**: `MAVIS_ANNOTATION_FILTERS` - -**default**: `#!python 'choose_more_annotated,choose_transcripts_by_priority'` - -A comma separated list of filters to apply to putative annotations - - -## annotation_memory - -**type**: `#!python int` - -**environment variable**: `MAVIS_ANNOTATION_MEMORY` - -**default**: `#!python 12000` - -Default memory limit (mb) for the annotation stage - - -## annotations - -**type**: `#!python filepath` - -**environment variable**: `MAVIS_ANNOTATIONS` - -**default**: `#!python []` - -Path to the reference annotations of genes, transcript, exons, domains, etc - - -## assembly_kmer_size - -**type**: `#!python float_fraction` - -**environment variable**: `MAVIS_ASSEMBLY_KMER_SIZE` - -**default**: `#!python 0.74` - -The percent of the read length to make kmers for assembly - - -## assembly_max_paths - -**type**: `#!python int` - -**environment variable**: 
`MAVIS_ASSEMBLY_MAX_PATHS` - -**default**: `#!python 8` - -The maximum number of paths to resolve. this is used to limit when there is a messy assembly graph to resolve. the assembly will pre-calculate the number of paths (or putative assemblies) and stop if it is greater than the given setting - - -## assembly_min_edge_trim_weight - -**type**: `#!python int` - -**environment variable**: `MAVIS_ASSEMBLY_MIN_EDGE_TRIM_WEIGHT` - -**default**: `#!python 3` - -This is used to simplify the debruijn graph before path finding. edges with less than this frequency will be discarded if they are non-cutting, at a fork, or the end of a path - - -## assembly_min_exact_match_to_remap - -**type**: `#!python int` - -**environment variable**: `MAVIS_ASSEMBLY_MIN_EXACT_MATCH_TO_REMAP` - -**default**: `#!python 15` - -The minimum length of exact matches to initiate remapping a read to a contig - - -## assembly_min_remap_coverage - -**type**: `#!python float_fraction` - -**environment variable**: `MAVIS_ASSEMBLY_MIN_REMAP_COVERAGE` - -**default**: `#!python 0.9` - -Minimum fraction of the contig sequence which the remapped sequences must align over - - -## assembly_min_remapped_seq - -**type**: `#!python int` - -**environment variable**: `MAVIS_ASSEMBLY_MIN_REMAPPED_SEQ` - -**default**: `#!python 3` - -The minimum input sequences that must remap for an assembled contig to be used - - -## assembly_min_uniq - -**type**: `#!python float_fraction` - -**environment variable**: `MAVIS_ASSEMBLY_MIN_UNIQ` - -**default**: `#!python 0.1` - -Minimum percent uniq required to keep separate assembled contigs. 
if contigs are more similar then the lower scoring, then shorter, contig is dropped - - -## assembly_strand_concordance - -**type**: `#!python float_fraction` - -**environment variable**: `MAVIS_ASSEMBLY_STRAND_CONCORDANCE` - -**default**: `#!python 0.51` - -When the number of remapped reads from each strand are compared, the ratio must be above this number to decide on the strand - - -## blat_limit_top_aln - -**type**: `#!python int` - -**environment variable**: `MAVIS_BLAT_LIMIT_TOP_ALN` - -**default**: `#!python 10` - -Number of results to return from blat (ranking based on score) - - -## blat_min_identity - -**type**: `#!python float_fraction` - -**environment variable**: `MAVIS_BLAT_MIN_IDENTITY` - -**default**: `#!python 0.9` - -The minimum percent identity match required for blat results when aligning contigs - - -## breakpoint_color - -**type**: `#!python str` - -**environment variable**: `MAVIS_BREAKPOINT_COLOR` - -**default**: `#!python '#000000'` - -Breakpoint outline color - - -## call_error - -**type**: `#!python int` - -**environment variable**: `MAVIS_CALL_ERROR` - -**default**: `#!python 10` - -Buffer zone for the evidence window - - -## clean_aligner_files - -**type**: `#!python cast_boolean` - -**environment variable**: `MAVIS_CLEAN_ALIGNER_FILES` - -**default**: `#!python False` - -Remove the aligner output files after the validation stage is complete. 
not required for subsequent steps but can be useful in debugging and deep investigation of events - - -## cluster_initial_size_limit - -**type**: `#!python int` - -**environment variable**: `MAVIS_CLUSTER_INITIAL_SIZE_LIMIT` - -**default**: `#!python 25` - -The maximum cumulative size of both breakpoints for breakpoint pairs to be used in the initial clustering phase (combining based on overlap) - - -## cluster_radius - -**type**: `#!python int` - -**environment variable**: `MAVIS_CLUSTER_RADIUS` - -**default**: `#!python 100` - -Maximum distance allowed between paired breakpoint pairs - - -## concurrency_limit - -**type**: `#!python int` - -**environment variable**: `MAVIS_CONCURRENCY_LIMIT` - -**default**: `#!python None` - -The concurrency limit for tasks in any given job array or the number of concurrent processes allowed for a local run - - -## contig_aln_max_event_size - -**type**: `#!python int` - -**environment variable**: `MAVIS_CONTIG_ALN_MAX_EVENT_SIZE` - -**default**: `#!python 50` - -Relates to determining breakpoints when pairing contig alignments. for any given read in a putative pair the soft clipping is extended to include any events of greater than this size. 
the softclipping is added to the side of the alignment as indicated by the breakpoint we are assigning pairs to - - -## contig_aln_merge_inner_anchor - -**type**: `#!python int` - -**environment variable**: `MAVIS_CONTIG_ALN_MERGE_INNER_ANCHOR` - -**default**: `#!python 20` - -The minimum number of consecutive exact match base pairs to not merge events within a contig alignment - - -## contig_aln_merge_outer_anchor - -**type**: `#!python int` - -**environment variable**: `MAVIS_CONTIG_ALN_MERGE_OUTER_ANCHOR` - -**default**: `#!python 15` - -Minimum consecutively aligned exact matches to anchor an end for merging internal events - - -## contig_aln_min_anchor_size - -**type**: `#!python int` - -**environment variable**: `MAVIS_CONTIG_ALN_MIN_ANCHOR_SIZE` - -**default**: `#!python 50` - -The minimum number of aligned bases for a contig (m or =) in order to simplify. do not have to be consecutive - - -## contig_aln_min_extend_overlap - -**type**: `#!python int` - -**environment variable**: `MAVIS_CONTIG_ALN_MIN_EXTEND_OVERLAP` - -**default**: `#!python 10` - -Minimum number of bases the query coverage interval must be extended by in order to pair alignments as a single split alignment - - -## contig_aln_min_query_consumption - -**type**: `#!python float_fraction` - -**environment variable**: `MAVIS_CONTIG_ALN_MIN_QUERY_CONSUMPTION` - -**default**: `#!python 0.9` - -Minimum fraction of the original query sequence that must be used by the read(s) of the alignment - - -## contig_aln_min_score - -**type**: `#!python float_fraction` - -**environment variable**: `MAVIS_CONTIG_ALN_MIN_SCORE` - -**default**: `#!python 0.9` - -Minimum score for a contig to be used as evidence in a call by contig - - -## contig_call_distance - -**type**: `#!python int` - -**environment variable**: `MAVIS_CONTIG_CALL_DISTANCE` - -**default**: `#!python 10` - -The maximum distance allowed between breakpoint pairs (called by contig) in order for them to pair - - -## dgv_annotation - -**type**: 
`#!python filepath` - -**environment variable**: `MAVIS_DGV_ANNOTATION` - -**default**: `#!python []` - -Path to the dgv reference processed to look like the cytoband file - - -## domain_color - -**type**: `#!python str` - -**environment variable**: `MAVIS_DOMAIN_COLOR` - -**default**: `#!python '#ccccb3'` - -Domain fill color - - -## domain_mismatch_color - -**type**: `#!python str` - -**environment variable**: `MAVIS_DOMAIN_MISMATCH_COLOR` - -**default**: `#!python '#b2182b'` - -Domain fill color on 0%% match - - -## domain_name_regex_filter - -**type**: `#!python str` - -**environment variable**: `MAVIS_DOMAIN_NAME_REGEX_FILTER` - -**default**: `#!python '^PF\\d+$'` - -The regular expression used to select domains to be displayed (filtered by name) - - -## domain_scaffold_color - -**type**: `#!python str` - -**environment variable**: `MAVIS_DOMAIN_SCAFFOLD_COLOR` - -**default**: `#!python '#000000'` - -The color of the domain scaffold - - -## draw_fusions_only - -**type**: `#!python cast_boolean` - -**environment variable**: `MAVIS_DRAW_FUSIONS_ONLY` - -**default**: `#!python True` - -Flag to indicate if events which do not produce a fusion transcript should produce illustrations - - -## draw_non_synonymous_cdna_only - -**type**: `#!python cast_boolean` - -**environment variable**: `MAVIS_DRAW_NON_SYNONYMOUS_CDNA_ONLY` - -**default**: `#!python True` - -Flag to indicate if events which are synonymous at the cdna level should produce illustrations - - -## drawing_width_iter_increase - -**type**: `#!python int` - -**environment variable**: `MAVIS_DRAWING_WIDTH_ITER_INCREASE` - -**default**: `#!python 500` - -The amount (in pixels) by which to increase the drawing width upon failure to fit - - -## exon_min_focus_size - -**type**: `#!python int` - -**environment variable**: `MAVIS_EXON_MIN_FOCUS_SIZE` - -**default**: `#!python 10` - -Minimum size of an exon for it to be granted a label or min exon width - - -## fetch_min_bin_size - -**type**: `#!python int` - 
-**environment variable**: `MAVIS_FETCH_MIN_BIN_SIZE` - -**default**: `#!python 50` - -The minimum size of any bin for reading from a bam file. increasing this number will result in smaller bins being merged or less bins being created (depending on the fetch method) - - -## fetch_reads_bins - -**type**: `#!python int` - -**environment variable**: `MAVIS_FETCH_READS_BINS` - -**default**: `#!python 5` - -Number of bins to split an evidence window into to ensure more even sampling of high coverage regions - - -## fetch_reads_limit - -**type**: `#!python int` - -**environment variable**: `MAVIS_FETCH_READS_LIMIT` - -**default**: `#!python 3000` - -Maximum number of reads, cap, to loop over for any given evidence window - - -## filter_cdna_synon - -**type**: `#!python cast_boolean` - -**environment variable**: `MAVIS_FILTER_CDNA_SYNON` - -**default**: `#!python True` - -Filter all annotations synonymous at the cdna level - - -## filter_min_complexity - -**type**: `#!python float_fraction` - -**environment variable**: `MAVIS_FILTER_MIN_COMPLEXITY` - -**default**: `#!python 0.2` - -Filter event calls based on call sequence complexity - - -## filter_min_flanking_reads - -**type**: `#!python int` - -**environment variable**: `MAVIS_FILTER_MIN_FLANKING_READS` - -**default**: `#!python 10` - -Minimum number of flanking pairs for a call by flanking pairs - - -## filter_min_linking_split_reads - -**type**: `#!python int` - -**environment variable**: `MAVIS_FILTER_MIN_LINKING_SPLIT_READS` - -**default**: `#!python 1` - -Minimum number of linking split reads for a call by split reads - - -## filter_min_remapped_reads - -**type**: `#!python int` - -**environment variable**: `MAVIS_FILTER_MIN_REMAPPED_READS` - -**default**: `#!python 5` - -Minimum number of remapped reads for a call by contig - - -## filter_min_spanning_reads - -**type**: `#!python int` - -**environment variable**: `MAVIS_FILTER_MIN_SPANNING_READS` - -**default**: `#!python 5` - -Minimum number of spanning reads 
for a call by spanning reads - - -## filter_min_split_reads - -**type**: `#!python int` - -**environment variable**: `MAVIS_FILTER_MIN_SPLIT_READS` - -**default**: `#!python 5` - -Minimum number of split reads for a call by split reads - - -## filter_protein_synon - -**type**: `#!python cast_boolean` - -**environment variable**: `MAVIS_FILTER_PROTEIN_SYNON` - -**default**: `#!python False` - -Filter all annotations synonymous at the protein level - - -## filter_secondary_alignments - -**type**: `#!python cast_boolean` - -**environment variable**: `MAVIS_FILTER_SECONDARY_ALIGNMENTS` - -**default**: `#!python True` - -Filter secondary alignments when gathering read evidence - - -## filter_trans_homopolymers - -**type**: `#!python cast_boolean` - -**environment variable**: `MAVIS_FILTER_TRANS_HOMOPOLYMERS` - -**default**: `#!python True` - -Filter all single bp ins/del/dup events that are in a homopolymer region of at least 3 bps and are not paired to a genomic event - - -## flanking_call_distance - -**type**: `#!python int` - -**environment variable**: `MAVIS_FLANKING_CALL_DISTANCE` - -**default**: `#!python 50` - -The maximum distance allowed between breakpoint pairs (called by flanking pairs) in order for them to pair - - -## fuzzy_mismatch_number - -**type**: `#!python int` - -**environment variable**: `MAVIS_FUZZY_MISMATCH_NUMBER` - -**default**: `#!python 1` - -The number of events/mismatches allowed to be considered a fuzzy match - - -## gene1_color - -**type**: `#!python str` - -**environment variable**: `MAVIS_GENE1_COLOR` - -**default**: `#!python '#657e91'` - -The color of genes near the first gene - - -## gene1_color_selected - -**type**: `#!python str` - -**environment variable**: `MAVIS_GENE1_COLOR_SELECTED` - -**default**: `#!python '#518dc5'` - -The color of the first gene - - -## gene2_color - -**type**: `#!python str` - -**environment variable**: `MAVIS_GENE2_COLOR` - -**default**: `#!python '#325556'` - -The color of genes near the second gene - - 
-## gene2_color_selected - -**type**: `#!python str` - -**environment variable**: `MAVIS_GENE2_COLOR_SELECTED` - -**default**: `#!python '#4c9677'` - -The color of the second gene - - -## import_env - -**type**: `#!python cast_boolean` - -**environment variable**: `MAVIS_IMPORT_ENV` - -**default**: `#!python True` - -Flag to import environment variables - - -## input_call_distance - -**type**: `#!python int` - -**environment variable**: `MAVIS_INPUT_CALL_DISTANCE` - -**default**: `#!python 20` - -The maximum distance allowed between breakpoint pairs (called by input tools, not validated) in order for them to pair - - -## label_color - -**type**: `#!python str` - -**environment variable**: `MAVIS_LABEL_COLOR` - -**default**: `#!python '#000000'` - -The label color - - -## limit_to_chr - -**type**: `#!python str` - -**environment variable**: `MAVIS_LIMIT_TO_CHR` - -**default**: `#!python ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y']` - -A list of chromosome names to use. breakpointpairs on other chromosomes will be filteredout. for example '1 2 3 4' would filter out events/breakpoint pairs on any chromosomes but 1, 2, 3, and 4 - - -## mail_type - -**type**: `#!python mavis.schedule.constants.MAIL_TYPE` - -**environment variable**: `MAVIS_MAIL_TYPE` - -**default**: `#!python 'NONE'` - -**accepted values**: `'BEGIN'`, `'END'`, `'FAIL'`, `'ALL'`, `'NONE'` - - -When to notify the mail_user (if given) - - -## mail_user - -**type**: `#!python str` - -**environment variable**: `MAVIS_MAIL_USER` - -**default**: `#!python ''` - -User(s) to send notifications to - - -## mask_fill - -**type**: `#!python str` - -**environment variable**: `MAVIS_MASK_FILL` - -**default**: `#!python '#ffffff'` - -Color of mask (for deleted region etc.) 
- - -## mask_opacity - -**type**: `#!python float_fraction` - -**environment variable**: `MAVIS_MASK_OPACITY` - -**default**: `#!python 0.7` - -Opacity of the mask layer - - -## masking - -**type**: `#!python filepath` - -**environment variable**: `MAVIS_MASKING` - -**default**: `#!python []` - -File containing regions for which input events overlapping them are dropped prior to validation - - -## max_drawing_retries - -**type**: `#!python int` - -**environment variable**: `MAVIS_MAX_DRAWING_RETRIES` - -**default**: `#!python 5` - -The maximum number of retries for attempting a drawing. each iteration the width is extended. if it is still insufficient after this number a gene-level only drawing will be output - - -## max_files - -**type**: `#!python int` - -**environment variable**: `MAVIS_MAX_FILES` - -**default**: `#!python 200` - -The maximum number of files to output from clustering/splitting - - -## max_orf_cap - -**type**: `#!python int` - -**environment variable**: `MAVIS_MAX_ORF_CAP` - -**default**: `#!python 3` - -The maximum number of orfs to return (best putative orfs will be retained) - - -## max_proximity - -**type**: `#!python int` - -**environment variable**: `MAVIS_MAX_PROXIMITY` - -**default**: `#!python 5000` - -The maximum distance away from an annotation before the region in considered to be uninformative - - -## max_sc_preceeding_anchor - -**type**: `#!python int` - -**environment variable**: `MAVIS_MAX_SC_PRECEEDING_ANCHOR` - -**default**: `#!python 6` - -When remapping a softclipped read this determines the amount of softclipping allowed on the side opposite of where we expect it. for example for a softclipped read on a breakpoint with a left orientation this limits the amount of softclipping that is allowed on the right. 
if this is set to none then there is no limit on softclipping - - -## memory_limit - -**type**: `#!python int` - -**environment variable**: `MAVIS_MEMORY_LIMIT` - -**default**: `#!python 16000` - -The maximum number of megabytes (mb) any given job is allowed - - -## min_anchor_exact - -**type**: `#!python int` - -**environment variable**: `MAVIS_MIN_ANCHOR_EXACT` - -**default**: `#!python 6` - -Applies to re-aligning softclipped reads to the opposing breakpoint. the minimum number of consecutive exact matches to anchor a read to initiate targeted realignment - - -## min_anchor_fuzzy - -**type**: `#!python int` - -**environment variable**: `MAVIS_MIN_ANCHOR_FUZZY` - -**default**: `#!python 10` - -Applies to re-aligning softclipped reads to the opposing breakpoint. the minimum length of a fuzzy match to anchor a read to initiate targeted realignment - - -## min_anchor_match - -**type**: `#!python float_fraction` - -**environment variable**: `MAVIS_MIN_ANCHOR_MATCH` - -**default**: `#!python 0.9` - -Minimum percent match for a read to be kept as evidence - - -## min_call_complexity - -**type**: `#!python float_fraction` - -**environment variable**: `MAVIS_MIN_CALL_COMPLEXITY` - -**default**: `#!python 0.1` - -The minimum complexity score for a call sequence. is an average for non-contig calls. filters low complexity contigs before alignment. 
see [contig_complexity](#contig_complexity) - - -## min_clusters_per_file - -**type**: `#!python int` - -**environment variable**: `MAVIS_MIN_CLUSTERS_PER_FILE` - -**default**: `#!python 50` - -The minimum number of breakpoint pairs to output to a file - - -## min_domain_mapping_match - -**type**: `#!python float_fraction` - -**environment variable**: `MAVIS_MIN_DOMAIN_MAPPING_MATCH` - -**default**: `#!python 0.9` - -A number between 0 and 1 representing the minimum percent match a domain must map to the fusion transcript to be displayed - - -## min_double_aligned_to_estimate_insertion_size - -**type**: `#!python int` - -**environment variable**: `MAVIS_MIN_DOUBLE_ALIGNED_TO_ESTIMATE_INSERTION_SIZE` - -**default**: `#!python 2` - -The minimum number of reads which map soft-clipped to both breakpoints to assume the size of the untemplated sequence between the breakpoints is at most the read length - 2 * min_softclipping - - -## min_flanking_pairs_resolution - -**type**: `#!python int` - -**environment variable**: `MAVIS_MIN_FLANKING_PAIRS_RESOLUTION` - -**default**: `#!python 10` - -The minimum number of flanking reads required to call a breakpoint by flanking evidence - - -## min_linking_split_reads - -**type**: `#!python int` - -**environment variable**: `MAVIS_MIN_LINKING_SPLIT_READS` - -**default**: `#!python 2` - -The minimum number of split reads which aligned to both breakpoints - - -## min_mapping_quality - -**type**: `#!python int` - -**environment variable**: `MAVIS_MIN_MAPPING_QUALITY` - -**default**: `#!python 5` - -The minimum mapping quality of reads to be used as evidence - - -## min_non_target_aligned_split_reads - -**type**: `#!python int` - -**environment variable**: `MAVIS_MIN_NON_TARGET_ALIGNED_SPLIT_READS` - -**default**: `#!python 1` - -The minimum number of split reads aligned to a breakpoint by the input bam and no forced by local alignment to the target region to call a breakpoint by split read evidence - - -## min_orf_size - -**type**: 
`#!python int` - -**environment variable**: `MAVIS_MIN_ORF_SIZE` - -**default**: `#!python 300` - -The minimum length (in base pairs) to retain a putative open reading frame (orf) - - -## min_sample_size_to_apply_percentage - -**type**: `#!python int` - -**environment variable**: `MAVIS_MIN_SAMPLE_SIZE_TO_APPLY_PERCENTAGE` - -**default**: `#!python 10` - -Minimum number of aligned bases to compute a match percent. if there are less than this number of aligned bases (match or mismatch) the percent comparator is not used - - -## min_softclipping - -**type**: `#!python int` - -**environment variable**: `MAVIS_MIN_SOFTCLIPPING` - -**default**: `#!python 6` - -Minimum number of soft-clipped bases required for a read to be used as soft-clipped evidence - - -## min_spanning_reads_resolution - -**type**: `#!python int` - -**environment variable**: `MAVIS_MIN_SPANNING_READS_RESOLUTION` - -**default**: `#!python 5` - -Minimum number of spanning reads required to call an event by spanning evidence - - -## min_splits_reads_resolution - -**type**: `#!python int` - -**environment variable**: `MAVIS_MIN_SPLITS_READS_RESOLUTION` - -**default**: `#!python 3` - -Minimum number of split reads required to call a breakpoint by split reads - - -## novel_exon_color - -**type**: `#!python str` - -**environment variable**: `MAVIS_NOVEL_EXON_COLOR` - -**default**: `#!python '#5D3F6A'` - -Novel exon fill color - - -## outer_window_min_event_size - -**type**: `#!python int` - -**environment variable**: `MAVIS_OUTER_WINDOW_MIN_EVENT_SIZE` - -**default**: `#!python 125` - -The minimum size of an event in order for flanking read evidence to be collected - - -## queue - -**type**: `#!python str` - -**environment variable**: `MAVIS_QUEUE` - -**default**: `#!python ''` - -The queue jobs are to be submitted to - - -## reference_genome - -**type**: `#!python filepath` - -**environment variable**: `MAVIS_REFERENCE_GENOME` - -**default**: `#!python []` - -Path to the human reference genome fasta file - 
- -## remote_head_ssh - -**type**: `#!python str` - -**environment variable**: `MAVIS_REMOTE_HEAD_SSH` - -**default**: `#!python ''` - -Ssh target for remote scheduler commands - - -## scaffold_color - -**type**: `#!python str` - -**environment variable**: `MAVIS_SCAFFOLD_COLOR` - -**default**: `#!python '#000000'` - -The color used for the gene/transcripts scaffolds - - -## scheduler - -**type**: `#!python mavis.schedule.constants.SCHEDULER` - -**environment variable**: `MAVIS_SCHEDULER` - -**default**: `#!python 'SLURM'` - -**accepted values**: `'SGE'`, `'SLURM'`, `'TORQUE'`, `'LOCAL'` - - -The scheduler being used - - -## spanning_call_distance - -**type**: `#!python int` - -**environment variable**: `MAVIS_SPANNING_CALL_DISTANCE` - -**default**: `#!python 20` - -The maximum distance allowed between breakpoint pairs (called by spanning reads) in order for them to pair - - -## splice_color - -**type**: `#!python str` - -**environment variable**: `MAVIS_SPLICE_COLOR` - -**default**: `#!python '#000000'` - -Splicing lines color - - -## split_call_distance - -**type**: `#!python int` - -**environment variable**: `MAVIS_SPLIT_CALL_DISTANCE` - -**default**: `#!python 20` - -The maximum distance allowed between breakpoint pairs (called by split reads) in order for them to pair - - -## stdev_count_abnormal - -**type**: `#!python float` - -**environment variable**: `MAVIS_STDEV_COUNT_ABNORMAL` - -**default**: `#!python 3.0` - -The number of standard deviations away from the normal considered expected and therefore not qualifying as flanking reads - - -## strand_determining_read - -**type**: `#!python int` - -**environment variable**: `MAVIS_STRAND_DETERMINING_READ` - -**default**: `#!python 2` - -1 or 2. 
the read in the pair which determines if (assuming a stranded protocol) the first or second read in the pair matches the strand sequenced - - -## template_metadata - -**type**: `#!python filepath` - -**environment variable**: `MAVIS_TEMPLATE_METADATA` - -**default**: `#!python []` - -File containing the cytoband template information. used for illustrations only - - -## time_limit - -**type**: `#!python int` - -**environment variable**: `MAVIS_TIME_LIMIT` - -**default**: `#!python 57600` - -The time in seconds any given jobs is allowed - - -## trans_fetch_reads_limit - -**type**: `#!python int` - -**environment variable**: `MAVIS_TRANS_FETCH_READS_LIMIT` - -**default**: `#!python 12000` - -Related to [fetch_reads_limit](#fetch_reads_limit). overrides fetch_reads_limit for transcriptome libraries when set. if this has a value of none then fetch_reads_limit will be used for transcriptome libraries instead - - -## trans_min_mapping_quality - -**type**: `#!python int` - -**environment variable**: `MAVIS_TRANS_MIN_MAPPING_QUALITY` - -**default**: `#!python 0` - -Related to [min_mapping_quality](#min_mapping_quality). overrides the min_mapping_quality if the library is a transcriptome and this is set to any number not none. 
if this value is none, min_mapping_quality is used for transcriptomes aswell as genomes - - -## trans_validation_memory - -**type**: `#!python int` - -**environment variable**: `MAVIS_TRANS_VALIDATION_MEMORY` - -**default**: `#!python 18000` - -Default memory limit (mb) for the validation stage (for transcriptomes) - - -## uninformative_filter - -**type**: `#!python cast_boolean` - -**environment variable**: `MAVIS_UNINFORMATIVE_FILTER` - -**default**: `#!python False` - -Flag that determines if breakpoint pairs which are not within max_proximity to any annotations are filtered out prior to clustering - - -## validation_memory - -**type**: `#!python int` - -**environment variable**: `MAVIS_VALIDATION_MEMORY` - -**default**: `#!python 16000` - -Default memory limit (mb) for the validation stage - - -## width - -**type**: `#!python int` - -**environment variable**: `MAVIS_WIDTH` - -**default**: `#!python 1000` - -The drawing width in pixels - - -## write_evidence_files - -**type**: `#!python cast_boolean` - -**environment variable**: `MAVIS_WRITE_EVIDENCE_FILES` - -**default**: `#!python True` - -Write the intermediate bam and bed files containing the raw evidence collected and contigs aligned. 
not required for subsequent steps but can be useful in debugging and deep investigation of events - - diff --git a/docs/hooks.py b/docs/hooks.py index 3727c646..7684ddc8 100644 --- a/docs/hooks.py +++ b/docs/hooks.py @@ -1,73 +1,140 @@ +import json import os -import re +from textwrap import dedent +import pkg_resources from markdown_refdocs.main import extract_to_markdown -from mavis.annotate.constants import DEFAULTS as ANNOTATION_DEFAULTS -from mavis.cluster.constants import DEFAULTS as CLUSTER_DEFAULTS -from mavis.config import REFERENCE_DEFAULTS -from mavis.illustrate.constants import DEFAULTS as ILLUSTRATION_DEFAULTS -from mavis.pairing.constants import DEFAULTS as PAIRING_DEFAULTS -from mavis.schedule.constants import OPTIONS as SUBMIT_OPTIONS -from mavis.summary.constants import DEFAULTS as SUMMARY_DEFAULTS +from mavis_config import DEFAULTS from mavis.util import ENV_VAR_PREFIX -from mavis.validate.constants import DEFAULTS as VALIDATION_DEFAULTS -def generate_settings_doc(): +def json_to_pytype(record): + input_type = record + try: + input_type = record['type'] + except TypeError: + pass + types = { + 'string': 'str', + 'integer': 'int', + 'float': 'float', + 'boolean': 'bool', + 'number': 'float', + } + + if input_type == 'array': + try: + sub_type = json_to_pytype(record['items']['type']) + return f'List[{sub_type}]' + except TypeError: + return 'List' + + if isinstance(input_type, list): + # Union + types = ', '.join([json_to_pytype(t) for t in input_type]) + return f'Union[{types}]' + return types.get(input_type, input_type) + + +def list_properties(schema, skip_terms=tuple()): + glossary = {} + for term, defn in schema['properties'].items(): + if term in skip_terms: + continue + typ = json_to_pytype(defn) + desc = defn.get('description', '') + default_value = defn.get('default') + schema_fields = {k: v for k, v in defn.items() if k not in ['description', 'default']} + + if len(schema_fields) > 1: + schema_defn = json.dumps( + schema_fields, + 
sort_keys=True, + indent=' ', + ) + schema_defn = f'**schema definition**:\n```json\n{schema_defn}\n```\n' + else: + schema_defn = '' + + lines = [ + f'### {term}', + f'**type**: `#!python {typ}`', + f'**default**: `#!python {repr(default_value)}`' if default_value is not None else '', + desc, + schema_defn, + ] + glossary[term] = '\n\n'.join(lines) + return [v for k, v in sorted(glossary.items())] + + +def generate_settings_doc(schema_file): + with open(schema_file, 'r') as fh: + schema = json.load(fh) dirname = os.path.dirname(os.path.abspath(__file__)) + filepath = 'configuration/settings.md' + title = 'Configurable Settings' + + fname = os.path.join(dirname, filepath) + + result = [f'\n\n# {title}\n'] + result.append( + dedent( + '''\ + ## Defining Samples/Libraries - for (filepath, title, namespaces) in [ - ( - 'configuration/settings.md', - 'Configurable Settings', - [ - SUBMIT_OPTIONS, - REFERENCE_DEFAULTS, - SUMMARY_DEFAULTS, - PAIRING_DEFAULTS, - ANNOTATION_DEFAULTS, - VALIDATION_DEFAULTS, - CLUSTER_DEFAULTS, - ILLUSTRATION_DEFAULTS, - ], - ), - ]: - fname = os.path.join(dirname, filepath) - print('writing:', fname) - with open(fname, 'w') as fh: - fh.write(f'\n\n# {title}\n') - glossary = {} - for namespace in namespaces: - for term, value in namespace.items(): - typ = namespace.type(term).__name__ - # typ = CUSTOM_TYPES.get(typ, typ) - desc = re.sub(r"\.?$", "", namespace.define(term, "")).capitalize() - accepted = '' - try: - accepted = '\n\n**accepted values**: {}\n'.format( - ', '.join(['`{}`'.format(repr(v)) for v in namespace.type(term).values()]) - ) - except AttributeError: - pass - defn = f'''## {term} - -**type**: `#!python {typ}` - -**environment variable**: `{ENV_VAR_PREFIX}{term.upper()}` - -**default**: `#!python {repr(value)}`{accepted} - -{desc} - ''' - glossary[term] = defn - for term, defn in sorted(glossary.items()): - fh.write(f'{defn}\n\n') + The `libraries` property of the mavis config is required to run the snakemake + workflow. 
This is the section that defines what inputs to use, and what types of + samples are available. + + ```json + { + "libraries": { + "": { } // mapping of library name to library settings + } + } + ``` + + The library specific settings are listed below + ''' + ) + ) + result.extend(list_properties(schema['properties']['libraries']['additionalProperties'])) + result.append( + dedent( + '''\ + ## Defining Conversions + + If the input to MAVIS is raw tool output and has not been pre-converted to the + standard tab delimited format expected by MAVIS then you will need to add + a section to the config to tell mavis how to perform the required conversions + + ```json + { + "convert": { + "": { } // mapping of alias to conversion settings + } + } + ``` + + The conversion specific settings are listed below + ''' + ) + ) + result.extend(list_properties(schema['properties']['convert']['additionalProperties'])) + result.append('\n## General Settings\n') + result.extend(list_properties(schema, ('libraries', 'convert'))) + + print('writing:', fname) + with open(fname, 'w') as fh: + fh.write('\n\n'.join(result) + '\n') def build_package_docs(config): - generate_settings_doc() - package_dir = os.path.join(os.path.dirname(__file__), '../mavis') + schema_file = pkg_resources.resource_filename('mavis_config', 'config.json') + generate_settings_doc(schema_file) + package_dir = os.path.join(os.path.dirname(__file__), '../src/mavis') output_dir = os.path.join(os.path.dirname(__file__), 'package') + extract_to_markdown( [package_dir], output_dir, @@ -75,5 +142,5 @@ def build_package_docs(config): hide_private=True, hide_undoc=True, hide_undoc_args=True, - namespace_headers=True, + namespace_headers=False, ) diff --git a/docs/images/snakemake.cluster.full-tutorial.png b/docs/images/snakemake.cluster.full-tutorial.png new file mode 100644 index 00000000..cae31051 Binary files /dev/null and b/docs/images/snakemake.cluster.full-tutorial.png differ diff --git 
a/docs/images/snakemake.cluster.mini-tutorial.png b/docs/images/snakemake.cluster.mini-tutorial.png new file mode 100644 index 00000000..98565be6 Binary files /dev/null and b/docs/images/snakemake.cluster.mini-tutorial.png differ diff --git a/docs/images/snakemake.validate.mini-tutorial.png b/docs/images/snakemake.validate.mini-tutorial.png new file mode 100644 index 00000000..3badbe42 Binary files /dev/null and b/docs/images/snakemake.validate.mini-tutorial.png differ diff --git a/docs/inputs/reference.md b/docs/inputs/reference.md index cc31d56e..8c7f0f36 100644 --- a/docs/inputs/reference.md +++ b/docs/inputs/reference.md @@ -15,13 +15,12 @@ not available, | File Name (Type/Format) | Environment Variable | Download | | --------------------------------------------------------------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | [reference genome](../../inputs/reference/#reference-genome) ([fasta](../../glossary/#fasta)) | `MAVIS_REFERENCE_GENOME` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz)
[![](../images/get_app-24px.svg) GRCh38](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.tar.gz) | -| [annotations](../../inputs/reference/#annotations) ([JSON](../../glossary/#json)) | `MAVIS_ANNOTATIONS` | [![](../images/get_app-24px.svg) GRCh37/Hg19 + Ensembl69](http://www.bcgsc.ca/downloads/mavis/ensembl69_hg19_annotations.json)
[![](../images/get_app-24px.svg) GRCh38 + Ensembl79](http://www.bcgsc.ca/downloads/mavis/ensembl79_hg38_annotations.json) | +| [annotations](../../inputs/reference/#annotations) ([JSON](../../glossary/#json)) | `MAVIS_ANNOTATIONS` | [![](../images/get_app-24px.svg) GRCh37/Hg19 + Ensembl69](http://www.bcgsc.ca/downloads/mavis/v3/ensembl69_hg19_annotations.v3.json.gz)
[![](../images/get_app-24px.svg) GRCh38 + Ensembl79](http://www.bcgsc.ca/downloads/mavis/v3/ensembl79_hg38_annotations.v3.json.gz) | | [masking](../../inputs/reference/#masking-file) (text/tabbed) | `MAVIS_MASKING` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://www.bcgsc.ca/downloads/mavis/hg19_masking.tab)
[![](../images/get_app-24px.svg) GRCh38](http://www.bcgsc.ca/downloads/mavis/GRCh38_masking.tab) | | [template metadata](../../inputs/reference/#template-metadata) (text/tabbed) | `MAVIS_TEMPLATE_METADATA` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz)
[![](../images/get_app-24px.svg) GRCh38](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz) | | [DGV annotations](../../inputs/reference/#dgv-database-of-genomic-variants) (text/tabbed) | `MAVIS_DGV_ANNOTATION` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://www.bcgsc.ca/downloads/mavis/dgv_hg19_variants.tab)
[![](../images/get_app-24px.svg) GRCh38](http://www.bcgsc.ca/downloads/mavis/dgv_hg38_variants.tab) | | [aligner reference](../../inputs/reference/#aligner-reference) | `MAVIS_ALIGNER_REFERENCE` | [![](../images/get_app-24px.svg) GRCh37/Hg19 2bit (blat)](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit)
[![](../images/get_app-24px.svg) GRCh38 2bit (blat)](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit) | - If the environment variables above are set they will be used as the default values when any step of the pipeline script is called (including generating the template config file) @@ -38,11 +37,13 @@ chromosomes. This is only used during visualization. The structure of the file should look something like this - chr1 0 2300000 p36.33 gneg - chr1 2300000 5400000 p36.32 gpos25 - chr1 5400000 7200000 p36.31 gneg - chr1 7200000 9200000 p36.23 gpos25 - chr1 9200000 12700000 p36.22 gneg +```text +chr1 0 2300000 p36.33 gneg +chr1 2300000 5400000 p36.32 gpos25 +chr1 5400000 7200000 p36.31 gneg +chr1 7200000 9200000 p36.23 gpos25 +chr1 9200000 12700000 p36.22 gneg +``` ## Masking File @@ -52,9 +53,11 @@ known false positives, bad mapping, centromeres, telomeres etc. An example of the expected format is shown below. The file should have four columns: chr, start, end and name. - #chr start end name - chr1 0 2300000 centromere - chr1 9200000 12700000 telomere +```text +chr start end name +chr1 0 2300000 centromere +chr1 9200000 12700000 telomere +``` The pre-built masking files in the downloads table above are telomere regions, centromere regions (based on the cytoband file), and nspan @@ -79,8 +82,7 @@ the ensembl annotations file including non-coding transcripts below. annotations file. On our standard COLO829 we increased the default memory for the annotation step from 12G to 18G. -[![](../images/get_app-24px.svg) GRCh37/Hg19 + Ensembl69 (includes non-coding genes)](http://www.bcgsc.ca/downloads/mavis/ensembl69_hg19_annotations_with_ncrna.json) - +[![](../images/get_app-24px.svg) GRCh37/Hg19 + Ensembl69 (includes non-coding genes)](http://www.bcgsc.ca/downloads/mavis/v3/ensembl69_hg19_annotations_with_ncrna.v3.json.gz) !!! 
warning the `mavis.annotate.file_io.load_reference_genes`{.interpreted-text @@ -98,7 +100,7 @@ be seen below { "name": string, "start": int, - "end": int + "end": int, "aliases": [string, string, ...], "transcripts": [ { @@ -161,6 +163,26 @@ python tools/generate_ensembl_json.py -s human -r 75 -o ensembl_human_v75.json This will produce the JSON file required as input by MAVIS +### Conversion from Other Standard Formats + +If you have a GTF or GFF3 file you can convert them to match the MAVIS json format with the helper script provided in the tools folder + +```bash +python src/tools/convert_annotations_format.py \ + /path/to/gtf/file \ + --input_type gtf \ + output_mavis_annotations.json +``` + +or similarly for the GFF3 format + +```bash +python src/tools/convert_annotations_format.py \ + /path/to/gff3/file \ + --input_type gff3 \ + output_mavis_annotations.json +``` + ## DGV (Database of Genomic Variants) @@ -180,10 +202,12 @@ awk '{print $2"\t"$3"\t"$4"\t"$1} GRCh37_hg19_variants_2016-05-15.txt > dgv_hg19 Note in hg19 the column is called "name" and in hg38 the column is called "variantaccession". An example is shown below - #chr start end name - 1 1 2300000 nsv482937 - 1 10001 22118 dgv1n82 - 1 10001 127330 nsv7879 +```text +chr start end name +1 1 2300000 nsv482937 +1 10001 22118 dgv1n82 +1 10001 127330 nsv7879 +``` ## Aligner Reference diff --git a/docs/inputs/standard.md b/docs/inputs/standard.md index cd3d7e92..373a7cb1 100644 --- a/docs/inputs/standard.md +++ b/docs/inputs/standard.md @@ -1,40 +1,41 @@ # MAVIS standard input file format - These requirements pertain to the columns of input files from the various tools you want to merge. The input files should be tab-delimited text files. Comments at the top of may be included. Comments should -begin with two hash marks. They will be ignored when the file is read - +begin with hash marks. 
They will be ignored when the file is read - ## This is a comment +```text +## This is a comment +``` The header row contains the column names and is the first row following -the comments (or the first row if no comments are included). Optionally -the header row may (or may not) begin with a hash which will be stripped -out on read +the comments (or the first row if no comments are included). - ## This is a comment - ## this is another comment - # this is the header row +```text +## This is a comment +## this is another comment +# this is also a comment +This Is The Header +``` A simple input file might look as follows - ## File created at: 2018-01-02 - ## Generated by: MAVIS v1.0.0 - #break1_chromosome break1_position_start break1_position_end break2_chromosome break2_position_start break2_position_end - X 1234 1234 X 77965 77965 +```text +## File created at: 2018-01-02 +## Generated by: MAVIS v1.0.0 +break1_chromosome break1_position_start break1_position_end break2_chromosome break2_position_start break2_position_end +X 1234 1234 X 77965 77965 +``` ## Required Columns -- [break1_chromosome](../../outputs/columns/#break1_chromosome) -- [break1_position_start](../../outputs/columns/#break1_position_start) -- [break1_position_end](../../outputs/columns/#break1_position_end) (can be the - same as break1\_position\_start) -- [break2_chromosome](../../outputs/columns/#break2_chromosome) -- [break2_position_start](../../outputs/columns/#break2_position_start) -- [break2_position_end](../../outputs/columns/#break2_position_end) (can be the - same as break2\_position\_start) +- [break1_chromosome](../../outputs/columns/#break1_chromosome) +- [break1_position_start](../../outputs/columns/#break1_position_start) +- [break1_position_end](../../outputs/columns/#break1_position_end) (can be the same as break1\_position\_start) +- [break2_chromosome](../../outputs/columns/#break2_chromosome) +- [break2_position_start](../../outputs/columns/#break2_position_start) +- 
[break2_position_end](../../outputs/columns/#break2_position_end) (can be the same as break2\_position\_start) ## Optional Columns @@ -42,24 +43,15 @@ Optional Columns that are not given as input will be added with default (or command line parameter options) during the clustering stage of MAVIS as some are required for subsequent pipeline steps -- [break1_strand](../../outputs/columns/#break1_strand) (defaults to - not-specified during clustering) -- [break1_orientation](../../outputs/columns/#break1_orientation) (expanded to all - possible values during clustering) -- [break2_strand](../../outputs/columns/#break2_strand) (defaults to - not-specified during clustering) -- [break2_orientation](../../outputs/columns/#break2_orientation) (expanded to all - possible values during clustering) -- [opposing_strands](../../outputs/columns/#opposing_strands) (expanded to all - possible values during clustering) -- [stranded](../../outputs/columns/#stranded) (defaults to False during - clustering) -- [library](../../outputs/columns/#library) (defaults to command line - library parameter during clustering) -- [protocol](../../outputs/columns/#protocol) (defaults to command line - protocol parameter during clustering) -- [tools](../../outputs/columns/#tools) (defaults to an empty string - during clustering) +- [break1_strand](../../outputs/columns/#break1_strand) (defaults to not-specified during clustering) +- [break1_orientation](../../outputs/columns/#break1_orientation) (expanded to all possible values during clustering) +- [break2_strand](../../outputs/columns/#break2_strand) (defaults to not-specified during clustering) +- [break2_orientation](../../outputs/columns/#break2_orientation) (expanded to all possible values during clustering) +- [opposing_strands](../../outputs/columns/#opposing_strands) (expanded to all possible values during clustering) +- [stranded](../../outputs/columns/#stranded) (defaults to False during clustering) +- 
[library](../../outputs/columns/#library) (defaults to command line library parameter during clustering) +- [protocol](../../outputs/columns/#protocol) (defaults to command line protocol parameter during clustering) +- [tools](../../outputs/columns/#tools) (defaults to an empty string during clustering) ## Summary by Pipeline Step diff --git a/docs/inputs/support.md b/docs/inputs/support.md index 89c6c482..af3a5e90 100644 --- a/docs/inputs/support.md +++ b/docs/inputs/support.md @@ -11,28 +11,7 @@ to be compatible with MAVIS. ## Job Schedulers -MAVIS can be run locally without a job scheduler -(`MAVIS_SCHEDULER=LOCAL`) however, due to the computational resources -generally required, it is recommended that you use one of the supported -schedulers listed below. - -| Name | Version(s) | Environment Setting | -| -------------------------------- | ----------- | ------------------------ | -| [TORQUE](../../glossary/#torque) | `6.1.2` | `MAVIS_SCHEDULER=TORQUE` | -| [SGE](../../glossary/#sge) | `8.1.8` | `MAVIS_SCHEDULER=SGE` | -| [SLURM](../../glossary/#slurm) | `17.02.1-2` | `MAVIS_SCHEDULER=SLURM` | - -Users requiring support for other schedulers may make a request by -[submitting an issue to our github -page](https://github.com/bcgsc/mavis/issues). Additionally, developers -looking to extend the functionality may submit a pull request (Please -see the -[guidelines for contributors](../../development/) - -MAVIS running locally uses the python -`concurrent.futures` library to manage -jobs. -## +MAVIS v3 uses [snakemake](https://snakemake.readthedocs.io/en/stable/) to handle job scheduling ## Aligners diff --git a/docs/install.md b/docs/install.md new file mode 100644 index 00000000..b9343c48 --- /dev/null +++ b/docs/install.md @@ -0,0 +1,142 @@ +# Install Instructions + +Once the install steps are complete [MAVIS](http://mavis.bcgsc.ca) is ready to be run. +See the MAVIS [tutorial](https://mavis.readthedocs.io/en/latest/tutorials/mini) to learn about running MAVIS. 
+ +For either install option you will want to install the main Snakefile. It is best to use a tag to +specify the version of interest but you can download the latest version from the master branch + +```bash +wget https://raw.githubusercontent.com/bcgsc/mavis/master/Snakefile -O Snakefile +``` + +## Install for Docker/Singularity + +The simplest way to use MAVIS is via Singularity. The MAVIS docker container used +by singularity will take care of installing the aligner as well. + +```bash +pip install -U setuptools pip wheel +pip install mavis_config # also installs snakemake +``` + +Now you will run mavis via Snakemake as follows + +```bash +snakemake \ + -j \ + --configfile \ + --use-singularity \ + -s Snakefile +``` + +## Install (Python Only) + +MAVIS can also be run with just python. However you will need to install the aligner(s) required +by MAVIS separately and ensure they are available on the default PATH variable when MAVIS is run + +### 1. Install Aligner + +In addition to the python package dependencies, [MAVIS](http://mavis.bcgsc.ca) also requires an aligner to be installed. +Currently the only aligners supported are [blat](https://mavis.readthedocs.io/en/latest/glossary/#blat) and [bwa mem](https://mavis.readthedocs.io/en/latest/glossary/#bwa). +For MAVIS to run successfully the aligner must be installed and accessible on the path. +If you have a non-standard install you may find it useful to edit the PATH environment variable. For example + +``` bash +export PATH=/path/to/directory/containing/blat/binary:$PATH +``` + +[blat](http://mavis.bcgsc.ca/docs/latest/glossary.html#term-blat) is the default aligner. To configure MAVIS to use [bwa mem](http://mavis.bcgsc.ca/docs/latest/glossary.html#term-bwa) it must be specified +in the [config](https://mavis.readthedocs.io/en/latest/configuration/settings/) JSON file. + +After this has been installed MAVIS itself can be installed through [pip](https://pypi.org/project/mavis/) + +### 2. 
Install MAVIS + +#### Install using pip + +The easiest way to install [MAVIS](http://mavis.bcgsc.ca) is through the python package manager, pip. If you do not have python3 installed it can be found [here](https://www.python.org/downloads) + +Ensuring you have a recent version of pip and setuptools will improve the install experience. Older versions of pip and setuptools may have issues with obtaining some of the mavis python dependencies + +``` bash +pip install --upgrade pip setuptools +``` + +or (for Anaconda users) + +``` bash +conda update pip setuptools +``` + +If this is not a clean/new python install it may be useful to set up mavis in a [virtual python environment](https://docs.python.org/3/tutorial/venv.html) + +Then install mavis itself + +``` bash +pip install mavis +``` + +This will install mavis and its python dependencies. + +#### Install using Buildout + +Alternatively you can use the [bootstrap/buildout](http://www.buildout.org/en/latest/) to install mavis into bin/mavis + +``` bash +git clone https://github.com/bcgsc/mavis.git +cd mavis +pip install zc.buildout +python bootstrap.py +bin/buildout +``` + +This will install mavis and its python dependencies into eggs inside the cloned mavis directory which can be used by simply running bin/mavis + +Finally you will need to Build/Download the necessary reference files + +## Build or Download Reference Files + +After [MAVIS](http://mavis.bcgsc.ca) is installed the [reference files](https://mavis.readthedocs.io/en/latest/inputs/reference) must be generated (or downloaded) before it can be run. A simple bash script to download the hg19 reference files is provided under mavis/tools for convenience. + +### Download Hg19 Files + +``` bash +cd /path/to/where/you/want/to/put/the/files +wget https://raw.githubusercontent.com/bcgsc/mavis/master/src/tools/get_hg19_reference_files.sh +bash get_hg19_reference_files.sh +``` + +You should now see the reference files in the current directory + +```text +. 
+|-- cytoBand.txt +|-- dgv_hg19_variants.tab +|-- ensembl69_hg19_annotations.json +|-- get_hg19_reference_files.sh +|-- hg19.2bit +|-- hg19.fa +`-- hg19_masking.tab +``` + +### Download Hg38 Files + +``` bash +cd /path/to/where/you/want/to/put/the/files +wget https://raw.githubusercontent.com/bcgsc/mavis/master/src/tools/get_hg38_reference_files.sh +bash get_hg38_reference_files.sh +``` + +You should now see the reference files in the current directory + +```text +. +|-- cytoBand.txt +|-- dgv_hg38_variants.tab +|-- ensembl79_hg38_annotations.json +|-- get_hg38_reference_files.sh +|-- GCA_000001405.15_GRCh38_no_alt_analysis_set.fna +|-- GRCh38_masking.tab +`-- hg38.2bit +``` diff --git a/docs/migrating.md b/docs/migrating.md new file mode 100644 index 00000000..29231381 --- /dev/null +++ b/docs/migrating.md @@ -0,0 +1,41 @@ +# Migrating + +## Migrating from v2 to v3 + +There are major changes from v2 to v3 of MAVIS. + +### Tab File Headers + +Tab file headers no longer start with `#`. Any lines starting with a pound will be treated +as comments. This will apply to mavis-style inputs as well as any tab delimited +reference files + +### Configuration + +MAVIS no longer uses command line arguments, config files, and environment variables for +configuration. Instead all configurable settings are controlled via a single input JSON +config file + +### Scheduling + +MAVIS is now integrated with snakemake instead of handling its own scheduling + +## Reference Annotation Files + +MAVIS no longer supports the previously deprecated tab-delimited format of the annotations file. If you are still using these files in your project we have provided a script to automatically convert them to the newer format in the tools directory. + +```bash +python src/tools/convert_annotations_format.py \ + /path/to/tab/file.tab \ + --input_type v2-tab \ + /path/to/new/json/file.json +``` + +In v3 the JSON files are slightly different to support multiple translations per transcript. 
Your old v2 files can be automatically converted to the new format with the same script + +```bash +python src/tools/convert_annotations_format.py \ + /path/to/json/file.json \ + --input_type v2-json \ + /path/to/new/json/file.json +``` diff --git a/docs/outputs/columns.md b/docs/outputs/columns.md index f2c8ba19..14cdcdd2 100644 --- a/docs/outputs/columns.md +++ b/docs/outputs/columns.md @@ -3,7 +3,6 @@ List of column names and their definitions. The types indicated here are the expected types in a row for a given column name. - ## library Identifier for the library/source @@ -34,7 +33,7 @@ decision from the annotation step ## event\_type -**type**: `mavis.constants.SVTYPE` +**type**: [`mavis.constants.SVTYPE`](../package/mavis/constants/#class-mavisconstantssvtype) The classification of the event @@ -57,7 +56,7 @@ Gene for the current annotation at the first breakpoint ## gene1\_direction -**type**: `mavis.constants.PRIME` +**type**: [`mavis.constants.PRIME`](../package/mavis/constants/#class-mavisconstantsprime) The direction/prime of the gene @@ -68,7 +67,7 @@ Gene for the current annotation at the second breakpoint ## gene2\_direction -**type**: `mavis.constants.PRIME` +**type**: [`mavis.constants.PRIME`](../package/mavis/constants/#class-mavisconstantsprime) The direction/prime of the gene. 
Has the following possible values @@ -85,16 +84,11 @@ second breakpoint ## gene\_product\_type -**type**: `mavis.constants.GENE_PRODUCT_TYPE` +**type**: [`mavis.constants.GENE_PRODUCT_TYPE`](../package/mavis/constants/#class-mavisconstantsgene_product_type) Describes if the putative fusion product will be sense or anti-sense -## fusion\_cdna\_coding\_end - -Position wrt the 5' end of the fusion transcript where coding ends -last base of the stop codon - ## transcript1 Transcript for the current annotation at the first breakpoint @@ -105,7 +99,8 @@ Transcript for the current annotation at the second breakpoint ## fusion\_splicing\_pattern -`mavis.constants.SPLICE_TYPE` - +**type**: [`mavis.constants.SPLICE_TYPE`](../package/mavis/constants/#class-mavisconstantssplice_type) + Type of splicing pattern used to create the fusion cDNA. ## fusion\_cdna\_coding\_start @@ -205,14 +200,14 @@ End integer inclusive ## break1\_orientation -**type**: `mavis.constants.ORIENT` +**type**: [`mavis.constants.ORIENT`](../package/mavis/constants/#class-mavisconstantsorient) The side of the breakpoint wrt the positive/forward strand that is retained. ## break1\_strand -**type**: `mavis.constants.STRAND` +**type**: [`mavis.constants.STRAND`](../package/mavis/constants/#class-mavisconstantsstrand) The strand wrt to the reference positive/forward strand at this @@ -246,14 +241,14 @@ End integer inclusive ## break2\_orientation -**type**: `mavis.constants.ORIENT` +**type**: [`mavis.constants.ORIENT`](../package/mavis/constants/#class-mavisconstantsorient) The side of the breakpoint wrt the positive/forward strand that is retained. ## break2\_strand -**type**: `mavis.constants.STRAND` +**type**: [`mavis.constants.STRAND`](../package/mavis/constants/#class-mavisconstantsstrand) The strand wrt to the reference positive/forward strand at this @@ -283,7 +278,8 @@ protocol was strand specific or not.
Expects a boolean ## protocol -`mavis.constants.PROTOCOL` - +**type**: [`mavis.constants.PROTOCOL`](../package/mavis/constants/#class-mavisconstantsprotocol) + Specifies the type of library ## tools @@ -404,7 +400,7 @@ event ## call\_method -**type**: `mavis.constants.CALL_METHOD` +**type**: [`mavis.constants.CALL_METHOD`](../package/mavis/constants/#class-mavisconstantscall_method) The method used to call the breakpoints diff --git a/docs/outputs/illustrations.md b/docs/outputs/illustrations.md index f419db26..da7fb6a3 100644 --- a/docs/outputs/illustrations.md +++ b/docs/outputs/illustrations.md @@ -5,33 +5,29 @@ These are diagrams produced during the annotate step. These represent the putative fusion events of a single breakpoint pair. -![](../images/GIMAP4_IL7_fusion.svg) +![fusion diagram](../images/GIMAP4_IL7_fusion.svg) Fusion from transcriptome data. Intronic breakpoints here indicate retained intron sequence and a novel exon is predicted. - If the [draw_fusions_only](../../configuration/settings/#draw_fusions_only flag is set to False then all events will produce a diagram, even anti-sense fusions -![](../images/UBE2V2_GIMAP4_disruptive_fusion.svg) +![disruptive fusion diagram](../images/UBE2V2_GIMAP4_disruptive_fusion.svg) Disruptive Anti-sense Fusion - ## Transcript Overlays MAVIS supports generating diagrams of all transcripts for a given gene. These can be overlaid with markers and bam\_file pileup data. This is particularly useful for visualizing splice site mutations. -![](../images/ENSG00000139687_RB1_overlay.png) - -RB1 splice site mutation results in skipping of exon -9 +![overlay diagram](../images/ENSG00000139687_RB1_overlay.png) +RB1 splice site mutation results in skipping of exon 9 The above diagram was generated using the overlay command diff --git a/docs/tutorials/.pages b/docs/tutorials/.pages new file mode 100644 index 00000000..b9f03d9d --- /dev/null +++ b/docs/tutorials/.pages @@ -0,0 +1,4 @@ +nav: + - mini.md + - full.md + - ... 
diff --git a/docs/tutorials/annotation.md b/docs/tutorials/annotation.md new file mode 100644 index 00000000..adfaf17d --- /dev/null +++ b/docs/tutorials/annotation.md @@ -0,0 +1,98 @@ +# Annotation Only + +Sometimes you have a set of variants and would simply like to run the annotate step of MAVIS to visualize and annotate them. + +First you need to create your basic config to tell MAVIS where the reference files you want to use are and some minimal information about the library/sample you want to process. + +Here is an example config where the user has created a minimal input file in the MAVIS standard input file format. We convert it to expand any unknowns (ex. SV type if left blank) + +```json +{ + "libraries": { + "my_library": { + "assign": ["my_converted_file"], + "disease_status": "normal", + "protocol": "genome" + } + }, + "convert": { + "my_converted_file": { + "inputs": ["/path/to/file/structural_variants.txt"], + "file_type": "mavis" + } + }, + "cluster.split_only": true, + "skip_stage.validate": true, + "output_dir": "my_output_dir", + "reference.annotations": "/path/to/mavis/reference_files/ensembl79_hg38_annotations.json", + "reference.template_metadata": "/path/to/mavis/reference_files/hg38_cytoBand.txt", + "reference.reference_genome": "/path/to/hg38_no_alt/genome/hg38_no_alt.fa", + "reference.masking": "/path/to/mavis/reference_files/masking_hg38.adjusted.tab", + "reference.dgv_annotation": "/path/to/mavis/reference_files/dgv_hg38_annotations.tab" +} +``` + +Another example is given in the MAVIS tests folder under `tests/mini-tutorial.annotate_only.config.json` which looks like this + +```json +{ + "annotate.draw_fusions_only": false, + "convert": { + "mock_converted": { + "inputs": [ + "tests/data/mock_sv_events.tsv" + ], + "file_type": "mavis", + "assume_no_untemplated": true + } + }, + "skip_stage.validate": true, + "cluster.uninformative_filter": true, + "cluster.limit_to_chr": null, + "cluster.min_clusters_per_file": 5, + "libraries": { + 
"mock-A47933": { + "assign": [ + "tests/data/mock_trans_sv_events.tsv" + ], + "bam_file": "tests/data/mock_trans_reads_for_events.sorted.bam", + "disease_status": "diseased", + "protocol": "transcriptome", + "strand_specific": true + }, + "mock-A36971": { + "assign": [ + "mock_converted" + ], + "bam_file": "tests/data/mock_reads_for_events.sorted.bam", + "disease_status": "diseased", + "protocol": "genome", + "strand_specific": false + } + }, + "output_dir": "output_dir", + "reference.annotations": [ + "tests/data/mock_annotations.json" + ], + "reference.dgv_annotation": [ + "tests/data/mock_dgv_annotation.txt" + ], + "reference.masking": [ + "tests/data/mock_masking.tab" + ], + "reference.reference_genome": [ + "tests/data/mock_reference_genome.fa" + ], + "reference.template_metadata": [ + "tests/data/cytoBand.txt" + ] +} +``` + +Either of these configurations can be run with the following command simply by changing the configfile argument + +```bash +snakemake -j 1 \ + --configfile tests/mini-tutorial.annotate_only.config.json \ + -s Snakefile +``` diff --git a/docs/tutorials/full.md b/docs/tutorials/full.md index b42ae624..20054def 100644 --- a/docs/tutorials/full.md +++ b/docs/tutorials/full.md @@ -18,16 +18,16 @@ tar -xvzf tutorial_data.tar.gz The expected contents are -| Path | Description | -| ---------------------------------- | ------------------------------------------------------------------------------------------------------------------- | -| README | Information regarding the other files in the directory | -| L1522785992\_expected\_events.tab | The events that we expect to find, either experimentally validated or 'spiked' in | -| L1522785992\_normal.sorted.bam | Paired normal library BAM file | -| L1522785992\_normal.sorted.bam.bai | BAM index | -| L1522785992\_trans.sorted.bam | Tumour transcriptome BAM file | -| L1522785992\_trans.sorted.bam.bai | BAM index file | -| L1522785992\_tumour.sorted.bam | Tumour genome BAM file | -| 
L1522785992\_tumour.sorted.bam.bai | BAM index file | +| Path | Description | +| ---------------------------------- | ------------------------------------------------------------------------------------------------------------------------ | +| README | Information regarding the other files in the directory | +| L1522785992\_expected\_events.tab | The events that we expect to find, either experimentally validated or 'spiked' in | +| L1522785992\_normal.sorted.bam | Paired normal library BAM file | +| L1522785992\_normal.sorted.bam.bai | BAM index | +| L1522785992\_trans.sorted.bam | Tumour transcriptome BAM file | +| L1522785992\_trans.sorted.bam.bai | BAM index file | +| L1522785992\_tumour.sorted.bam | Tumour genome BAM file | +| L1522785992\_tumour.sorted.bam.bai | BAM index file | | breakdancer-1.4.5/ | Contains the [BreakDancer](../../glossary/#breakdancer) output which was run on the tumour genome BAM file | | breakseq-2.2/ | Contains the [BreakSeq](../../glossary/#breakseq) output which was run on the tumour genome BAM file | | chimerascan-0.4.5/ | Contains the [ChimeraScan](../../glossary/#chimerascan) output which was run on the tumour transcriptome BAM file | @@ -36,49 +36,22 @@ The expected contents are ## Downloading the Reference Inputs -Run the following to download the hg19 reference files and set up the -environment variables for configuring MAVIS +Run the following to download the hg19 reference files ```bash -wget https://raw.githubusercontent.com/bcgsc/mavis/master/tools/get_hg19_reference_files.sh +wget https://raw.githubusercontent.com/bcgsc/mavis/master/src/tools/get_hg19_reference_files.sh +mkdir reference_inputs +cd reference_inputs bash get_hg19_reference_files.sh -source reference_inputs/hg19_env.sh +cd .. 
``` -## Generating the Config File +## Creating the Config File -The [config](../../background/citations/#pipeline-config) command -does most of the work of creating the config for you but there are a few -things you need to tell it +Most settings can be left as defaults, however you will need to fill out the `libraries` and +`convert` sections to tell MAVIS how to convert your inputs and what libraries to expect. -1. **Where your bams are and what library they belong to** - -```text ---library L1522785992-normal genome normal False tutorial_data/L1522785992_normal.sorted.bam ---library L1522785992-tumour genome diseased False tutorial_data/L1522785992_tumour.sorted.bam ---library L1522785992-trans transcriptome diseased True tutorial_data/L1522785992_trans.sorted.bam -``` - -1. **Where your SV caller output files (events) are** - -If they are raw tool output as in the current example you will need to -use the convert argument to tell MAVIS the file type - -```text ---convert breakdancer tutorial_data/breakdancer-1.4.5/*txt breakdancer ---convert breakseq tutorial_data/breakseq-2.2/breakseq.vcf.gz breakseq ---convert chimerascan tutorial_data/chimerascan-0.4.5/chimeras.bedpe chimerascan ---convert defuse tutorial_data/defuse-0.6.2/results.classify.tsv defuse ---convert manta tutorial_data/manta-1.0.0/diploidSV.vcf.gz tutorial_data/manta-1.0.0/somaticSV.vcf manta -``` - -!!! note - For older versions of MAVIS the convert command may require the path to - the file(s) be quoted and the strandedness be specified (default is - False) - - -3. **Which events you should validate in which libraries** +### Libraries Settings For this example, because we want to determine which events are germline/somatic we are going to pass all genome calls to both genomes. 
@@ -86,142 +59,140 @@ We can use either full file paths (if the input is already in the standard format) or the alias from a conversion (the first argument given to the convert option) -```text ---assign L1522785992-trans chimerascan defuse ---assign L1522785992-tumour breakdancer breakseq manta ---assign L1522785992-normal breakdancer breakseq manta +```json +{ + "libraries": { + "L1522785992-normal": { // keyed by library name + "assign": [ // these are the names of the input files (or conversion aliases) to check for this library + "breakdancer", + "breakseq", + "manta" + ], + "bam_file": "tutorial_data/L1522785992_normal.sorted.bam", + "disease_status": "normal", + "protocol": "genome" + }, + "L1522785992-trans": { + "assign": [ + "chimerascan", + "defuse" + ], + "bam_file": "tutorial_data/L1522785992_trans.sorted.bam", + "disease_status": "diseased", + "protocol": "transcriptome", + "strand_specific": true + }, + "L1522785992-tumour": { + "assign": [ + "breakdancer", + "breakseq", + "manta" + ], + "bam_file": "tutorial_data/L1522785992_tumour.sorted.bam", + "disease_status": "diseased", + "protocol": "genome" + } + } +} ``` -Putting this altogether with a name to call the config, we have the -command to generate the pipeline config. You should expect this step -with these inputs to take about \~5GB memory. 
+### Convert Settings -```bash -mavis config \ - --library L1522785992-normal genome normal False tutorial_data/L1522785992_normal.sorted.bam \ - --library L1522785992-tumour genome diseased False tutorial_data/L1522785992_tumour.sorted.bam \ - --library L1522785992-trans transcriptome diseased True tutorial_data/L1522785992_trans.sorted.bam \ - --convert breakdancer tutorial_data/breakdancer-1.4.5/*txt breakdancer \ - --convert breakseq tutorial_data/breakseq-2.2/breakseq.vcf.gz breakseq \ - --convert chimerascan tutorial_data/chimerascan-0.4.5/chimeras.bedpe chimerascan \ - --convert defuse tutorial_data/defuse-0.6.2/results.classify.tsv defuse \ - --convert manta tutorial_data/manta-1.0.0/diploidSV.vcf.gz tutorial_data/manta-1.0.0/somaticSV.vcf manta \ - --assign L1522785992-trans chimerascan defuse \ - --assign L1522785992-tumour breakdancer breakseq manta \ - --assign L1522785992-normal breakdancer breakseq manta \ - -w mavis.cfg -``` - -## Setting Up the Pipeline - -The next step is running the setup stage. This will perform conversion, clustering, and creating the -submission scripts for the other stages. 
+If they are raw tool output as in the current example you will need to +use the convert argument to tell MAVIS the file type -```bash -mavis setup mavis.cfg -o output_dir/ +```json +{ + "convert": { + "breakdancer": { // conversion alias/key + "assume_no_untemplated": true, + "file_type": "breakdancer", // input/file type + "inputs": [ + "tutorial_data/breakdancer-1.4.5/*txt" + ] + }, + "breakseq": { + "assume_no_untemplated": true, + "file_type": "breakseq", + "inputs": [ + "tutorial_data/breakseq-2.2/breakseq.vcf.gz" + ] + }, + "chimerascan": { + "assume_no_untemplated": true, + "file_type": "chimerascan", + "inputs": [ + "tutorial_data/chimerascan-0.4.5/chimeras.bedpe" + ] + }, + "defuse": { + "assume_no_untemplated": true, + "file_type": "defuse", + "inputs": [ + "tutorial_data/defuse-0.6.2/results.classify.tsv" + ] + }, + "manta": { + "assume_no_untemplated": true, + "file_type": "manta", + "inputs": [ + "tutorial_data/manta-1.0.0/diploidSV.vcf.gz", + "tutorial_data/manta-1.0.0/somaticSV.vcf" + ] + } + } +} ``` -At this stage you should have something that looks like this. For -simplicity not all files/directories have been shown. 
- - output_dir/ - |-- build.cfg - |-- converted_inputs - | |-- breakdancer.tab - | |-- breakseq.tab - | |-- chimerascan.tab - | |-- defuse.tab - | `-- manta.tab - |-- L1522785992-normal_normal_genome - | |-- annotate - | | |-- batch-aUmErftiY7eEWvENfSeJwc-1/ - | | `-- submit.sh - | |-- cluster - | | |-- batch-aUmErftiY7eEWvENfSeJwc-1.tab - | | |-- cluster_assignment.tab - | | |-- clusters.bed - | | |-- filtered_pairs.tab - | | `-- MAVIS-batch-aUmErftiY7eEWvENfSeJwc.COMPLETE - | `-- validate - | |-- batch-aUmErftiY7eEWvENfSeJwc-1/ - | `-- submit.sh - |-- pairing - | `-- submit.sh - `-- summary - `-- submit.sh - -## Submitting Jobs to the Cluster - -The last step is simple, ssh to your head node of your -[SLURM](../../glossary/#slurm) cluster (or run locally if you -have configured [remote_head_ssh](../../configuration/settings/#remote_head_ssh) and -run the schedule step. This will submit the jobs and create the -dependency chain - -```bash -ssh head_node -mavis schedule -o output_dir --submit +### Top-level Settings + +Finally you will need to set output directory and the reference files + +```json +{ + "output_dir": "output_dir_full", // where to output files + "reference.aligner_reference": [ + "reference_inputs/hg19.2bit" + ], + "reference.annotations": [ + "reference_inputs/ensembl69_hg19_annotations.v3.json" + ], + "reference.dgv_annotation": [ + "reference_inputs/dgv_hg19_variants.tab" + ], + "reference.masking": [ + "reference_inputs/hg19_masking.tab" + ], + "reference.reference_genome": [ + "reference_inputs/hg19.fa" + ], + "reference.template_metadata": [ + "reference_inputs/cytoBand.txt" + ] +} ``` -The schedule step also acts as a built-in checker and can be run to -check for errors or if the pipeline has completed. +## Running the Workflow + +In order to run the snakemake file you will need to have the config validation module +`mavis_config` installed which has minimal dependencies. 
```bash -mavis schedule -o output_dir +pip install mavis_config ``` -This should give you output something like below (times may vary) after -your run completed correctly. +You are now ready to run the workflow -```text - MAVIS: 2.0.0 - hostname: gphost08.bcgsc.ca -[2018-06-02 19:47:56] arguments - command = 'schedule' - log = None - log_level = 'INFO' - output = 'output_dir/' - resubmit = False - submit = False -[2018-06-02 19:48:01] validate - MV_L1522785992-normal_batch-aUmErftiY7eEWvENfSeJwc (1701000) is COMPLETED - 200 tasks are COMPLETED - run time: 609 - MV_L1522785992-tumour_batch-aUmErftiY7eEWvENfSeJwc (1701001) is COMPLETED - 200 tasks are COMPLETED - run time: 669 - MV_L1522785992-trans_batch-aUmErftiY7eEWvENfSeJwc (1701002) is COMPLETED - 23 tasks are COMPLETED - run time: 1307 -[2018-06-02 19:48:02] annotate - MA_L1522785992-normal_batch-aUmErftiY7eEWvENfSeJwc (1701003) is COMPLETED - 200 tasks are COMPLETED - run time: 622 - MA_L1522785992-tumour_batch-aUmErftiY7eEWvENfSeJwc (1701004) is COMPLETED - 200 tasks are COMPLETED - run time: 573 - MA_L1522785992-trans_batch-aUmErftiY7eEWvENfSeJwc (1701005) is COMPLETED - 23 tasks are COMPLETED - run time: 537 -[2018-06-02 19:48:07] pairing - MP_batch-aUmErftiY7eEWvENfSeJwc (1701006) is COMPLETED - run time: 466 -[2018-06-02 19:48:07] summary - MS_batch-aUmErftiY7eEWvENfSeJwc (1701007) is COMPLETED - run time: 465 - parallel run time: 3545 - rewriting: output_dir/build.cfg - run time (hh/mm/ss): 0:00:11 - run time (s): 11 +```bash +snakemake --jobs 100 --configfile=tests/full-tutorial.config.json ``` -The parallel run time reported corresponds to the sum of the slowest job -for each stage and does not include any queue time etc. - ## Analyzing the Output The best place to start with looking at the MAVIS output is the summary folder which contains the final results. For column name definitions see the [glossary](../../outputs/columns). 
- output_dir/summary/mavis_summary_all_L1522785992-normal_L1522785992-trans_L1522785992-tumour.tab +```text +output_dir/summary/mavis_summary_all_L1522785992-normal_L1522785992-trans_L1522785992-tumour.tab +``` diff --git a/docs/tutorials/mini.md b/docs/tutorials/mini.md index 929a3dad..d4475985 100644 --- a/docs/tutorials/mini.md +++ b/docs/tutorials/mini.md @@ -3,7 +3,7 @@ This tutorial is based on the data included in the tests folder of MAVIS. The data files are very small and this tutorial is really only intended for testing a MAVIS install. The data here is simulated and -results are not representitive of the typical events you would see +results are not representative of the typical events you would see reported from MAVIS. For a more complete tutorial with actual fusion gene examples, please see the [full tutorial](../../tutorials/full/). @@ -14,112 +14,25 @@ installed ```bash git clone https://github.com/bcgsc/mavis.git -git checkout v2.0.0 +git checkout mv mavis/tests . +mv mavis/Snakefile . rm -r mavis ``` -Now you should have a folder called `tests` in your current directory. -You will need to specify the scheduler if you want to test one that is -not the default. For example +Now you should have a folder called `tests` in your current directory. Since this is a trivial +example, it can easily be run locally. However in order to run the snakemake file you will need +to have the config validation module `mavis_config` installed which has minimal dependencies. ```bash -export MAVIS_SCHEDULER=LOCAL +pip install mavis_config ``` -Since this is a trivial example, it can easily be run locally. By -default MAVIS in local mode will run a maximum of 1 less than the -current cpu count processes. If you are running other things on the same -machine you may find it useful to set this directly. +Now you are ready to run MAVIS. This can be done in a single command using snakemake. 
```bash -export MAVIS_CONCURRENCY_LIMIT=2 +snakemake -j 1 --configfile=tests/mini-tutorial.config.json -s Snakefile ``` -The above will limit mavis to running 2 processes concurrently. - -Now you are ready to run MAVIS itself. This can be done in two commands -(since the config file we are going to use is already built). First set -up the pipeline - -```bash -mavis setup tests/data/pipeline_config.cfg -o output_dir -``` - -Now if you run the schedule step (without the submit flag, schedule acts -as a checker) you should see something like - -```bash -mavis schedule -o output_dir/ -``` - -```text - MAVIS: 1.8.4 - hostname: gphost08.bcgsc.ca -[2018-06-01 12:19:31] arguments - command = 'schedule' - log = None - log_level = 'INFO' - output = 'output_dir/' - resubmit = False - submit = False -[2018-06-01 12:19:31] validate - MV_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-1 is NOT SUBMITTED - MV_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-2 is NOT SUBMITTED - MV_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-1 is NOT SUBMITTED - MV_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-2 is NOT SUBMITTED - MV_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-3 is NOT SUBMITTED -[2018-06-01 12:19:31] annotate - MA_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-1 is NOT SUBMITTED - MA_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-2 is NOT SUBMITTED - MA_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-1 is NOT SUBMITTED - MA_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-2 is NOT SUBMITTED - MA_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-3 is NOT SUBMITTED -[2018-06-01 12:19:31] pairing - MP_batch-s4W2Go4tinn49nkhSuusrE is NOT SUBMITTED -[2018-06-01 12:19:31] summary - MS_batch-s4W2Go4tinn49nkhSuusrE is NOT SUBMITTED - rewriting: output_dir/build.cfg -``` - -Adding the submit argument will start the pipeline - -```bash -mavis schedule -o output_dir/ --submit -``` - -After this completes, run schedule without the submit flag again and you -should see something like - -```text - MAVIS: 1.8.4 - hostname: gphost08.bcgsc.ca -[2018-06-01 
13:15:28] arguments - command = 'schedule' - log = None - log_level = 'INFO' - output = 'output_dir/' - resubmit = False - submit = False -[2018-06-01 13:15:28] validate - MV_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-1 (zQJYndSMimaoALwcSSiYwi) is COMPLETED - MV_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-2 (BHFVf3BmXVrDUA5X4GGSki) is COMPLETED - MV_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-1 (tUpx3iabCrpR9iKu9rJtES) is COMPLETED - MV_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-2 (hgmH7nqPXZ49a8yTsxSUWZ) is COMPLETED - MV_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-3 (cEoRN582An3eAGALaSKmpJ) is COMPLETED -[2018-06-01 13:15:28] annotate - MA_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-1 (tMHiVR8ueNokhBDnghXYo6) is COMPLETED - MA_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-2 (AsNpNdvUyhNtKmRZqRSPpR) is COMPLETED - MA_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-1 (k7qQiAzxfC2dnZwsGH7BzD) is COMPLETED - MA_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-2 (dqAuhhcVKejDvHGBXn22xb) is COMPLETED - MA_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-3 (eB69Ghed2xAdp2VRdaCJBf) is COMPLETED -[2018-06-01 13:15:28] pairing - MP_batch-s4W2Go4tinn49nkhSuusrE (6LfEgBtBsmGhQpLQp9rXmi) is COMPLETED -[2018-06-01 13:15:28] summary - MS_batch-s4W2Go4tinn49nkhSuusrE (HDJhXgKjRmseahcQ7mgNoD) is COMPLETED - rewriting: output_dir/build.cfg - run time (hh/mm/ss): 0:00:00 - run time (s): 0 -``` - -If you see the above, then MAVIS has completed correctly! 
+Which will run the mini tutorial version and output files into a folder called `output_dir` in the +current directory diff --git a/mavis/annotate/constants.py b/mavis/annotate/constants.py deleted file mode 100644 index a7645efd..00000000 --- a/mavis/annotate/constants.py +++ /dev/null @@ -1,89 +0,0 @@ -import re - -import tab - -from ..constants import MavisNamespace, float_fraction -from ..util import WeakMavisNamespace - - -PASS_FILENAME = 'annotations.tab' - -DEFAULTS = WeakMavisNamespace() -""" -- [annotation_filters](/configuration/settings/#annotation_filters) -- [max_orf_cap](/configuration/settings/#max_orf_cap) -- [min_domain_mapping_match](/configuration/settings/#min_domain_mapping_match) -- [min_orf_size](/configuration/settings/#min_orf_size) -""" -DEFAULTS.add( - 'min_domain_mapping_match', - 0.9, - cast_type=float_fraction, - defn='a number between 0 and 1 representing the minimum percent match a domain must map to the fusion transcript ' - 'to be displayed', -) -DEFAULTS.add( - 'min_orf_size', - 300, - defn='the minimum length (in base pairs) to retain a putative open reading frame (ORF)', -) -DEFAULTS.add( - 'max_orf_cap', - 3, - defn='the maximum number of ORFs to return (best putative ORFs will be retained)', -) -DEFAULTS.add( - 'annotation_filters', - 'choose_more_annotated,choose_transcripts_by_priority', - defn='a comma separated list of filters to apply to putative annotations', -) -DEFAULTS.add( - 'draw_fusions_only', - True, - cast_type=tab.cast_boolean, - defn='flag to indicate if events which do not produce a fusion transcript should produce illustrations', -) -DEFAULTS.add( - 'draw_non_synonymous_cdna_only', - True, - cast_type=tab.cast_boolean, - defn='flag to indicate if events which are synonymous at the cdna level should produce illustrations', -) - -SPLICE_TYPE = MavisNamespace( - RETAIN='retained intron', - SKIP='skipped exon', - NORMAL='normal', - MULTI_RETAIN='retained multiple introns', - MULTI_SKIP='skipped multiple exons', - 
COMPLEX='complex', -) -"""MavisNamespace: holds controlled vocabulary for allowed splice type classification values - -- ``RETAIN``: an intron was retained -- ``SKIP``: an exon was skipped -- ``NORMAL``: no exons were skipped and no introns were retained. the normal/expected splicing pattern was followed -- ``MULTI_RETAIN``: multiple introns were retained -- ``MULTI_SKIP``: multiple exons were skipped -- ``COMPLEX``: some combination of exon skipping and intron retention -""" - -SPLICE_SITE_TYPE = MavisNamespace(DONOR=3, ACCEPTOR=5) - -SPLICE_SITE_RADIUS = 2 -"""int: number of bases away from an exon boundary considered to be part of the splice site such that if it were altered - the splice site would be considered to be abrogated. -""" - -# splice site sequences based on: http://www.nature.com/nrg/journal/v17/n7/fig_tab/nrg.2016.46_F5.html?foxtrotcallback=true - -DONOR_SEQ = [ - re.compile('(AG)(GT[AG]AG)'), - re.compile('([CA]AG)(GTA)'), -] - -ACCEPTOR_SEQ = [ - re.compile('([TC]{8}[ATCG]CAG)([GA][ATCG])'), - re.compile('([TC]{9}TAG)([GA][ATCG])'), - re.compile('([TC]{8}[ATCG]AAG)([GA][ATCG])'), -] diff --git a/mavis/annotate/file_io.py b/mavis/annotate/file_io.py deleted file mode 100644 index fb7ec50d..00000000 --- a/mavis/annotate/file_io.py +++ /dev/null @@ -1,554 +0,0 @@ -""" -module which holds all functions relating to loading reference files -""" -import json -import re -import warnings -import os - -from Bio import SeqIO -import tab - -from .base import BioInterval, ReferenceName -from .genomic import Exon, Gene, Template, Transcript, PreTranscript -from .protein import Domain, Translation -from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, STRAND, translate -from ..interval import Interval -from ..util import DEVNULL, LOG, filepath, WeakMavisNamespace - - -REFERENCE_DEFAULTS = WeakMavisNamespace() -REFERENCE_DEFAULTS.add( - 'template_metadata', - [], - cast_type=filepath, - listable=True, - defn='file containing the cytoband template 
information. Used for illustrations only', -) -REFERENCE_DEFAULTS.add( - 'masking', - [], - cast_type=filepath, - listable=True, - defn='file containing regions for which input events overlapping them are dropped prior to validation', -) -REFERENCE_DEFAULTS.add( - 'annotations', - [], - cast_type=filepath, - listable=True, - defn='path to the reference annotations of genes, transcript, exons, domains, etc', -) -REFERENCE_DEFAULTS.add( - 'aligner_reference', - None, - cast_type=filepath, - nullable=True, - defn='path to the aligner reference file used for aligning the contig sequences', -) -REFERENCE_DEFAULTS.add( - 'dgv_annotation', - [], - cast_type=filepath, - listable=True, - defn='Path to the dgv reference processed to look like the cytoband file.', -) -REFERENCE_DEFAULTS.add( - 'reference_genome', - [], - cast_type=filepath, - listable=True, - defn='Path to the human reference genome fasta file', -) - - -def load_masking_regions(*filepaths): - """ - reads a file of regions. The expect input format for the file is tab-delimited and - the header should contain the following columns - - - chr: the chromosome - - start: start of the region, 1-based inclusive - - end: end of the region, 1-based inclusive - - name: the name/label of the region - - For example: - - .. code-block:: text - - #chr start end name - chr20 25600000 27500000 centromere - - Args: - filepath (str): path to the input tab-delimited file - Returns: - Dict[str,List[BioInterval]]: a dictionary keyed by chromosome name with values of lists of regions on the chromosome - - Example: - >>> m = load_masking_regions('filename') - >>> m['1'] - [BioInterval(), BioInterval(), ...] 
- """ - regions = {} - for filepath in filepaths: - _, rows = tab.read_file( - filepath, - require=['chr', 'start', 'end', 'name'], - cast={'start': int, 'end': int, 'chr': ReferenceName}, - ) - for row in rows: - mask_region = BioInterval( - reference_object=row['chr'], start=row['start'], end=row['end'], name=row['name'] - ) - regions.setdefault(mask_region.reference_object, []).append(mask_region) - return regions - - -def load_reference_genes(*pos, **kwargs): - """ - *Deprecated* Use :func:`load_annotations` instead - """ - warnings.warn('this function has been replaced by load_annotations', DeprecationWarning) - return load_annotations(*pos, **kwargs) - - -def load_annotations(*filepaths, warn=DEVNULL, reference_genome=None, best_transcripts_only=False): - """ - loads gene models from an input file. Expects a tabbed or json file. - - Args: - filepath (str): path to the input file - verbose (bool): output extra information to stdout - reference_genome (Dict[str,Bio.SeqRecord]): dict of reference sequence by - template/chr name - filetype (str): json or tab/tsv. 
only required if the file type can't be interpolated from the path extension - - Returns: - Dict[str,List[mavis.annotate.genomic.Gene]]: lists of genes keyed by chromosome name - """ - total_annotations = {} - - for filename in filepaths: - data = None - - if filename.endswith('.tab') or filename.endswith('.tsv'): - data = convert_tab_to_json(filename, warn) - else: - with open(filename) as fh: - data = json.load(fh) - - current_annotations = parse_annotations_json( - data, - reference_genome=reference_genome, - best_transcripts_only=best_transcripts_only, - warn=warn, - ) - - for chrom in current_annotations: - for gene in current_annotations[chrom]: - total_annotations.setdefault(chrom, []).append(gene) - return total_annotations - - -def parse_annotations_json(data, reference_genome=None, best_transcripts_only=False, warn=DEVNULL): - """ - parses a json of annotation information into annotation objects - """ - genes_by_chr = {} - for gene_dict in data['genes']: - if gene_dict['strand'] in ['1', '+', 1]: - gene_dict['strand'] = STRAND.POS - elif gene_dict['strand'] in ['-1', '-', -1]: - gene_dict['strand'] = STRAND.NEG - else: - raise AssertionError( - 'input has unexpected form. 
strand must be 1 or -1 but found', gene_dict['strand'] - ) - - gene = Gene( - chr=gene_dict['chr'], - start=gene_dict['start'], - end=gene_dict['end'], - name=gene_dict['name'], - aliases=gene_dict['aliases'], - strand=gene_dict['strand'], - ) - - has_best = False - for transcript in gene_dict['transcripts']: - transcript['is_best_transcript'] = tab.cast_boolean(transcript['is_best_transcript']) - transcript.setdefault('exons', []) - exons = [Exon(strand=gene.strand, **ex) for ex in transcript['exons']] - if not exons: - exons = [(transcript['start'], transcript['end'])] - pre_transcript = PreTranscript( - name=transcript['name'], - gene=gene, - exons=exons, - is_best_transcript=transcript['is_best_transcript'], - ) - if pre_transcript.is_best_transcript: - has_best = True - if best_transcripts_only and not pre_transcript.is_best_transcript: - continue - gene.transcripts.append(pre_transcript) - - for spl_patt in pre_transcript.generate_splicing_patterns(): - # make splice transcripts and translations - spl_tx = Transcript(pre_transcript, spl_patt) - pre_transcript.spliced_transcripts.append(spl_tx) - - if ( - transcript.get('cdna_coding_end', None) is None - or transcript.get('cdna_coding_start', None) is None - ): - continue - tx_length = transcript['cdna_coding_end'] - transcript['cdna_coding_start'] + 1 - # check that the translation makes sense before including it - if tx_length % CODON_SIZE != 0: - warn('Ignoring translation. 
The translated region is not a multiple of three') - continue - tx_length = tx_length // CODON_SIZE - domains = [] - for dom in transcript['domains']: - try: - regions = [Interval(r['start'], r['end']) for r in dom['regions']] - regions = Interval.min_nonoverlapping(*regions) - for region in regions: - if region.start < 1 or region.end > tx_length: - raise AssertionError( - 'region cannot be outside the translated length' - ) - domains.append( - Domain( - name=dom['name'], - data={'desc': dom.get('desc', None)}, - regions=regions, - ) - ) - except AssertionError as err: - warn(repr(err)) - translation = Translation( - transcript['cdna_coding_start'], - transcript['cdna_coding_end'], - transcript=spl_tx, - domains=domains, - ) - if reference_genome and gene.chr in reference_genome: - # get the sequence near here to see why these are wrong? - seq = pre_transcript.get_cdna_seq(spl_tx.splicing_pattern, reference_genome) - met = seq[translation.start - 1 : translation.start + 2] - stop = seq[translation.end - CODON_SIZE : translation.end] - if translate(met) != START_AA or translate(stop) != STOP_AA: - warn( - 'Sequence error. 
The sequence computed from the reference does look like ' - 'a valid translation' - ) - continue - spl_tx.translations.append(translation) - if not best_transcripts_only or has_best: - genes_by_chr.setdefault(gene.chr, []).append(gene) - return genes_by_chr - - -def convert_tab_to_json(filepath, warn=DEVNULL): - """ - given a file in the std input format (see below) reads and return a list of genes (and sub-objects) - - +-----------------------+---------------------------+-----------------------------------------------------------+ - | column name | example | description | - +=======================+===========================+===========================================================+ - | ensembl_transcript_id | ENST000001 | | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | ensembl_gene_id | ENSG000001 | | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | strand | -1 | positive or negative 1 | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | cdna_coding_start | 44 | where translation begins relative to the start of the cdna| - +-----------------------+---------------------------+-----------------------------------------------------------+ - | cdna_coding_end | 150 | where translation terminates | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | genomic_exon_ranges | 100-201;334-412;779-830 | semi-colon demitited exon start/ends | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | AA_domain_ranges | DBD:220-251,260-271 | semi-colon delimited list of domains | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | hugo_names | KRAS | hugo gene name | - 
+-----------------------+---------------------------+-----------------------------------------------------------+ - - Args: - filepath (str): path to the input tab-delimited file - - Returns: - Dict[str,List[Gene]]: a dictionary keyed by chromosome name with values of list of genes on the chromosome - - Example: - >>> ref = load_reference_genes('filename') - >>> ref['1'] - [Gene(), Gene(), ....] - - Warning: - does not load translations unless then start with 'M', end with '*' and have a length of multiple 3 - """ - - def parse_exon_list(row): - if not row: - return [] - exons = [] - for temp in re.split('[; ]', row): - try: - start, end = temp.split('-') - exons.append({'start': int(start), 'end': int(end)}) - except Exception as err: - warn('exon error:', repr(temp), repr(err)) - return exons - - def parse_domain_list(row): - if not row: - return [] - domains = [] - for domain in row.split(';'): - try: - name, temp = domain.rsplit(':') - temp = temp.split(',') - temp = [x.split('-') for x in temp] - regions = [{'start': int(x), 'end': int(y)} for x, y in temp] - domains.append({'name': name, 'regions': regions}) - except Exception as err: - warn('error in domain:', domain, row, repr(err)) - return domains - - def nullable_int(row): - try: - row = int(row) - except ValueError: - row = tab.cast_null(row) - return row - - _, rows = tab.read_file( - filepath, - require=['ensembl_gene_id', 'chr', 'ensembl_transcript_id'], - add_default={ - 'cdna_coding_start': 'null', - 'cdna_coding_end': 'null', - 'AA_domain_ranges': '', - 'genomic_exon_ranges': '', - 'hugo_names': '', - 'transcript_genomic_start': 'null', - 'transcript_genomic_end': 'null', - 'best_ensembl_transcript_id': 'null', - }, - cast={ - 'genomic_exon_ranges': parse_exon_list, - 'AA_domain_ranges': parse_domain_list, - 'cdna_coding_end': nullable_int, - 'cdna_coding_start': nullable_int, - 'transcript_genomic_end': nullable_int, - 'transcript_genomic_start': nullable_int, - 'gene_start': int, - 'gene_end': 
int, - }, - ) - genes = {} - for row in rows: - gene = { - 'chr': row['chr'], - 'start': row['gene_start'], - 'end': row['gene_end'], - 'name': row['ensembl_gene_id'], - 'strand': row['strand'], - 'aliases': row['hugo_names'].split(';') if row['hugo_names'] else [], - 'transcripts': [], - } - if gene['name'] not in genes: - genes[gene['name']] = gene - else: - gene = genes[gene['name']] - - transcript = { - 'is_best_transcript': row['best_ensembl_transcript_id'] == row['ensembl_transcript_id'], - 'name': row['ensembl_transcript_id'], - 'exons': row['genomic_exon_ranges'], - 'domains': row['AA_domain_ranges'], - 'start': row['transcript_genomic_start'], - 'end': row['transcript_genomic_end'], - 'cdna_coding_start': row['cdna_coding_start'], - 'cdna_coding_end': row['cdna_coding_end'], - 'aliases': [], - } - gene['transcripts'].append(transcript) - - return {'genes': genes.values()} - - -def load_reference_genome(*filepaths): - """ - Args: - filepaths (List[str]): the paths to the files containing the input fasta genomes - - Returns: - Dict[str,Bio.SeqRecord]: a dictionary representing the sequences in the fasta file - """ - reference_genome = {} - for filename in filepaths: - with open(filename, 'rU') as fh: - for chrom, seq in SeqIO.to_dict(SeqIO.parse(fh, 'fasta')).items(): - if chrom in reference_genome: - raise KeyError('Duplicate chromosome name', chrom, filename) - reference_genome[chrom] = seq - - names = list(reference_genome.keys()) - - # to fix hg38 issues - for template_name in names: - if template_name.startswith('chr'): - truncated = re.sub('^chr', '', template_name) - if truncated in reference_genome: - raise KeyError( - 'template names {} and {} are considered equal but both have been defined in the reference' - 'loaded'.format(template_name, truncated) - ) - reference_genome.setdefault(truncated, reference_genome[template_name].upper()) - else: - prefixed = 'chr' + template_name - if prefixed in reference_genome: - raise KeyError( - 'template names 
{} and {} are considered equal but both have been defined in the reference' - 'loaded'.format(template_name, prefixed) - ) - reference_genome.setdefault(prefixed, reference_genome[template_name].upper()) - reference_genome[template_name] = reference_genome[template_name].upper() - - return reference_genome - - -def load_templates(*filepaths): - """ - primarily useful if template drawings are required and is not necessary otherwise - assumes the input file is 0-indexed with [start,end) style. Columns are expected in - the following order, tab-delimited. A header should not be given - - 1. name - 2. start - 3. end - 4. band_name - 5. giemsa_stain - - for example - - .. code-block:: text - - chr1 0 2300000 p36.33 gneg - chr1 2300000 5400000 p36.32 gpos25 - - Args: - filename (str): the path to the file with the cytoband template information - - Returns: - List[Template]: list of the templates loaded - - """ - header = ['name', 'start', 'end', 'band_name', 'giemsa_stain'] - templates = {} - - for filename in filepaths: - header, rows = tab.read_file( - filename, - header=header, - cast={'start': int, 'end': int}, - in_={'giemsa_stain': GIEMSA_STAIN.values()}, - ) - - bands_by_template = {} - for row in rows: - band = BioInterval(None, row['start'] + 1, row['end'], name=row['band_name'], data=row) - bands_by_template.setdefault(row['name'], []).append(band) - - for tname, bands in bands_by_template.items(): - start = min([b.start for b in bands]) - end = max([b.end for b in bands]) - end = Template(tname, start, end, bands=bands) - templates[end.name] = end - return templates - - -class ReferenceFile: - - CACHE = {} # store loaded file to avoid re-loading - - LOAD_FUNCTIONS = { - 'annotations': load_annotations, - 'reference_genome': load_reference_genome, - 'masking': load_masking_regions, - 'template_metadata': load_templates, - 'dgv_annotation': load_masking_regions, - 'aligner_reference': None, - } - """dict: Mapping of file types (based on ENV name) to load 
functions""" - - def __init__(self, file_type, *filepaths, eager_load=False, assert_exists=False, **opt): - """ - Args: - *filepaths (str): list of paths to load - file_type (str): Type of file to load - eager_load (bool=False): load the files immeadiately - assert_exists (bool=False): check that all files exist - **opt: key word arguments to be passed to the load function and used as part of the file cache key - - Raises - FileNotFoundError: when assert_exists and an input does not exist - """ - self.name = sorted(filepaths) - self.file_type = file_type - self.key = tuple( - self.name + sorted(list(opt.items())) - ) # freeze the input state so we know when to reload - self.content = None - self.opt = opt - self.loader = self.LOAD_FUNCTIONS[self.file_type] - if assert_exists: - self.files_exist() - if eager_load: - self.load() - - def __repr__(self): - cls = self.__class__.__name__ - return '{}(file_type={}, files={}, loaded={}, content={})'.format( - cls, self.file_type, self.name, self.content is not None, object.__repr__(self.content) - ) - - def files_exist(self, not_empty=False): - if not_empty and not self.name: - raise FileNotFoundError('expected files but given an empty list', self) - for filename in self.name: - if not os.path.exists(filename): - raise FileNotFoundError('Missing file', filename, self) - - def __iter__(self): - return iter(self.name) - - def is_empty(self): - return not self.name - - def is_loaded(self): - return False if self.content is None else True - - def load(self, ignore_cache=False, verbose=True): - """ - load (or return) the contents of a reference file and add it to the cache if enabled - """ - if self.content is not None: - return self - if self.key in ReferenceFile.CACHE and not ignore_cache: - if verbose: - LOG('cached content:', self.name) - self.content = ReferenceFile.CACHE[self.key].content - return self - self.files_exist() - try: - LOG('loading:', self.name, time_stamp=True) - self.content = self.loader(*self.name, 
**self.opt) - ReferenceFile.CACHE[self.key] = self - except Exception as err: - message = 'Error in loading files: {}. {}'.format(', '.join(self.name), err) - raise err.__class__(message) - return self diff --git a/mavis/cluster/constants.py b/mavis/cluster/constants.py deleted file mode 100644 index 107100bf..00000000 --- a/mavis/cluster/constants.py +++ /dev/null @@ -1,46 +0,0 @@ -from ..util import WeakMavisNamespace - - -DEFAULTS = WeakMavisNamespace() -""" -- [cluster_initial_size_limit](/configuration/settings/#cluster_initial_size_limit) -- [cluster_radius](/configuration/settings/#cluster_radius) -- [limit_to_chr](/configuration/settings/#limit_to_chr) -- [max_files](/configuration/settings/#max_files) -- [max_proximity](/configuration/settings/#max_proximity) -- [min_clusters_per_file](/configuration/settings/#min_clusters_per_file) -- [uninformative_filter](/configuration/settings/#uninformative_filter) -""" -DEFAULTS.add( - 'min_clusters_per_file', 50, defn='the minimum number of breakpoint pairs to output to a file' -) -DEFAULTS.add( - 'max_files', 200, defn='The maximum number of files to output from clustering/splitting' -) -DEFAULTS.add( - 'cluster_initial_size_limit', - 25, - defn='the maximum cumulative size of both breakpoints for breakpoint pairs to be used in the initial clustering ' - 'phase (combining based on overlap)', -) -DEFAULTS.add('cluster_radius', 100, defn='maximum distance allowed between paired breakpoint pairs') -DEFAULTS.add( - 'max_proximity', - 5000, - defn='the maximum distance away from an annotation before the region in considered to be uninformative', -) -DEFAULTS.add( - 'uninformative_filter', - False, - defn='flag that determines if breakpoint pairs which are not within max_proximity to any annotations are filtered ' - 'out prior to clustering', -) -DEFAULTS.add( - 'limit_to_chr', - [str(x) for x in range(1, 23)] + ['X', 'Y'], - cast_type=str, - listable=True, - nullable=True, - defn='A list of chromosome names to use. 
BreakpointPairs on other chromosomes will be filtered' - 'out. For example \'1 2 3 4\' would filter out events/breakpoint pairs on any chromosomes but 1, 2, 3, and 4', -) diff --git a/mavis/config.py b/mavis/config.py deleted file mode 100644 index bfeb7a52..00000000 --- a/mavis/config.py +++ /dev/null @@ -1,786 +0,0 @@ -import argparse -from configparser import ConfigParser, ExtendedInterpolation -from copy import copy as _copy -import logging -import os -import re -import sys -import warnings - -import tab - -from . import __version__ -from .align import SUPPORTED_ALIGNER -from .annotate.constants import DEFAULTS as ANNOTATION_DEFAULTS -from .annotate.file_io import REFERENCE_DEFAULTS -from .bam.cache import BamCache -from .bam import stats -from .cluster.constants import DEFAULTS as CLUSTER_DEFAULTS -from .constants import DISEASE_STATUS, SUBCOMMAND, PROTOCOL, float_fraction -from .illustrate.constants import DEFAULTS as ILLUSTRATION_DEFAULTS -from .pairing.constants import DEFAULTS as PAIRING_DEFAULTS -from .schedule.constants import OPTIONS as SUBMIT_OPTIONS -from .schedule.constants import SCHEDULER -from .summary.constants import DEFAULTS as SUMMARY_DEFAULTS -from .tools import SUPPORTED_TOOL -from .util import ( - bash_expands, - cast, - DEVNULL, - MavisNamespace, - WeakMavisNamespace, - filepath, - NullableType, -) -from .validate.constants import DEFAULTS as VALIDATION_DEFAULTS - - -CONVERT_OPTIONS = WeakMavisNamespace() -CONVERT_OPTIONS.add( - 'assume_no_untemplated', - True, - defn='assume that if not given there is no untemplated sequence between the breakpoints', -) - - -class CustomHelpFormatter(argparse.ArgumentDefaultsHelpFormatter): - """ - subclass the default help formatter to stop default printing for required arguments - """ - - def _format_args(self, action, default_metavar): - if action.metavar is None: - action.metavar = get_metavar(action.type) - if isinstance(action, RangeAppendAction): - return '%s' % self._metavar_formatter(action, 
default_metavar)(1) - return super(CustomHelpFormatter, self)._format_args(action, default_metavar) - - def _get_help_string(self, action): - if action.required: - return action.help - return super(CustomHelpFormatter, self)._get_help_string(action) - - def add_arguments(self, actions): - # sort the arguments alphanumerically so they print in the help that way - actions = sorted(actions, key=lambda x: getattr(x, 'option_strings')) - super(CustomHelpFormatter, self).add_arguments(actions) - - -class RangeAppendAction(argparse.Action): - """ - allows an argument to accept a range of arguments - """ - - def __init__(self, nmin=1, nmax=None, **kwargs): - kwargs.setdefault('nargs', '+') - kwargs.setdefault('default', []) - argparse.Action.__init__(self, **kwargs) - self.nmin = nmin - self.nmax = nmax - assert nmin is not None - - def __call__(self, parser, namespace, values, option_string=None): - if getattr(namespace, self.dest, None) is None: - setattr(namespace, self.dest, []) - items = _copy(getattr(namespace, self.dest)) - items.append(values) - if self.nmax is None: - if len(values) < self.nmin: - raise argparse.ArgumentError( - self, 'must have at least {} arguments. Given: {}'.format(self.nmin, values) - ) - elif not self.nmin <= len(values) <= self.nmax: - raise argparse.ArgumentError( - self, 'requires {}-{} arguments. 
Given: {}'.format(self.nmin, self.nmax, values) - ) - setattr(namespace, self.dest, items) - - -class LibraryConfig(MavisNamespace): - """ - holds library specific configuration information - """ - - def __init__( - self, - library, - protocol, - disease_status, - bam_file=None, - inputs=None, - read_length=None, - median_fragment_size=None, - stdev_fragment_size=None, - strand_specific=False, - strand_determining_read=2, - **kwargs - ): - MavisNamespace.__init__(self) - self.library = library - self.protocol = PROTOCOL.enforce(protocol) - self.bam_file = bam_file - self.read_length = NullableType(int)(read_length) - self.median_fragment_size = NullableType(int)(median_fragment_size) - self.stdev_fragment_size = NullableType(int)(stdev_fragment_size) - self.strand_specific = cast(strand_specific, bool) - self.strand_determining_read = int(strand_determining_read) - self.disease_status = DISEASE_STATUS.enforce(disease_status) - try: - self.inputs = [f for f in re.split(r'[;\s]+', inputs) if f] - except TypeError: - self.inputs = inputs if inputs is not None else [] - - for attr, value in kwargs.items(): - for namespace in [CLUSTER_DEFAULTS, VALIDATION_DEFAULTS, ANNOTATION_DEFAULTS]: - if attr not in namespace: - continue - self.add( - attr, - value, - listable=namespace.is_listable(attr), - nullable=namespace.is_nullable(attr), - cast_type=namespace.type(attr), - ) - break - - def flatten(self): - result = MavisNamespace.items(self) - result['inputs'] = '\n'.join(result['inputs']) - return result - - def is_trans(self): - return True if self.protocol == PROTOCOL.TRANS else False - - @staticmethod - def build( - library, - protocol, - bam_file, - inputs, - annotations=None, - log=DEVNULL, - distribution_fraction=0.98, - sample_cap=3000, - sample_bin_size=1000, - sample_size=500, - **kwargs - ): - """ - Builds a library config section and gathers the bam stats - """ - PROTOCOL.enforce(protocol) - - if protocol == PROTOCOL.TRANS: - if annotations is None or 
annotations.is_empty(): - raise AttributeError( - 'missing required attribute: annotations. Annotations must be given for transcriptomes' - ) - annotations.load() - bam = BamCache(bam_file) - if protocol == PROTOCOL.TRANS: - bamstats = stats.compute_transcriptome_bam_stats( - bam, - annotations=annotations.content, - sample_size=sample_size, - sample_cap=sample_cap, - distribution_fraction=distribution_fraction, - ) - elif protocol == PROTOCOL.GENOME: - bamstats = stats.compute_genome_bam_stats( - bam, - sample_size=sample_size, - sample_bin_size=sample_bin_size, - sample_cap=sample_cap, - distribution_fraction=distribution_fraction, - ) - else: - raise ValueError('unrecognized value for protocol', protocol) - log(bamstats) - - return LibraryConfig( - library=library, - protocol=protocol, - bam_file=bam_file, - inputs=inputs, - median_fragment_size=bamstats.median_fragment_size, - stdev_fragment_size=bamstats.stdev_fragment_size, - read_length=bamstats.read_length, - strand_determining_read=bamstats.strand_determining_read, - **kwargs - ) - - @classmethod - def parse_args(cls, *args): - # '', '(genome|transcriptome)', '', '[strand_specific]', '[/path/to/bam/file]' - if len(args) < 4: - return LibraryConfig(args[0], protocol=args[1], disease_status=args[2]) - elif len(args) < 5: - return LibraryConfig( - args[0], protocol=args[1], disease_status=args[2], strand_specific=args[3] - ) - return LibraryConfig( - args[0], - protocol=args[1], - disease_status=args[2], - strand_specific=args[3], - bam_file=args[4], - ) - - -class MavisConfig(MavisNamespace): - def __init__(self, **kwargs): - # section can be named schedule or qsub to support older versions - MavisNamespace.__init__(self) - try: - content = validate_section( - kwargs.pop('schedule', kwargs.pop('qsub', {})), SUBMIT_OPTIONS, True - ) - self.schedule = content - except Exception as err: - err.args = [ - 'Error in validating the schedule section in the config. 
' - + ' '.join([str(a) for a in err.args]) - ] - raise err - - # set the global defaults - for sec, defaults in [ - ('pairing', PAIRING_DEFAULTS), - ('summary', SUMMARY_DEFAULTS), - ('validate', VALIDATION_DEFAULTS), - ('annotate', ANNOTATION_DEFAULTS), - ('illustrate', ILLUSTRATION_DEFAULTS), - ('cluster', CLUSTER_DEFAULTS), - ('reference', REFERENCE_DEFAULTS), - ]: - try: - self[sec] = validate_section(kwargs.pop(sec, {}), defaults, True) - except Exception as err: - err.args = [ - 'Error in validating the {} section in the config. '.format(sec) - + ' '.join([str(a) for a in err.args]) - ] - - raise err - - SUPPORTED_ALIGNER.enforce(self.validate.aligner) - for attr, fnames in self.reference.items(): - if attr != 'aligner_reference': - self.reference[attr] = [f for f in [NullableType(filepath)(v) for v in fnames] if f] - if not self.reference[attr] and attr not in { - 'dgv_annotation', - 'masking', - 'template_metadata', - }: - raise FileNotFoundError( - 'Error in validating the convert section of the config for tag={}. ' - 'Required reference file does not exist'.format(attr) - ) - - # set the conversion section - self.convert = kwargs.pop('convert', {}) - for attr, val in self.convert.items(): - if attr in CONVERT_OPTIONS: - self.convert[attr] = CONVERT_OPTIONS.type(attr)(val) - continue - val = [v for v in re.split(r'[;\s]+', val) if v] - if not val: - raise UserWarning( - 'Error in validating convert section of the config for tag={}. Tag requires arguments'.format( - attr - ) - ) - if val[0] == 'convert_tool_output': - try: - val[-1] = tab.cast_boolean(val[-1]) - except TypeError: - val.append(False) - if len(val) < 4 or val[-2] not in SUPPORTED_TOOL.values(): - raise UserWarning( - 'Error in validating the convert section of the config for tag={}. '.format( - attr - ), - 'Conversion using the built-in convert_tool_output requires specifying the input file(s) and ' - 'tool name. 
Currently supported tools include:', - SUPPORTED_TOOL.values(), - 'given', - val, - ) - expanded_inputs = [] - for file_expr in val[1:-2]: - expanded = bash_expands(file_expr) - if not expanded: - raise FileNotFoundError( - 'Error in validating the config for tag={}. ' - 'Input file(s) do not exist'.format(attr), - val[1:-2], - ) - expanded_inputs.extend(expanded) - val = [val[0]] + expanded_inputs + val[-2:] - self.convert[attr] = val - self.convert = MavisNamespace(**self.convert) - - # now add the library specific sections - self.libraries = {} - - for libname, val in kwargs.items(): # all other sections already popped - libname = nameable_string(libname) - d = {} - d.update(self.cluster.items()) - d.update(self.validate.items()) - d.update(self.annotate.items()) - d.update(val) - d['library'] = libname - val['library'] = libname - self.libraries[libname] = LibraryConfig(**val) - # now try building the LibraryConfig object - try: - lc = LibraryConfig(**d) - self.libraries[libname] = lc - except TypeError as terr: # missing required argument - try: - lc = LibraryConfig.build(**d) - self.libraries[libname] = lc - except Exception as err: - raise UserWarning( - 'Error in validating the library section of the config.', libname, err, terr - ) - for inputfile in lc.inputs: - if inputfile not in self.convert and not os.path.exists(inputfile): - raise FileNotFoundError( - 'Error in validating the library section of the config. 
Input file does not exist', - libname, - inputfile, - ) - - def has_transcriptome(self): - return any([lib.is_trans() for lib in self.libraries.values()]) - - @staticmethod - def read(filepath): - """ - reads the configuration settings from the configuration file - - Args: - filepath (str): path to the input configuration file - - Returns: - List[Namespace]: namespace arguments for each library - """ - if not os.path.exists(filepath): - raise FileNotFoundError('File does not exist: {}'.format(filepath)) - parser = ConfigParser(interpolation=ExtendedInterpolation()) - parser.read(filepath) - config_dict = {} - - # get the library sections and add the default settings - for sec in parser.sections(): - config_dict.setdefault(sec, {}).update(parser[sec].items()) - return MavisConfig(**config_dict) - - -def write_config(filename, include_defaults=False, libraries=[], conversions={}, log=DEVNULL): - """ - Args: - filename (str): path to the output file - include_defaults (bool): True if default parameters should be written to the config, False otherwise - libraries (List[LibraryConfig]): library configuration sections - conversions (Dict[str,List]): conversion commands by alias name - log (Callable): function to pass output logging to - """ - config = {} - - config['reference'] = REFERENCE_DEFAULTS.to_dict() - for filetype, fname in REFERENCE_DEFAULTS.items(): - if fname is None: - warnings.warn( - 'filetype {} has not been set. 
This must be done manually before the configuration file is used'.format( - filetype - ) - ) - - if libraries: - for lib in libraries: - config[lib.library] = lib.to_dict() - - if include_defaults: - config['schedule'] = SUBMIT_OPTIONS.to_dict() - config['validate'] = VALIDATION_DEFAULTS.to_dict() - config['cluster'] = CLUSTER_DEFAULTS.to_dict() - config['annotate'] = ANNOTATION_DEFAULTS.to_dict() - config['illustrate'] = ILLUSTRATION_DEFAULTS.to_dict() - config['summary'] = SUMMARY_DEFAULTS.to_dict() - - config['convert'] = CONVERT_OPTIONS.to_dict() - for alias, command in conversions.items(): - if alias in CONVERT_OPTIONS: - raise UserWarning( - 'error in writing config. Alias for conversion product cannot be a setting', - alias, - CONVERT_OPTIONS.keys(), - ) - config['convert'][alias] = '\n'.join(command) - - for sec in config: - for tag, value in config[sec].items(): - if '_regex_' in tag: - config[sec][tag] = re.sub(r'\$', '$$', config[sec][tag]) - continue - elif not isinstance(value, str): - try: - config[sec][tag] = '\n'.join([str(v) for v in value]) - continue - except TypeError: - pass - config[sec][tag] = str(value) - - conf = ConfigParser() - for sec in config: - conf[sec] = {} - for tag, val in config[sec].items(): - conf[sec][tag] = val - log('writing:', filename) - with open(filename, 'w') as configfile: - conf.write(configfile) - - -def validate_section(section, namespace, use_defaults=False): - """ - given a dictionary of values, returns a new dict with the values casted to their appropriate type or set - to a default if the value was not given - """ - new_namespace = MavisNamespace() - if use_defaults: - new_namespace.copy_from(namespace) - - for attr, value in section.items(): - if attr not in namespace: - raise KeyError('tag not recognized', attr) - else: - cast_type = namespace.type(attr) - if namespace.is_listable(attr): - value = MavisNamespace.parse_listable_string( - value, cast_type, namespace.is_nullable(attr) - ) - else: - value = 
cast_type(value) - try: - new_namespace.add( - attr, - value, - cast_type=cast_type, - listable=namespace.is_listable(attr), - nullable=namespace.is_nullable(attr), - ) - except Exception as err: - raise ValueError('failed adding {}. {}'.format(attr, err)) - return new_namespace - - -def get_metavar(arg_type): - """ - For a given argument type, returns the string to be used for the metavar argument in add_argument - - Example: - >>> get_metavar(bool) - '{True,False}' - """ - if arg_type in [bool, tab.cast_boolean]: - return '{True,False}' - elif arg_type in [float_fraction, float]: - return 'FLOAT' - elif arg_type == int: - return 'INT' - elif arg_type == filepath: - return 'FILEPATH' - return None - - -def nameable_string(input_string): - """ - A string that can be used for library and/or filenames - """ - input_string = str(input_string) - if re.search(r'[;,_\s]', input_string): - raise TypeError('names cannot contain the reserved characters [;,_\\s]', input_string) - if input_string.lower() == 'none': - raise TypeError('names cannot be none', input_string) - if not input_string: - raise TypeError('names cannot be an empty string', input_string) - if not re.search(r'^[a-zA-Z]', input_string): - raise TypeError('names must start with a letter', input_string) - return input_string - - -def augment_parser(arguments, parser, required=None): - """ - Adds options to the argument parser. 
Separate function to facilitate the pipeline steps - all having a similar look/feel - """ - if required is None: - try: - required = bool(parser.title.startswith('required')) - except AttributeError: - pass - - for arg in arguments: - - if arg == 'help': - parser.add_argument( - '-h', '--help', action='help', help='show this help message and exit' - ) - elif arg == 'version': - parser.add_argument( - '-v', - '--version', - action='version', - version='%(prog)s version ' + __version__, - help='Outputs the version number', - ) - elif arg == 'log': - parser.add_argument('--log', help='redirect stdout to a log file', default=None) - elif arg == 'log_level': - parser.add_argument( - '--log_level', - help='level of logging to output', - choices=['INFO', 'DEBUG'], - default='INFO', - ) - elif arg == 'aligner_reference': - default = REFERENCE_DEFAULTS[arg] - parser.add_argument( - '--{}'.format(arg), - default=default, - required=required if not default else False, - help=REFERENCE_DEFAULTS.define(arg), - type=filepath, - ) - elif arg in REFERENCE_DEFAULTS: - default = REFERENCE_DEFAULTS[arg] - parser.add_argument( - '--{}'.format(arg), - default=default, - required=required if not default else False, - help=REFERENCE_DEFAULTS.define(arg), - type=filepath if required else NullableType(filepath), - nargs='*', - ) - elif arg == 'config': - parser.add_argument('config', help='path to the config file', type=filepath) - elif arg == 'bam_file': - parser.add_argument( - '--bam_file', help='path to the input bam file', required=required, type=filepath - ) - elif arg == 'read_length': - parser.add_argument( - '--read_length', - type=int, - help='the length of the reads in the bam file', - required=required, - ) - elif arg == 'stdev_fragment_size': - parser.add_argument( - '--stdev_fragment_size', - type=int, - help='expected standard deviation in insert sizes', - required=required, - ) - elif arg == 'median_fragment_size': - parser.add_argument( - '--median_fragment_size', - 
type=int, - help='median inset size for pairs in the bam file', - required=required, - ) - elif arg == 'library': - parser.add_argument( - '--library', help='library name', required=required, type=nameable_string - ) - elif arg == 'protocol': - parser.add_argument( - '--protocol', choices=PROTOCOL.values(), help='library protocol', required=required - ) - elif arg == 'disease_status': - parser.add_argument( - '--disease_status', - choices=DISEASE_STATUS.values(), - help='library disease status', - required=required, - ) - elif arg == 'skip_stage': - parser.add_argument( - '--skip_stage', - choices=[SUBCOMMAND.CLUSTER, SUBCOMMAND.VALIDATE], - action='append', - default=[], - help='Use flag once per stage to skip. Can skip clustering or validation or both', - ) - elif arg == 'strand_specific': - parser.add_argument( - '--strand_specific', - type=tab.cast_boolean, - default=False, - help='indicates that the input is strand specific', - ) - else: - value_type = None - help_msg = None - default_value = None - choices = None - nargs = None - if arg == 'aligner': - choices = SUPPORTED_ALIGNER.values() - help_msg = 'aligner to use for aligning contigs' - if arg == 'uninformative_filter': - help_msg = 'If flag is False then the clusters will not be filtered based on lack of annotation' - if arg == 'scheduler': - choices = SCHEDULER.keys() - - # get default values - for nspace in [ - CLUSTER_DEFAULTS, - VALIDATION_DEFAULTS, - ANNOTATION_DEFAULTS, - ILLUSTRATION_DEFAULTS, - PAIRING_DEFAULTS, - SUMMARY_DEFAULTS, - SUBMIT_OPTIONS, - CONVERT_OPTIONS, - ]: - if arg in nspace: - default_value = nspace[arg] - if nspace.is_listable(arg): - nargs = '*' - value_type = nspace.type(arg, None) - if nspace.is_nullable(arg): - value_type = NullableType(value_type) - if not help_msg: - help_msg = nspace.define(arg) - break - - if help_msg is None: - raise KeyError('invalid argument', arg) - parser.add_argument( - '--{}'.format(arg), - choices=choices, - nargs=nargs, - help=help_msg, - 
required=required, - default=default_value, - type=value_type, - ) - - -def generate_config(args, parser, log=DEVNULL): - """ - Args: - parser (argparse.ArgumentParser): the main parser - required: the argparse required arguments group - optional: the argparse optional arguments group - """ - libs = [] - inputs_by_lib = {} - convert = {} - try: - # process the libraries by input argument (--input) - for libconf in [LibraryConfig.parse_args(*a) for a in args.library]: - if not libconf.bam_file and SUBCOMMAND.VALIDATE not in args.skip_stage: - raise KeyError( - 'argument --library: bam file must be given if validation is not being skipped' - ) - libs.append(libconf) - inputs_by_lib[libconf.library] = set() - if ( - SUBCOMMAND.VALIDATE not in args.skip_stage - and libconf.protocol == PROTOCOL.TRANS - and (not args.annotations or args.annotations.is_empty()) - ): - parser.error( - 'argument --annotations is required to build configuration files for transcriptome libraries' - ) - - for arg_list in args.input: - inputfile = arg_list[0] - for lib in arg_list[1:]: - if lib not in inputs_by_lib: - raise KeyError( - 'argument --input: specified a library that was not configured. Please input all libraries using ' - 'the --library flag', - lib, - ) - inputs_by_lib[lib].add(inputfile) - # process the inputs by library argument (--assign) - for arg_list in args.assign: - lib = arg_list[0] - if lib not in inputs_by_lib: - raise KeyError( - 'argument --assign: specified a library that was not configured. 
Please input all libraries using ' - 'the --library flag', - lib, - ) - inputs_by_lib[lib].update(arg_list[1:]) - - for libconf in libs: - if not inputs_by_lib[libconf.library]: - raise KeyError( - 'argument --input: no input was given for the library', libconf.library - ) - libconf.inputs = inputs_by_lib[libconf.library] - - for alias, command in args.external_conversion: - if alias in convert: - raise KeyError('duplicate alias names are not allowed', alias) - convert[alias] = [] - open_option = False - for item in re.split(r'\s+', command): - if convert[alias]: - if open_option: - convert[alias][-1] += ' ' + item - open_option = False - else: - convert[alias].append(item) - if item[0] == '-': - open_option = True - else: - convert[alias].append(item) - - for arg in args.convert: - # should follow the pattern: alias file [file...] toolname [stranded] - alias = arg[0] - if alias in convert: - raise KeyError('duplicate alias names are not allowed: {}'.format(alias)) - if arg[-1] in SUPPORTED_TOOL.values(): - toolname = arg[-1] - stranded = False - inputfiles = arg[1:-1] - else: - toolname, stranded = arg[-2:] - inputfiles = arg[1:-2] - if not inputfiles: - raise KeyError('argument --convert is missing input file path(s): {}'.format(arg)) - stranded = str(tab.cast_boolean(stranded)) - SUPPORTED_TOOL.enforce(toolname) - convert[alias] = ['convert_tool_output'] + inputfiles + [toolname, stranded] - except KeyError as err: - parser.error(' '.join(err.args)) - - if SUBCOMMAND.VALIDATE not in args.skip_stage: - for i, libconf in enumerate(libs): - log('generating the config section for:', libconf.library) - libs[i] = LibraryConfig.build( - library=libconf.library, - protocol=libconf.protocol, - bam_file=libconf.bam_file, - inputs=inputs_by_lib[libconf.library], - strand_specific=libconf.strand_specific, - disease_status=libconf.disease_status, - annotations=args.annotations, - log=log, - sample_size=args.genome_bins - if libconf.protocol == PROTOCOL.GENOME - else 
args.transcriptome_bins, - distribution_fraction=args.distribution_fraction, - ) - write_config( - args.write, include_defaults=args.add_defaults, libraries=libs, conversions=convert, log=log - ) diff --git a/mavis/constants.py b/mavis/constants.py deleted file mode 100644 index 8c53cb49..00000000 --- a/mavis/constants.py +++ /dev/null @@ -1,819 +0,0 @@ -""" -module responsible for small utility functions and constants used throughout the structural_variant package -""" -import argparse -import re -import os - -from Bio.Alphabet import Gapped -from Bio.Alphabet.IUPAC import ambiguous_dna -from Bio.Data.IUPACData import ambiguous_dna_values -from Bio.Seq import Seq -from tab import cast_boolean, cast_null - - -PROGNAME = 'mavis' -EXIT_OK = 0 -EXIT_ERROR = 1 -EXIT_INCOMPLETE = 2 - - -class MavisNamespace: - """ - Namespace to hold module constants - - Example: - >>> nspace = MavisNamespace(thing=1, otherthing=2) - >>> nspace.thing - 1 - >>> nspace.otherthing - 2 - """ - - DELIM = r'[;,\s]+' - """str: delimiter to use is parsing listable variables from the environment or config file""" - - def __init__(self, *pos, **kwargs): - object.__setattr__(self, '_defns', {}) - object.__setattr__(self, '_types', {}) - object.__setattr__(self, '_members', {}) - object.__setattr__(self, '_nullable', set()) - object.__setattr__(self, '_listable', set()) - object.__setattr__(self, '_env_overwritable', set()) - object.__setattr__(self, '_env_prefix', 'MAVIS') - if '__name__' in kwargs: # for building auto documentation - object.__setattr__(self, '__name__', kwargs.pop('__name__')) - - for k in pos: - if k in self._members: - raise AttributeError('Cannot respecify existing attribute', k, self._members[k]) - self[k] = k - - for attr, val in kwargs.items(): - if attr in self._members: - raise AttributeError( - 'Cannot respecify existing attribute', attr, self._members[attr] - ) - self[attr] = val - - for attr, value in self._members.items(): - self._set_type(attr, type(value)) - - def 
__repr__(self): - return '{}({})'.format( - self.__class__.__name__, - ', '.join(sorted(['{}={}'.format(k, repr(v)) for k, v in self.items()])), - ) - - def discard(self, attr): - """ - Remove a variable if it exists - """ - self._members.pop(attr, None) - self._listable.discard(attr) - self._nullable.discard(attr) - self._defns.pop(attr, None) - self._types.pop(attr, None) - self._env_overwritable.discard(attr) - - def get_env_name(self, attr): - """ - Get the name of the corresponding environment variable - - Example: - >>> nspace = MavisNamespace(a=1) - >>> nspace.get_env_name('a') - 'MAVIS_A' - """ - if self._env_prefix: - return '{}_{}'.format(self._env_prefix, attr).upper() - return attr.upper() - - def get_env_var(self, attr): - """ - retrieve the environment variable definition of a given attribute - """ - env_name = self.get_env_name(attr) - env = os.environ[env_name].strip() - attr_type = self._types.get(attr, str) - - if attr in self._listable: - return self.parse_listable_string(env, attr_type, attr in self._nullable) - if attr in self._nullable and env.lower() == 'none': - return None - return attr_type(env) - - @classmethod - def parse_listable_string(cls, string, cast_type=str, nullable=False): - """ - Given some string, parse it into a list - - Example: - >>> MavisNamespace.parse_listable_string('1,2,3', int) - [1, 2, 3] - >>> MavisNamespace.parse_listable_string('1;2,None', int, True) - [1, 2, None] - """ - result = [] - string = string.strip() - for val in re.split(cls.DELIM, string) if string else []: - if nullable and val.lower() == 'none': - result.append(None) - else: - result.append(cast_type(val)) - return result - - def is_env_overwritable(self, attr): - """ - Returns: - bool: True if the variable is overrided by specifying the environment variable equivalent - """ - return attr in self._env_overwritable - - def is_listable(self, attr): - """ - Returns: - bool: True if the variable should be parsed as a list - """ - return attr in 
self._listable - - def is_nullable(self, attr): - """ - Returns: - bool: True if the variable can be set to None - """ - return attr in self._nullable - - def __getattribute__(self, attr): - try: - return object.__getattribute__(self, attr) - except AttributeError as err: - variables = object.__getattribute__(self, '_members') - if attr not in variables: - raise err - if self.is_env_overwritable(attr): - try: - return self.get_env_var(attr) - except KeyError: - pass - return variables[attr] - - def items(self): - """ - Example: - >>> MavisNamespace(thing=1, otherthing=2).items() - [('thing', 1), ('otherthing', 2)] - """ - return [(k, self[k]) for k in self.keys()] - - def to_dict(self): - return dict(self.items()) - - def __getitem__(self, key): - return getattr(self, key) - - def __setitem__(self, key, val): - self.__setattr__(key, val) - - def __setattr__(self, attr, val): - if attr.startswith('_'): - raise ValueError('cannot set private', attr) - object.__getattribute__(self, '_members')[attr] = val - - def copy_from(self, source, attrs=None): - """ - Copy variables from one namespace onto the current namespace - """ - if attrs is None: - attrs = source.keys() - for attr in attrs: - self.add( - attr, - source[attr], - listable=source.is_listable(attr), - nullable=source.is_nullable(attr), - defn=source.define(attr, None), - cast_type=source.type(attr, None), - ) - - def get(self, key, *pos): - """ - get an attribute, return a default (if given) if the attribute does not exist - - Example: - >>> nspace = MavisNamespace(thing=1, otherthing=2) - >>> nspace.get('thing', 2) - 1 - >>> nspace.get('nonexistant_thing', 2) - 2 - >>> nspace.get('nonexistant_thing') - Traceback (most recent call last): - .... - """ - if len(pos) > 1: - raise TypeError('too many arguments. 
get takes a single \'default\' value argument') - try: - return self[key] - except AttributeError as err: - if pos: - return pos[0] - raise err - - def keys(self): - """ - get the attribute keys as a list - - Example: - >>> MavisNamespace(thing=1, otherthing=2).keys() - ['thing', 'otherthing'] - """ - return [k for k in self._members] - - def values(self): - """ - get the attribute values as a list - - Example: - >>> MavisNamespace(thing=1, otherthing=2).values() - [1, 2] - """ - return [self[k] for k in self._members] - - def enforce(self, value): - """ - checks that the current namespace has a given value - - Returns: - the input value - - Raises: - KeyError: the value did not exist - - Example: - >>> nspace = MavisNamespace(thing=1, otherthing=2) - >>> nspace.enforce(1) - 1 - >>> nspace.enforce(3) - Traceback (most recent call last): - .... - """ - if value not in self.values(): - raise KeyError( - 'value {0} is not a valid member of {1}'.format(repr(value), self.values()) - ) - return value - - def reverse(self, value): - """ - for a given value, return the associated key - - Args: - value: the value to get the key/attribute name for - - Raises: - KeyError: the value is not unique - KeyError: the value is not assigned - - Example: - >>> nspace = MavisNamespace(thing=1, otherthing=2) - >>> nspace.reverse(1) - 'thing' - """ - result = [] - for key in self.keys(): - if self[key] == value: - result.append(key) - if len(result) > 1: - raise KeyError('could not reverse, the mapping is not unique', value, result) - elif not result: - raise KeyError('input value is not assigned to a key', value) - return result[0] - - def __iter__(self): - return iter(self.keys()) - - def _set_type(self, attr, cast_type): - if cast_type == bool: - self._types[attr] = cast_boolean - else: - self._types[attr] = cast_type - - def type(self, attr, *pos): - """ - returns the type - - Example: - >>> nspace = MavisNamespace(thing=1, otherthing=2) - >>> nspace.type('thing') - - """ - if 
len(pos) > 1: - raise TypeError('too many arguments. type takes a single \'default\' value argument') - try: - return self._types[attr] - except AttributeError as err: - if pos: - return pos[0] - raise err - - def define(self, attr, *pos): - """ - Get the definition of a given attribute or return a default (when given) if the attribute does not exist - - Returns: - str: definition for the attribute - - Raises: - KeyError: the attribute does not exist and a default was not given - - Example: - >>> nspace = MavisNamespace() - >>> nspace.add('thing', 1, defn='I am a thing') - >>> nspace.add('otherthing', 2) - >>> nspace.define('thing') - 'I am a thing' - >>> nspace.define('otherthing') - Traceback (most recent call last): - .... - >>> nspace.define('otherthing', 'I am some other thing') - 'I am some other thing' - """ - if len(pos) > 1: - raise TypeError('too many arguments. define takes a single \'default\' value argument') - try: - return self._defns[attr] - except KeyError as err: - if pos: - return pos[0] - raise err - - def add( - self, - attr, - value, - defn=None, - cast_type=None, - nullable=False, - env_overwritable=False, - listable=False, - ): - """ - Add an attribute to the name space - - Args: - attr (str): name of the attribute being added - value: the value of the attribute - defn (str): the definition, will be used in generating documentation and help menus - cast_type (Callable): the function to use in casting the value - nullable (bool): True if this attribute can have a None value - env_overwritable (bool): True if this attribute will be overriden by its environment variable equivalent - listable (bool): True if this attribute can have multiple values - - Example: - >>> nspace = MavisNamespace() - >>> nspace.add('thing', 1, int, 'I am a thing') - >>> nspace = MavisNamespace() - >>> nspace.add('thing', 1, int) - >>> nspace = MavisNamespace() - >>> nspace.add('thing', 1) - >>> nspace = MavisNamespace() - >>> nspace.add('thing', value=1, cast_type=int, 
defn='I am a thing') - """ - if cast_type: - self._set_type(attr, cast_type) - else: - self._set_type(attr, type(value)) - if defn: - self._defns[attr] = defn - - if nullable: - self._nullable.add(attr) - if env_overwritable: - self._env_overwritable.add(attr) - if listable: - self._listable.add(attr) - self[attr] = value - - def __call__(self, value): - try: - return self.enforce(value) - except KeyError: - raise TypeError( - 'Invalid value {} for {}. Must be a valid member: {}'.format( - repr(value), self.__class__.__name__, self.values() - ) - ) - - -def float_fraction(num): - """ - cast input to a float - - Args: - num: input to cast - - Returns: - float - - Raises: - TypeError: if the input cannot be cast to a float or the number is not between 0 and 1 - """ - try: - num = float(num) - except ValueError: - raise argparse.ArgumentTypeError('Must be a value between 0 and 1') - if num < 0 or num > 1: - raise argparse.ArgumentTypeError('Must be a value between 0 and 1') - return num - - -COMPLETE_STAMP = 'MAVIS.COMPLETE' -"""str: Filename for all complete stamp files""" - -SUBCOMMAND = MavisNamespace( - ANNOTATE='annotate', - VALIDATE='validate', - SETUP='setup', - SCHEDULE='schedule', - CLUSTER='cluster', - PAIR='pairing', - SUMMARY='summary', - CONFIG='config', - CONVERT='convert', - OVERLAY='overlay', -) -"""MavisNamespace: holds controlled vocabulary for allowed pipeline stage values - -- annotate -- cluster -- config -- convert -- pairing -- pipeline -- schedule -- summary -- validate -""" - - -CODON_SIZE = 3 -"""int: the number of bases making up a codon""" - - -def reverse_complement(s): - """ - wrapper for the Bio.Seq reverse_complement method - - Args: - s (str): the input DNA sequence - - Returns: - str: the reverse complement of the input sequence - - Warning: - assumes the input is a DNA sequence - - Example: - >>> reverse_complement('ATCCGGT') - 'ACCGGAT' - """ - input_string = str(s) - if not re.match('^[A-Za-z]*$', input_string): - raise 
ValueError('unexpected sequence format. cannot reverse complement', input_string) - input_string = Seq(input_string, DNA_ALPHABET) - return str(input_string.reverse_complement()) - - -def translate(s, reading_frame=0): - """ - given a DNA sequence, translates it and returns the protein amino acid sequence - - Args: - s (str): the input DNA sequence - reading_frame (int): where to start translating the sequence - - Returns: - str: the amino acid sequence - """ - reading_frame = reading_frame % CODON_SIZE - - temp = s[reading_frame:] - if len(temp) % 3 == 1: - temp = temp[:-1] - elif len(temp) % 3 == 2: - temp = temp[:-2] - temp = Seq(temp, DNA_ALPHABET) - return str(temp.translate()) - - -GAP = '-' - -ORIENT = MavisNamespace(LEFT='L', RIGHT='R', NS='?') -"""MavisNamespace: holds controlled vocabulary for allowed orientation values - -- ``LEFT``: left wrt to the positive/forward strand -- ``RIGHT``: right wrt to the positive/forward strand -- ``NS``: orientation is not specified -""" -setattr(ORIENT, 'expand', lambda x: [ORIENT.LEFT, ORIENT.RIGHT] if x == ORIENT.NS else [x]) -setattr(ORIENT, 'compare', lambda x, y: True if ORIENT.NS in [x, y] else (x == y)) - -PROTOCOL = MavisNamespace(GENOME='genome', TRANS='transcriptome') -"""MavisNamespace: holds controlled vocabulary for allowed protocol values - -- ``GENOME``: genome -- ``TRANS``: transcriptome -""" - -DISEASE_STATUS = MavisNamespace(DISEASED='diseased', NORMAL='normal') -"""MavisNamespace: holds controlled vocabulary for allowed disease status - -- ``DISEASED``: diseased -- ``NORMAL``: normal -""" - -STRAND = MavisNamespace(POS='+', NEG='-', NS='?') -"""MavisNamespace: holds controlled vocabulary for allowed strand values - -- ``POS``: the positive/forward strand -- ``NEG``: the negative/reverse strand -- ``NS``: strand is not specified -""" -setattr(STRAND, 'expand', lambda x: [STRAND.POS, STRAND.NEG] if x == STRAND.NS else [x]) -setattr(STRAND, 'compare', lambda x, y: True if STRAND.NS in [x, y] else (x == 
y)) - -SVTYPE = MavisNamespace( - DEL='deletion', - TRANS='translocation', - ITRANS='inverted translocation', - INV='inversion', - INS='insertion', - DUP='duplication', -) -"""MavisNamespace: holds controlled vocabulary for acceptable structural variant classifications - -- ``DEL``: deletion -- ``TRANS``: translocation -- ``ITRANS``: inverted translocation -- ``INV``: inversion -- ``INS``: insertion -- ``DUP``: duplication -""" - -CIGAR = MavisNamespace(M=0, I=1, D=2, N=3, S=4, H=5, P=6, X=8, EQ=7) # noqa -"""MavisNamespace: Enum-like. For readable cigar values - -- ``M``: alignment match (can be a sequence match or mismatch) -- ``I``: insertion to the reference -- ``D``: deletion from the reference -- ``N``: skipped region from the reference -- ``S``: soft clipping (clipped sequences present in SEQ) -- ``H``: hard clipping (clipped sequences NOT present in SEQ) -- ``P``: padding (silent deletion from padded reference) -- ``EQ``: sequence match (=) -- ``X``: sequence mismatch - -note: descriptions are taken from the `samfile documentation `_ -""" - -NA_MAPPING_QUALITY = 255 -"""int: mapping quality value to indicate mapping was not performed/calculated""" - -PYSAM_READ_FLAGS = MavisNamespace( - REVERSE=16, - MATE_REVERSE=32, - UNMAPPED=4, - MATE_UNMAPPED=8, - FIRST_IN_PAIR=64, - LAST_IN_PAIR=128, - SECONDARY=256, - MULTIMAP=1, - SUPPLEMENTARY=2048, - TARGETED_ALIGNMENT='ta', - RECOMPUTED_CIGAR='rc', - BLAT_RANK='br', - BLAT_SCORE='bs', - BLAT_ALIGNMENTS='ba', - BLAT_PERCENT_IDENTITY='bi', - BLAT_PMS='bp', -) - -"""MavisNamespace: Enum-like. 
For readable PYSAM flag constants - -- ``MULTIMAP``: template having multiple segments in sequencing -- ``UNMAPPED``: segment unmapped -- ``MATE_UNMAPPED``: next segment in the template unmapped -- ``REVERSE``: SEQ being reverse complemented -- ``MATE_REVERSE``: SEQ of the next segment in the template being reverse complemented -- ``FIRST_IN_PAIR``: the first segment in the template -- ``LAST_IN_PAIR``: the last segment in the template -- ``SECONDARY``: secondary alignment -- ``SUPPLEMENTARY``: supplementary alignment - -note: descriptions are taken from the `samfile documentation `_ -""" - -# read paired, read mapped in proper pair, mate reverse strand, first in pair - - -def _match_ambiguous_dna(x, y): - """ - >>> _match_ambiguous_dna('A', 'N') - True - >>> _match_ambiguous_dna('A', 'T') - False - >>> _match_ambiguous_dna('A', 'A') - True - """ - x = x.upper() - y = y.upper() - xset = set(ambiguous_dna_values.get(x, x)) - yset = set(ambiguous_dna_values.get(y, y)) - if not xset.intersection(yset): - return False - return True - - -DNA_ALPHABET = alphabet = Gapped(ambiguous_dna, '-') -DNA_ALPHABET.match = lambda x, y: _match_ambiguous_dna(x, y) - -FLAGS = MavisNamespace(LQ='LOWQUAL') - -READ_PAIR_TYPE = MavisNamespace(RR='RR', LL='LL', RL='RL', LR='LR') - -CALL_METHOD = MavisNamespace( - CONTIG='contig', - SPLIT='split reads', - FLANK='flanking reads', - SPAN='spanning reads', - INPUT='input', -) -"""MavisNamespace: holds controlled vocabulary for allowed call methods - -- ``CONTIG``: a contig was assembled and aligned across the breakpoints -- ``SPLIT``: the event was called by [split read](/glossary/#split-read) -- ``FLANK``: the event was called by [flanking read pair](/glossary/#flanking-read-pair) -- ``SPAN``: the event was called by [spanning read](/glossary/#spanning-read) -""" - -GENE_PRODUCT_TYPE = MavisNamespace(SENSE='sense', ANTI_SENSE='anti-sense') -"""MavisNamespace: controlled vocabulary for gene products - -- ``SENSE``: the gene product is a sense 
fusion -- ``ANTI_SENSE``: the gene product is anti-sense -""" - -PRIME = MavisNamespace(FIVE=5, THREE=3) -"""MavisNamespace: holds controlled vocabulary - -- ``FIVE``: five prime -- ``THREE``: three prime -""" - -START_AA = 'M' -"""str: The amino acid expected to start translation -""" -STOP_AA = '*' -"""str: The amino acid expected to end translation -""" - -GIEMSA_STAIN = MavisNamespace( - GNEG='gneg', - GPOS33='gpos33', - GPOS50='gpos50', - GPOS66='gpos66', - GPOS75='gpos75', - GPOS25='gpos25', - GPOS100='gpos100', - ACEN='acen', - GVAR='gvar', - STALK='stalk', -) -"""MavisNamespace: holds controlled vocabulary relating to stains of chromosome bands""" - -# content related to tabbed files for input/output -# ensure that we don't have to change ALL the code when we update column names - - -COLUMNS = MavisNamespace( - tracking_id='tracking_id', - library='library', - cluster_id='cluster_id', - cluster_size='cluster_size', - validation_id='validation_id', - annotation_id='annotation_id', - product_id='product_id', - event_type='event_type', - pairing='pairing', - inferred_pairing='inferred_pairing', - gene1='gene1', - gene1_direction='gene1_direction', - gene2='gene2', - gene2_direction='gene2_direction', - gene1_aliases='gene1_aliases', - gene2_aliases='gene2_aliases', - gene_product_type='gene_product_type', - transcript1='transcript1', - transcript2='transcript2', - fusion_splicing_pattern='fusion_splicing_pattern', - fusion_cdna_coding_start='fusion_cdna_coding_start', - fusion_cdna_coding_end='fusion_cdna_coding_end', - fusion_mapped_domains='fusion_mapped_domains', - fusion_sequence_fasta_id='fusion_sequence_fasta_id', - fusion_sequence_fasta_file='fusion_sequence_fasta_file', - fusion_protein_hgvs='fusion_protein_hgvs', - annotation_figure='annotation_figure', - annotation_figure_legend='annotation_figure_legend', - genes_encompassed='genes_encompassed', - genes_overlapping_break1='genes_overlapping_break1', - 
genes_overlapping_break2='genes_overlapping_break2', - genes_proximal_to_break1='genes_proximal_to_break1', - genes_proximal_to_break2='genes_proximal_to_break2', - break1_chromosome='break1_chromosome', - break1_position_start='break1_position_start', - break1_position_end='break1_position_end', - break1_orientation='break1_orientation', - exon_last_5prime='exon_last_5prime', - exon_first_3prime='exon_first_3prime', - break1_strand='break1_strand', - break1_seq='break1_seq', - break2_chromosome='break2_chromosome', - break2_position_start='break2_position_start', - break2_position_end='break2_position_end', - break2_orientation='break2_orientation', - break2_strand='break2_strand', - break2_seq='break2_seq', - opposing_strands='opposing_strands', - stranded='stranded', - protocol='protocol', - disease_status='disease_status', - tools='tools', - call_method='call_method', - break1_ewindow='break1_ewindow', - break1_ewindow_count='break1_ewindow_count', - break1_ewindow_practical_coverage='break1_ewindow_practical_coverage', - break1_homologous_seq='break1_homologous_seq', - break1_split_read_names='break1_split_read_names', - break1_split_reads='break1_split_reads', - break1_split_reads_forced='break1_split_reads_forced', - break2_ewindow='break2_ewindow', - break2_ewindow_count='break2_ewindow_count', - break2_ewindow_practical_coverage='break2_ewindow_practical_coverage', - break2_homologous_seq='break2_homologous_seq', - break2_split_read_names='break2_split_read_names', - break2_split_reads='break2_split_reads', - break2_split_reads_forced='break2_split_reads_forced', - contig_alignment_query_consumption='contig_alignment_query_consumption', - contig_alignment_score='contig_alignment_score', - contig_alignment_query_name='contig_alignment_query_name', - contig_read_depth='contig_read_depth', - contig_break1_read_depth='contig_break1_read_depth', - contig_break2_read_depth='contig_break2_read_depth', - contig_alignment_rank='contig_alignment_rank', - 
contig_build_score='contig_build_score', - contig_remap_score='contig_remap_score', - contig_remap_coverage='contig_remap_coverage', - contig_remapped_read_names='contig_remapped_read_names', - contig_remapped_reads='contig_remapped_reads', - contig_seq='contig_seq', - contig_strand_specific='contig_strand_specific', - contigs_assembled='contigs_assembled', - call_sequence_complexity='call_sequence_complexity', - spanning_reads='spanning_reads', - spanning_read_names='spanning_read_names', - flanking_median_fragment_size='flanking_median_fragment_size', - flanking_pairs='flanking_pairs', - flanking_pairs_compatible='flanking_pairs_compatible', - flanking_pairs_read_names='flanking_pairs_read_names', - flanking_pairs_compatible_read_names='flanking_pairs_compatible_read_names', - flanking_stdev_fragment_size='flanking_stdev_fragment_size', - linking_split_read_names='linking_split_read_names', - linking_split_reads='linking_split_reads', - raw_break1_half_mapped_reads='raw_break1_half_mapped_reads', - raw_break1_split_reads='raw_break1_split_reads', - raw_break2_half_mapped_reads='raw_break2_half_mapped_reads', - raw_break2_split_reads='raw_break2_split_reads', - raw_flanking_pairs='raw_flanking_pairs', - raw_spanning_reads='raw_spanning_reads', - untemplated_seq='untemplated_seq', - filter_comment='filter_comment', - cdna_synon='cdna_synon', - protein_synon='protein_synon', - supplementary_call='supplementary_call', - net_size='net_size', - repeat_count='repeat_count', - assumed_untemplated='assumed_untemplated', -) -"""MavisNamespace: Column names for i/o files used throughout the pipeline - -see [column descriptions](/outputs/columns) -""" - - -def sort_columns(input_columns): - order = {} - for i, col in enumerate(COLUMNS.values()): - order[col] = i - temp = sorted([c for c in input_columns if c in order], key=lambda x: order[x]) - temp = temp + sorted([c for c in input_columns if c not in order]) - return temp diff --git a/mavis/main.py b/mavis/main.py deleted 
file mode 100644 index f3aec4cc..00000000 --- a/mavis/main.py +++ /dev/null @@ -1,609 +0,0 @@ -#!python -import argparse -import logging -import platform -import os -import time -import sys - -import tab - -from . import __version__ -from .align import get_aligner_version -from . import annotate as _annotate -from .annotate import main as annotate_main -from .cluster.constants import DEFAULTS as CLUSTER_DEFAULTS -from .cluster import main as cluster_main -from . import config as _config -from .constants import SUBCOMMAND, PROTOCOL, float_fraction, EXIT_OK -from .error import DrawingFitError -from .illustrate.constants import DEFAULTS as ILLUSTRATION_DEFAULTS, DiagramSettings -from .illustrate.diagram import draw_multi_transcript_overlay -from .illustrate.scatter import bam_to_scatter -from .pairing.constants import DEFAULTS as PAIRING_DEFAULTS -from .pairing import main as pairing_main -from .summary.constants import DEFAULTS as SUMMARY_DEFAULTS -from .summary import main as summary_main -from .tools import convert_tool_output, SUPPORTED_TOOL -from . 
import util as _util -from .validate.constants import DEFAULTS as VALIDATION_DEFAULTS -from .validate import main as validate_main -from .schedule import pipeline as _pipeline - - -def check_overlay_args(args, parser): - """ - parse the overlay options and check the formatting - """ - # check complex options - for marker in args.markers: - if len(marker) < 3: - marker.append(marker[-1]) - try: - marker[1] = int(marker[1]) - marker[2] = int(marker[2]) - except ValueError: - parser.error('argument --marker: start and end must be integers: {}'.format(marker)) - - defaults = [None, None, 0.5, None, True] - bam_file, density, ymax, stranded = range(1, 5) - - for plot in args.read_depth_plots: - for i, d in enumerate(defaults): - if i >= len(plot): - plot.append(d) - if not os.path.exists(plot[bam_file]): - parser.error( - 'argument --read_depth_plots: the bam file given does not exist: {}'.format( - plot[bam_file] - ) - ) - try: - plot[density] = float(plot[density]) - if plot[density] < 0 or plot[density] > 1: - raise ValueError() - except ValueError: - parser.error( - 'argument --read_depth_plots: density must be an float between 0 and 1: {}'.format( - plot[density] - ) - ) - try: - if str(plot[ymax]).lower() in ['null', 'none']: - plot[ymax] = None - else: - plot[ymax] = int(plot[ymax]) - except ValueError: - parser.error( - 'argument --read_depth_plots: ymax must be an integer: {}'.format(plot[ymax]) - ) - try: - plot[stranded] = tab.cast_boolean(plot[stranded]) - except TypeError: - parser.error( - 'argument --read_depth_plots: stranded must be an boolean: {}'.format( - plot[stranded] - ) - ) - return args - - -def overlay_main( - gene_name, - output, - buffer_length, - read_depth_plots, - markers, - annotations, - drawing_width_iter_increase, - max_drawing_retries, - min_mapping_quality, - ymax_color='#FF0000', - **kwargs -): - """ - generates an overlay diagram - """ - annotations.load() - # check options formatting - gene_to_draw = None - - for chrom in 
annotations.content: - for gene in annotations.content[chrom]: - if gene_name in gene.aliases or gene_name == gene.name: - gene_to_draw = gene - _util.LOG( - 'Found target gene: {}(aka. {}) {}:{}-{}'.format( - gene.name, gene.aliases, gene.chr, gene.start, gene.end - ) - ) - break - if gene_to_draw is None: - raise KeyError('Could not find gene alias or id in annotations file', gene_name) - - settings = DiagramSettings(**kwargs) - - genomic_min = max(gene_to_draw.start - buffer_length, 1) - genomic_max = gene_to_draw.end + buffer_length - - plots = [] - for axis_name, bam_file, density, ymax, stranded in read_depth_plots: - # one plot per bam - plots.append( - bam_to_scatter( - bam_file, - gene_to_draw.chr, - genomic_min, - genomic_max, - strand=gene_to_draw.get_strand() if stranded else None, - ymax=ymax, - density=density, - axis_name=axis_name, - min_mapping_quality=min_mapping_quality, - ymax_color=ymax_color, - ) - ) - - for i, (marker_name, marker_start, marker_end) in enumerate(markers): - markers[i] = _annotate.base.BioInterval( - gene_to_draw.chr, marker_start, marker_end, name=marker_name - ) - - canvas = None - attempts = 1 - while True: - try: - canvas = draw_multi_transcript_overlay( - settings, - gene_to_draw, - vmarkers=markers, - plots=plots, - window_buffer=buffer_length, - log=_util.LOG, - ) - break - except DrawingFitError as err: - if attempts > max_drawing_retries: - raise err - _util.LOG('Drawing fit: extending window', drawing_width_iter_increase) - settings.width += drawing_width_iter_increase - attempts += 1 - - svg_output_file = os.path.join(output, '{}_{}_overlay.svg'.format(gene_to_draw.name, gene_name)) - _util.LOG('writing:', svg_output_file) - - canvas.saveas(svg_output_file) - - -def convert_main(inputs, outputfile, file_type, strand_specific=False, assume_no_untemplated=True): - bpp_results = convert_tool_output( - inputs, - file_type, - strand_specific, - _util.LOG, - True, - assume_no_untemplated=assume_no_untemplated, - ) - if 
os.path.dirname(outputfile): - _util.mkdirp(os.path.dirname(outputfile)) - _util.output_tabbed_file(bpp_results, outputfile) - - -def main(argv=None): - """ - sets up the parser and checks the validity of command line args - loads reference files and redirects into subcommand main functions - - Args: - argv (list): List of arguments, defaults to command line arguments - """ - if argv is None: # need to do at run time or patching will not behave as expected - argv = sys.argv[1:] - start_time = int(time.time()) - - parser = argparse.ArgumentParser(formatter_class=_config.CustomHelpFormatter) - _config.augment_parser(['version'], parser) - subp = parser.add_subparsers( - dest='command', help='specifies which step/stage in the pipeline or which subprogram to use' - ) - subp.required = True - required = {} # hold required argument group by subparser command name - optional = {} # hold optional argument group by subparser command name - for command in SUBCOMMAND.values(): - subparser = subp.add_parser( - command, formatter_class=_config.CustomHelpFormatter, add_help=False - ) - required[command] = subparser.add_argument_group('required arguments') - optional[command] = subparser.add_argument_group('optional arguments') - _config.augment_parser(['help', 'version', 'log', 'log_level'], optional[command]) - - # config arguments - required[SUBCOMMAND.CONFIG].add_argument( - '-w', - '--write', - help='path to the new configuration file', - required=True, - metavar='FILEPATH', - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--library', - metavar=' {genome,transcriptome} {diseased,normal} [strand_specific] [/path/to/bam/file]', - action=_config.RangeAppendAction, - help='configuration for libraries to be analyzed by mavis', - nmin=3, - nmax=5, - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--input', - help='path to an input file or filter for mavis followed by the library names it ' - 'should be used for', - nmin=2, - action=_config.RangeAppendAction, - metavar='FILEPATH 
[ ...]', - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--assign', - help='library name followed by path(s) to input file(s) or filter names. This represents the list' - ' of inputs that should be used for the library', - action=_config.RangeAppendAction, - nmin=2, - metavar=' FILEPATH [FILEPATH ...]', - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--genome_bins', - default=_util.get_env_variable('genome_bins', 100), - type=int, - metavar=_config.get_metavar(int), - help='number of bins/samples to use in calculating the fragment size stats for genomes', - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--transcriptome_bins', - default=_util.get_env_variable('transcriptome_bins', 500), - type=int, - metavar=_config.get_metavar(int), - help='number of genes to use in calculating the fragment size stats for genomes', - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--distribution_fraction', - default=_util.get_env_variable('distribution_fraction', 0.97), - type=float_fraction, - metavar=_config.get_metavar(float), - help='the proportion of the distribution of calculated fragment sizes to use in determining the stdev', - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--convert', - nmin=3, - metavar=' FILEPATH [FILEPATH ...] 
{{{}}} [stranded]'.format( - ','.join(SUPPORTED_TOOL.values()) - ), - help='input file conversion for internally supported tools', - action=_config.RangeAppendAction, - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--external_conversion', - metavar=('', '<"command">'), - nargs=2, - default=[], - help='alias for use in inputs and full command (quoted)', - action='append', - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--add_defaults', - default=False, - action='store_true', - help='write current defaults for all non-specified options to the config output', - ) - _config.augment_parser(['annotations'], optional[SUBCOMMAND.CONFIG]) - # add the optional annotations file (only need this is auto generating bam stats for the transcriptome) - _config.augment_parser(['skip_stage'], optional[SUBCOMMAND.CONFIG]) - - # convert - required[SUBCOMMAND.CONVERT].add_argument( - '--file_type', - choices=sorted(SUPPORTED_TOOL.values()), - required=True, - help='Indicates the input file type to be parsed', - ) - _config.augment_parser( - ['strand_specific', 'assume_no_untemplated'], optional[SUBCOMMAND.CONVERT] - ) - required[SUBCOMMAND.CONVERT].add_argument( - '--outputfile', '-o', required=True, help='path to the outputfile', metavar='FILEPATH' - ) - - for command in set(SUBCOMMAND.values()) - {SUBCOMMAND.CONFIG, SUBCOMMAND.CONVERT}: - required[command].add_argument( - '-o', '--output', help='path to the output directory', required=True - ) - - # pipeline - _config.augment_parser(['config'], required[SUBCOMMAND.SETUP]) - optional[SUBCOMMAND.SETUP].add_argument( - '--skip_stage', - choices=[SUBCOMMAND.CLUSTER, SUBCOMMAND.VALIDATE], - action='append', - default=[], - help='Use flag once per stage to skip. 
Can skip clustering or validation or both', - ) - - # schedule arguments - optional[SUBCOMMAND.SCHEDULE].add_argument( - '--submit', - action='store_true', - default=False, - help='submit jobs to the the scheduler specified', - ) - optional[SUBCOMMAND.SCHEDULE].add_argument( - '--resubmit', - action='store_true', - default=False, - help='resubmit jobs in error states to the the scheduler specified', - ) - - # add the inputs argument - for command in [ - SUBCOMMAND.CLUSTER, - SUBCOMMAND.ANNOTATE, - SUBCOMMAND.VALIDATE, - SUBCOMMAND.PAIR, - SUBCOMMAND.SUMMARY, - SUBCOMMAND.CONVERT, - ]: - required[command].add_argument( - '-n', - '--inputs', - nargs='+', - help='path to the input files', - required=True, - metavar='FILEPATH', - ) - - # cluster - _config.augment_parser( - ['library', 'protocol', 'strand_specific', 'disease_status'], required[SUBCOMMAND.CLUSTER] - ) - _config.augment_parser( - list(CLUSTER_DEFAULTS.keys()) + ['masking', 'annotations'], optional[SUBCOMMAND.CLUSTER] - ) - optional[SUBCOMMAND.CLUSTER].add_argument( - '--batch_id', help='batch id to use for prefix of split files', type=_config.nameable_string - ) - optional[SUBCOMMAND.CLUSTER].add_argument( - '--split_only', - help='Cluster the files or simply split them without clustering', - type=tab.cast_boolean, - ) - - # validate - _config.augment_parser( - [ - 'library', - 'protocol', - 'bam_file', - 'read_length', - 'stdev_fragment_size', - 'median_fragment_size', - 'strand_specific', - 'reference_genome', - 'aligner_reference', - ], - required[SUBCOMMAND.VALIDATE], - ) - _config.augment_parser(VALIDATION_DEFAULTS.keys(), optional[SUBCOMMAND.VALIDATE]) - _config.augment_parser(['masking', 'annotations'], optional[SUBCOMMAND.VALIDATE]) - - # annotate - _config.augment_parser( - ['library', 'protocol', 'annotations', 'reference_genome'], required[SUBCOMMAND.ANNOTATE] - ) - _config.augment_parser( - ['max_proximity', 'masking', 'template_metadata'], optional[SUBCOMMAND.ANNOTATE] - ) - 
_config.augment_parser( - list(_annotate.constants.DEFAULTS.keys()) + list(ILLUSTRATION_DEFAULTS.keys()), - optional[SUBCOMMAND.ANNOTATE], - ) - - # pair - _config.augment_parser(['annotations'], required[SUBCOMMAND.PAIR], optional[SUBCOMMAND.PAIR]) - _config.augment_parser( - ['max_proximity'] + list(PAIRING_DEFAULTS.keys()), optional[SUBCOMMAND.PAIR] - ) - - # summary - _config.augment_parser( - [ - 'annotations', - 'flanking_call_distance', - 'split_call_distance', - 'contig_call_distance', - 'spanning_call_distance', - ], - required[SUBCOMMAND.SUMMARY], - ) - _config.augment_parser(SUMMARY_DEFAULTS.keys(), optional[SUBCOMMAND.SUMMARY]) - _config.augment_parser(['dgv_annotation'], optional[SUBCOMMAND.SUMMARY]) - - # overlay arguments - required[SUBCOMMAND.OVERLAY].add_argument('gene_name', help='Gene ID or gene alias to be drawn') - _config.augment_parser(['annotations'], required[SUBCOMMAND.OVERLAY]) - _config.augment_parser( - ['drawing_width_iter_increase', 'max_drawing_retries', 'width', 'min_mapping_quality'], - optional[SUBCOMMAND.OVERLAY], - ) - optional[SUBCOMMAND.OVERLAY].add_argument( - '--buffer_length', - default=0, - type=int, - help='minimum genomic length to plot on either side of the target gene', - ) - optional[SUBCOMMAND.OVERLAY].add_argument( - '--read_depth_plot', - dest='read_depth_plots', - metavar=' [density FLOAT] [ymax INT] [stranded BOOL]', - nmin=2, - nmax=5, - help='bam file to use as data for plotting read_depth', - action=_config.RangeAppendAction, - ) - optional[SUBCOMMAND.OVERLAY].add_argument( - '--marker', - dest='markers', - metavar='