From 65db15126e814afbe23af13850c5dc310f3e7632 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Fri, 16 Apr 2021 12:04:23 -0700 Subject: [PATCH 001/137] Use snakemake to manage mavis scheduling - replace mavis scheduler with snakemake - use JSON config for snakemake instead of many command line arguments --- .github/workflows/build.yml | 1 - .gitignore | 9 +- MANIFEST.in | 3 +- Snakefile | 198 ++++ docs/configuration/pipeline.md | 3 + docs/hooks.py | 2 - .../snakemake.cluster.full-tutorial.png | Bin 0 -> 28669 bytes .../snakemake.cluster.mini-tutorial.png | Bin 0 -> 12934 bytes .../snakemake.validate.mini-tutorial.png | Bin 0 -> 34485 bytes docs/inputs/support.md | 23 +- docs/tutorials/mini.md | 106 +- mavis/annotate/file_io.py | 17 +- mavis/annotate/main.py | 86 +- mavis/cluster/main.py | 128 ++- mavis/config.py | 814 +++------------ mavis/constants.py | 8 +- mavis/main.py | 597 +++-------- mavis/overlay.py | 159 +++ mavis/pairing/main.py | 46 +- mavis/schedule/constants.py | 109 -- mavis/schedule/job.py | 265 ----- mavis/schedule/local.py | 161 --- mavis/schedule/pipeline.py | 945 ------------------ mavis/schedule/scheduler.py | 942 ----------------- mavis/schemas/config.json | 781 +++++++++++++++ mavis/schemas/overlay.json | 142 +++ mavis/summary/main.py | 67 +- mavis/validate/base.py | 5 +- mavis/validate/constants.py | 2 +- mavis/validate/main.py | 116 ++- setup.py | 8 +- tests/end_to_end/__init__.py | 21 - tests/end_to_end/test_config.py | 202 ---- tests/end_to_end/test_convert.py | 6 +- tests/end_to_end/test_full_pipeline.py | 406 -------- tests/end_to_end/test_help.py | 8 - tests/end_to_end/test_overlay.py | 158 +-- tests/end_to_end/test_pairing.py | 51 - tests/end_to_end/test_ref_alt_count.py | 7 +- tests/full-tutorial.config.json | 98 ++ tests/integration/schedule/__init__.py | 0 tests/integration/schedule/test_pipeline.py | 157 --- tests/integration/schedule/test_sge.py | 748 -------------- tests/integration/schedule/test_slurm.py | 617 ------------ 
tests/integration/schedule/test_torque.py | 441 -------- tests/integration/test_args.py | 518 +++++----- tests/integration/test_checker.py | 111 -- tests/integration/test_config.py | 64 -- tests/integration/test_mains.py | 135 --- tests/mini-tutorial.config.json | 64 ++ .../schedule => tests/snakemake}/__init__.py | 0 tests/snakemake/test_mini_workflow.py | 55 + tests/unit/test_config.py | 67 -- tests/util.py | 24 + 54 files changed, 2419 insertions(+), 7282 deletions(-) create mode 100644 Snakefile create mode 100644 docs/images/snakemake.cluster.full-tutorial.png create mode 100644 docs/images/snakemake.cluster.mini-tutorial.png create mode 100644 docs/images/snakemake.validate.mini-tutorial.png create mode 100644 mavis/overlay.py delete mode 100644 mavis/schedule/constants.py delete mode 100644 mavis/schedule/job.py delete mode 100644 mavis/schedule/local.py delete mode 100644 mavis/schedule/pipeline.py delete mode 100644 mavis/schedule/scheduler.py create mode 100644 mavis/schemas/config.json create mode 100644 mavis/schemas/overlay.json delete mode 100644 tests/end_to_end/test_config.py delete mode 100644 tests/end_to_end/test_full_pipeline.py delete mode 100644 tests/end_to_end/test_pairing.py create mode 100644 tests/full-tutorial.config.json delete mode 100644 tests/integration/schedule/__init__.py delete mode 100644 tests/integration/schedule/test_pipeline.py delete mode 100644 tests/integration/schedule/test_sge.py delete mode 100644 tests/integration/schedule/test_slurm.py delete mode 100644 tests/integration/schedule/test_torque.py delete mode 100644 tests/integration/test_checker.py delete mode 100644 tests/integration/test_config.py delete mode 100644 tests/integration/test_mains.py create mode 100644 tests/mini-tutorial.config.json rename {mavis/schedule => tests/snakemake}/__init__.py (100%) create mode 100644 tests/snakemake/test_mini_workflow.py delete mode 100644 tests/unit/test_config.py diff --git a/.github/workflows/build.yml 
b/.github/workflows/build.yml index aa09b829..1601aeba 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -11,7 +11,6 @@ jobs: strategy: matrix: python-version: [3.6, 3.7, 3.8] - steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} diff --git a/.gitignore b/.gitignore index 01fe3a1e..26638751 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # python generated files /.eggs /coverage -/venv +/venv* /.coverage *.pyc *__pycache__ @@ -32,3 +32,10 @@ junit /docs/package/mavis/*/*.md # don't ignore subpackage summary files !/docs/package/mavis/*/index.md + +.snakemake +output_dir* +bin +dag* +tutorial_data +reference_inputs diff --git a/MANIFEST.in b/MANIFEST.in index 7b0f98c6..165d54e6 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,10 +1,11 @@ recursive-include docs * recursive-include tests *.py include tests/*/data/* -recursive-include mavis *.py +recursive-include mavis *.py *.json recursive-include tools *.pl *.py *.pm recursive-include tab *.py include README.md include LICENSE.txt +include mavis/config-schema.json prune docs/build prune docs/source/auto diff --git a/Snakefile b/Snakefile new file mode 100644 index 00000000..2e617c92 --- /dev/null +++ b/Snakefile @@ -0,0 +1,198 @@ +from snakemake.utils import validate +from snakemake import WorkflowError +import os +from typing import List, Dict +import re +import json +import pandas as pd + +CONTAINER = 'creisle/mavis:latest' + +def output_dir(*paths): + return os.path.join(config['output_dir'], *paths) + +INITIALIZED_CONFIG = output_dir('config.json') + + +try: + # TODO: replace with URL so that the user does not need a copy of the config schema + validate( + config, + os.path.join(os.getcwd(), 'mavis/schemas/config.json') + ) + for key in [ + "libraries", + "reference.annotations", + "output_dir" + ]: + if key not in config: + raise ValueError(f'missing required property: {key}') +except Exception as err: + short_msg = ' 
'.join(str(err).split('\n')[:2]) # these can get super long + raise WorkflowError(short_msg) + +libraries = sorted(list(config['libraries'])) +VALIDATE_OUTPUT = output_dir('{library}/validate/batch-{job_id}/validation-passed.tab') +CLUSTER_OUTPUT = output_dir('{library}/cluster/batch-{job_id}.tab') + +# create the cluster inputs and guess the cluster sizes +def count_total_rows(filenames): + row_count = 0 + for filename in filenames: + df = pd.read_csv(filename, sep='\t').drop_duplicates() + row_count += df.shape[0] + return row_count + + +for library in libraries: + lib_config = config['libraries'][library] + if 'total_batches' in lib_config: + continue + inputs = [] + for assignment in lib_config['assign']: + if assignment in config['convert']: + inputs.extend(config['convert'][assignment]['inputs']) + else: + inputs.append(assignment) + + # if not input by user, estimate the clusters based on the input files + max_files = config['cluster.max_files'] + min_rows = config['cluster.min_clusters_per_file'] + total_rows = count_total_rows(inputs) + + if round(total_rows / max_files) >= min_rows: + # use max number of jobs + lib_config['total_batches'] = max_files + else: + lib_config['total_batches'] = total_rows // min_rows + + +libs_args = [] +jobs_args = [] +for library in libraries: + for job_id in range(1, config['libraries'][library]['total_batches'] + 1): + libs_args.append(library) + jobs_args.append(job_id) + + +rule all: + input: output_dir('summary/MAVIS.COMPLETE') + + +rule copy_config: + output: output_dir('config.raw.json') + run: + with open(output_dir('config.raw.json'), 'w') as fh: + fh.write(json.dumps(config, sort_keys=True, indent=' ')) + + +rule init_config: + input: rules.copy_config.output + output: INITIALIZED_CONFIG + container: CONTAINER + shell: 'mavis setup --config {input} --outputfile {output}' + + +rule convert: + output: output_dir('converted_outputs/{alias}.tab') + input: rules.init_config.output + log: 
output_dir('converted_outputs/snakemake.{alias}.log.txt') + params: + file_type=lambda w: config['convert'][w.alias]['file_type'], + strand_specific=lambda w: config['convert'][w.alias]['strand_specific'], + assume_no_untemplated=lambda w: config['convert'][w.alias]['assume_no_untemplated'], + input_files=lambda w: config['convert'][w.alias]['inputs'] + container: CONTAINER + shell: + 'mavis convert --file_type {params.file_type}' + + ' --strand_specific {params.strand_specific}' + + ' --assume_no_untemplated {params.assume_no_untemplated}' + + ' --inputs {params.input_files}' + + ' --outputfile {output}' + + ' &> {log}' + + +def get_cluster_inputs(w): + conversions = config['convert'] + inputs = [] + for assignment in config['libraries'][w.library]['assign']: + if assignment in conversions: + inputs.extend(expand(rules.convert.output, alias=assignment)) + else: + inputs.append(assignment) + + return inputs + + +rule cluster: + input: files=get_cluster_inputs, + config=rules.init_config.output + output: directory(output_dir('{library}/cluster')) + log: output_dir('snakemake.cluster.{library}.log.txt') + container: CONTAINER + shell: + 'mavis cluster --config {input.config}' + + ' --library {wildcards.library}' + + ' --inputs {input.files}' + + ' --output {output}' + + ' &> {log}' + + +if not config['skip_stage.validate']: + rule validate: + input: rules.cluster.output + params: + dirname=lambda w: output_dir(f'{w.library}/validate/batch-{w.job_id}'), + inputfile=lambda w: expand(CLUSTER_OUTPUT, library=[w.library], job_id=[w.job_id]) + output: VALIDATE_OUTPUT + log: output_dir('{library}/validate/snakemake.batch-{job_id}.log.txt') + container: CONTAINER + shell: + 'mavis validate --config {rules.init_config.output}' + + ' --library {wildcards.library}' + + ' --inputs {params.inputfile}' + + ' --output {params.dirname}' + + ' &> {log}' + + +rule annotate: + input: rules.validate.output if not config['skip_stage.validate'] else rules.cluster.output + output: 
stamp=output_dir('{library}/annotate/batch-{job_id}/MAVIS.COMPLETE'), + result=output_dir('{library}/annotate/batch-{job_id}/annotations.tab') + log: output_dir('{library}/annotate/snakemake.batch-{job_id}.log.txt') + container: CONTAINER + shell: + 'mavis annotate --config {rules.init_config.output}' + + ' --library {wildcards.library}' + + ' --inputs {input}' + + ' --output ' + output_dir('{wildcards.library}/annotate/batch-{wildcards.job_id}') + + ' &> {log}' + + +rule pairing: + input: expand(rules.annotate.output.result, zip, library=libs_args, job_id=jobs_args) + output: stamp=output_dir('pairing/MAVIS.COMPLETE'), + result=output_dir('pairing/mavis_paired.tab') + params: + dirname=output_dir('pairing') + log: output_dir('snakemake.pairing.log.txt') + container: CONTAINER + shell: + 'mavis pairing --config {rules.init_config.output}' + + ' --inputs {input}' + + ' --output {params.dirname}' + + ' &> {log}' + + +rule summary: + input: rules.pairing.output.result, + output: output_dir('summary/MAVIS.COMPLETE') + params: + dirname=output_dir('summary') + log: output_dir('snakemake.summary.log.txt') + container: CONTAINER + shell: + 'mavis summary --config {rules.init_config.output}' + + ' --inputs {input}' + + ' --output {params.dirname}' + + ' &> {log}' diff --git a/docs/configuration/pipeline.md b/docs/configuration/pipeline.md index 73e2126e..76ffdbbf 100644 --- a/docs/configuration/pipeline.md +++ b/docs/configuration/pipeline.md @@ -2,6 +2,9 @@ ## Running MAVIS using a Job Scheduler +MAVIS v3 uses [snakemake](https://snakemake.readthedocs.io/en/stable/) to handle job scheduling +and setup + The setup step of MAVIS is set up to use a job scheduler on a compute cluster. will generate submission scripts and a wrapper bash script for the user to execute on their cluster head node. 
diff --git a/docs/hooks.py b/docs/hooks.py index 3727c646..30314742 100644 --- a/docs/hooks.py +++ b/docs/hooks.py @@ -7,7 +7,6 @@ from mavis.config import REFERENCE_DEFAULTS from mavis.illustrate.constants import DEFAULTS as ILLUSTRATION_DEFAULTS from mavis.pairing.constants import DEFAULTS as PAIRING_DEFAULTS -from mavis.schedule.constants import OPTIONS as SUBMIT_OPTIONS from mavis.summary.constants import DEFAULTS as SUMMARY_DEFAULTS from mavis.util import ENV_VAR_PREFIX from mavis.validate.constants import DEFAULTS as VALIDATION_DEFAULTS @@ -21,7 +20,6 @@ def generate_settings_doc(): 'configuration/settings.md', 'Configurable Settings', [ - SUBMIT_OPTIONS, REFERENCE_DEFAULTS, SUMMARY_DEFAULTS, PAIRING_DEFAULTS, diff --git a/docs/images/snakemake.cluster.full-tutorial.png b/docs/images/snakemake.cluster.full-tutorial.png new file mode 100644 index 0000000000000000000000000000000000000000..cae3105155640b6081170207d6a16a1227dbcf2e GIT binary patch literal 28669 zcmZU4Wmr{F*X^NGQo2LBMOwN+TDn^a>FyAa4h5v7LAtveq@+Q*ySwhUIrkUQ{{-wp)g%m#t%8A2d@Ne~F0 zU23DU0C)l3SWZeD^8E7WS4)08cm~N%TFVjql>X0O$pI@t2*fr^MqEVAZT|4l-HT|d zMd+zbvdNlxiCO0?ZdNAMZ9sBOSac4V4EgoVWi1p79C9_@j;P#_itPdl+)?akAR4?_`zA&Mv?N12HtjQH|9jZ_EW%TH;-_?*3v<_=7^KrzIiMaa&;zOEgH z2<4ziR5c<;h4#+cAMGT#)w63#j4SePFHt-xC1VAsLLLN$LNsDp+(w?NA7Rcv>7A(( zFi_ezkHyFjB$IA~Srnk{W4<~3O28FmnMT%2;x%G|wfKeZA45d3h11A$PKv;uvqroj z{fz==o5nO*24xx_zxYUujo0PZOaoEtFJ?0$h7`It!E_pB!TK$vaD&>A)G>*laY7q@ zbElZ{S0wmXq!LNNK`biTnD#2K84}xn&G|W8YD-Im&5g&?>+CrRPDd&=cNn#Ph`8Zv z=cxW5PEUPHr&{7KQbig(_(uAAoecup$#TIX=s)m8QJmg|Smie+F(}>eS0$lnrsJ0u zdP(!=>7i9Dp?1HHjT=N87+-&WHu4kmL6*o@qQMw#JMp}L>w zf8adRUhFxG;+cs)#b6&!N?j1F&rzIhV}gO{u0G7GUGHR85;5k zUJ%Zguu|Y(os@0M$|>jR!;~~5JJ^%*usoKD$A_`Qf3ZX4GNAP!&S=N3K06-h{>;SK zh&AQDO+ivROV9~Ee@E@yh^y^xCONh1^_wU&NDD)%SjXOJ2PddkU{7u6;w8A+`cT29 zK0*ws(sT!bjAPMIu0%|w-iTG?&-%&=?|&pNA=b#VIe>^8<;uHH)c1xblq(GujU&50!@@1T+kF?Pl+=mJg*U zRg6E}x+?!{QW>iZ)`EE6r~8jTK|O2B962zt)Ds8u%5|9C_%L#UsO!~U+kq&;cEt+% 
zBg;T3aR0-(7L1EAJe)nTvaESPLPaH}m9%+!qp5Q994Mai+Jndp$G1OBEYMf1uYUo8 zFP@4&B-iH8aL7x~8qUoV5iO72xwQ`qDKLZQ{H`CpmI|NAJ<NcSN%u+6%hU%$!I#yq-z0rAmz&Z+nbBz+ z78|k2-tFWKF_{kxs*xQ=X2k{{)1Nk_n6n!(G%xF=ZlTk`*dM^I*>CsHM2XZG#n@=c|{~9LuN8Jkdc!;33lD$f@dAdOK=GK}$4RC4Chr8JF*Q zJrd))K5f=@{t~1zYa>JTEN{6do%Pa#P)3{0B=|-LTx)&%*WmU#@JJmZoaulxt?@ve z>uMihgf6)AL^kWa2q{p={+nqVq~nUPbo|5)E(hYL@6D(F-5c9Ds<9Cw+-61Qa{Ul+ z%up_RZBQ_sp7@soWHbMV=b`0GCN-80?Z?BI@OPqL(u=2g&(P2`eWKY=M7qQ5zphG6 z=wEz0Zn<9aw73t1Il}dtzn5cUqoa1`san&2y)}u{LQV(oT}N*_v8y`)J~Vi|QaPQC zrHD}c6K&k@h!2_C)fAnHRO)2%ib$;Bg(r@iB{XSYm zb~2_?m={nW{-pd(w!J+6_){yl}iDxENd~;JH^JPg}PtI7K$2K*kEy#WFgdluXTWxAfS4=3w%0 zT@dQ}Kjl2C54C69`ZzGkwn0@$vs~<15GQL!=EdzpiQl1;5Xv9Or6nx0nJK5yR4%C~bBnU`x#M?++heYW9#;>2(gYqrf zw1-~CDaq8UpO*n8sZ*bOBDt3k13Mrk>fx`5xY}*&U?35aR4h#g9ItV4vl<%-0Uof5 zX2wIw+So9Yx~)K^x^E?G)>;#c@NN?)4Vy#Sv!eySIgY;P-l}_g*cQqzW#Jffc5}n$ z{$~es;ql&ykHofN6SscxD9LQu_xVw?&Xxq1Nz)I7wHc!?nyk-xMlWP<@AJb+h|p_O zPl^3Oyh@|739i1}NWAx z!NS8=y?3~7rpt#ymgz|Sk!W)W&Ermt_Vs;v1lqg|XVv@4Wd=aDl#mnCD2Vo5S+Z=p zw)O+Mz7Re`aftz_Hqu$(uAGVTiVW`TNcMSeG{S!@aP_^+EBF;EYPlg>zsu3T0bLe6 zl#s52>InrC?nNhT$EL-D*$NZu@hpjhSSixgHXT*fE-vfYc49OHlZc8G+s3d(xAn*f z_TIa@yCgLRv&F^5hi%s86Va=77<9H?{HB!vl-YMPx?PfzlC%B&h)o-Da z@2sZFV`DLU@l96kl9;tBgIaF4QUYukd+||%{vB*~X;hRP5@{{88(=qD8;p(A)_yU4 zxqz#V#JE~Cwe1SDi!bIcPxz9y@~z&l&GNk$xQXUvW+d8YPIjhx9a0wZ1)HSn%c#5i z+$@Wzk&x$R|2u1|dr6mH&*(^H*-AY9PwqE+T+@@@zT^@Z{GIK9{XJLPtVTwKkI?G=)0}L zJsoT4y9bwORs=lXqZr&k-_MQu^y2f+vGMWq&&5(nC5clSxiqCcAEdwcOj5`X*Sgvo zuHcv9YAB~6!%u(E9MNhAZG=E?>i3)VmS z{Acf8E^1-<`QH9`kfN`(qEi8}Rm z6!Lkq3*RIpXtA*wzRU6u)AB70YLBRcno-Q+p{{5T5$;Aa4GF_|{*x1N7 zIW@KVjj>di4PTtxuJT#L-MztZ!JOf2JS-ubUIWYQ;4E&6rC_8lK3vguzk6nPz~*hD z>EN99mw{rU%UJFU5N&9Rl z(0;!r3e`S-G_te9;7YO4HDSX?#U~~Xn%LbS5|ffbK!q3kH`I1eIb=8SfqS)TRIuxO zR4Dq3_r4Cq@7p(IX=<#4g*tTPVB^2Xj=I*Q46$_Yu|g%^j)ZhR1+j1oAd*OBEqcT^Sh+`#oV@QfCU9ldL`;!=rom+FBID1ACs+LrR>!H8)eo$B`a28>;uR zJYsS`m^zXz(!)}|v=XQn;bbZyHn|)IEFAV$nhlLjP6qKGH)E4VBhV_Owu_;^585wn 
zVmaDm{4VQtJ7mxs?Q=mz%;$t?@N||4CI0og)S!)&d)ezPJOTm~#Jq~&OY>=O!qCL7 z6OY$T`B;jLel}l5M!{0Qc!8cN*YD--l*A4~Nb}8Fc+7xlL0OrUkrBmF{W@BZD1{wy zNQq8erzuB=A=}D9)o8Cq>lu#Mb<=X%_*q%YmA3Ul?W;rsA6`zv4}XpsULzN4*B}!{ z!f$8z3Zjwlac))~wcgtbJl(AKkB;`0)GaYwN4obazo6*(aKMly`+NmleS`6{z53KMjdg>0pVaK)ISQoR9fj0i8~6Fp zn!{6Fk-ye;83v+haEkqUo0Hml&b&|I;OO=ZWThD{+Wf=Wgt(0PB4Vz|#&dbwvn#}p zby5(fL6g7ljd}j!`?i-3bCLsQMCU|WH1Yk4Z3o)=BgF45ychU{@i%F;6FnJQD-LFv9WV7_}oQ7H24%mEc0eVx1QHWJjs-?vJ%=a~_OF&D4e$GlxF8%XrIiX7I{J#n`S<)&Lg}CS znDLtZ2iK-PsnV*Zl|fdGUQa9Z!(R(efRQn(@omq@NOdK3x<= zpDb&%g)@%c-qs)bC6Ayy`_kH)3Ave)cs@?tYw{4AKR;c!Y1!JcEVOuXKPTVad0c^& zMBQHl8q(g_)I|HR^z?YIB_aZ~zqe-`!Oq3T)(+xuYqkXV$w6_|$j@rsckkX6mzG93 zJ2|mM_jQ%FJ$blX%~|v{x0w$Ua&d5AB`*sSo8#%)K{ajtw0bG(L<9sziMkF*O^3A$ zH^=fHmv%UoeeP|h=jR7ODCR)s4zqf04t_uIT)rngzx67vsrgQT3SaASxkte4+4c3+ zpH;gTuCSYzG7DH*f15AZRTX7kn~c_mhJJ8?BDId&3R?~h_$JnWFXul!JkL&FMHV+R z`2MR+LW%dLU6^l;wXs?5KM-86-v4oUXwvAMyyu#2vwy$*EVxg=+n(}Juu|Yn6?_?o;=_Sr@eT`Dx0fkhaj;|phbH6_p&2|Q(%#l6cNNru;&#gxbjQ|SA zmd%&)xlO0i6v^{^nA>EKZJcBI_C&(xUP~7NTN`kC_U1?f+Ant0rH9FTT$rO3u8og( zAnNAV%f3R-b?QZCp4Ug4&pxL;cv_B*j=E-PJ9SUzL>4dnW{#h>HRFV@vL3CHOrSFA z@v22`*01Ot^IlLz;iv|<6V|t9n+N@|RDTX#s)ktG1k5x3CpD%n#rqx`tV0T?-=>== zu785R>P%;?(98R<9`znHFE4Z{<4T~Tt=4{Bvi8$N-Y252!d{=Ul9I?mHHNFRA=Yz7 zZftOZg8|x%jb~C~VpB-cxaG$3aJnd)9d*{kDvZPQKg%;opho+ESEK(A?s8a!vUK3Khz1HYg%pp z9hW9i6Z6*Iq@hit^U&25`&6U+brS+Qsi|ykoVx%i7oa^aYuE5CV}H}3-8j7uD0koU zBcPMzfH9ucP#gFzSN0p0EBSNrJ-2$aw3Ehl@+e&5Z|YHoTbWT{Zynr^k^I(D?p* zXLU2vxylyMK@RN?$x+!&*o^epE6gu<*p$_EnKOH^0-rH6GwUB1ST!grEDZD15eI;L z`mE0I-HwMC;#XSA@){3MtcXNtaY$_+2leaCOs{RYc(}!)`CW_rXEN22A*VJgt`uc$ zZRB84s6^T`AA5H+h}7S`w~PDE)-9V`TfGk?iwJ27v}b2$Th+_MrjLUR6K$+YlsP&n zNk_9pqdxDktL75LI{JNM-sCH|*uGtz8kSnYzky&?=$`N;?|I=k?+FS_3E~|;k$%z3 z@L)+jbV;&(zdol?4JZ4woKNa${?B4~{(0GE|s&fcO1XW=jjn;o%{V!v-QeJUsb#*>wh13F%3%f0lyqgkT(oXso=pAYl-V(^hYS*XlCk zaPVpQ2d*>6)Gb5uTFHLJ|L*T4?>B#-A#6qqN%rNqb~5BYLX}#Dp~(NTpIpuuWL$Z= zG?AzjzLra5Njvehv9WP`e!O12zK@NM4}5s=lBT9bM*~C=J{_bLz3+M$ZMv|A2A-E7 
z*-_&@i`S83#`^Kk)Zaiz*qV)l-JTTNq`oBWfJWi9`;OW7(^Pf zg@U3oIXStqo1Xy>nYHE;xId_}{}R5-q*V!Pk?u}o+j?uRS)vX|=D~aoifx_e;N=+{ zRb;5#Nthmu`YC{#qvWYZGu8W~&w6?Hi{NNX`px)4w*!hRXK0=OR@c|JMs zy^8wNIGmTIzy5CK5e))5Y&0@KfkTeK)p*+LiUkM3_^_#mww8|7>i<^Zg*KYX&r=M| zppSew>vciR7;b%dTF8jB9qq0|DPu&f5Qps&edU!ITU`)eIHEDYnbMd(Mq=W$C?+CdLuE6k&=O+6(I{jBl%X;Yyx)T8n~f-D=Xxs&1bT(fg;Tpf8}N?%}{=( zq}cc1Xc(^dMuJGY{r`;q>9a%DW_eh9c?h}l*VwqFGb_qf%wkPSLhBpWxkSY^eAoXHF>>5`?=!W%@FJ38ZjaWWIGA^G8 zA?2-WeRLzAR(FcX~&krNv{Ve$m{cWzIx*4_H29 zKz_w+WYW=|B9Q?V3Ia(fLqW?>-kBP9Q@^Hg3y5uf8$8&U^YB%Kq+~+j_XGjH4)^#Y zCAQtlubEUR2)@X0uPoq8GiQ0aiNc|G7R-j6r zkGwim%eUvLvE?wc6H!YyhWh46xl70=S=+J7jii_3zW&SXZ6_0K6m~X^uetXd3sHu_ z7vB!j?srdqmzsgpvS*4M%d-~^p@c#CUA--T)1U=srS1Oi_Zekm=IAYfd{mlf8EbNP z*s9HgOl|E7)&@;Qm7~~z1_kZ_6x+f6v(2ASo;duc7#QHWK#5CkGc(Wg)WIoaXY$l4 zh-H%)4!9MT5@m9J_j(1)ouuIMvSWfEEu|gG+!mm@lt&0IYR0$&>Q>@RyQ_C6xN2=DzWi zerh-9MFlU|#L-N;euhBQM~%Ssl#EHpTA9?v>K6C>?c6{(<)}cG3#UAXYl6XjITGB| zr+$#MQpK^h*xK$mxEpCgyB0lw?H#vN-C~AGM+x+ZCXw)i^IkMv{i1e-+rT=s#+Aad zp7lf}OYx1uI2U1xTSx8k1>$__#WM3BE$Eawm=R&t)F;v#F_(nj>SLifyvSALR~PTY z1h+|F1Zqc3buT!qG*rA$RK??i197_7qM&&}cHApa$d4Ipl)>xay#yPf1DiN#cn0Y! z+c(F(w;pL0K-lbLnGi$#x(Ix}RNHzK2CXymXaYuq3XD=eI2QxFfBsPQ58h*i3NU?d zT1PpO8-U*6HGR*6FB%gz1tf)ade#$jl42;~(rQ=`VUM&7O8l!%fGLFTW`l%O$(qq7 zc!EcBL58z1eqBJf$fT~Lf73SXAE@9=2>g&$lsUGw_feR~_Q3X{P*4e(m)}O`tr&V> zTfWXwYJj!6Tt-M-5wGv+>H@8Idxc5giXG3&shg6%zMyt~(fIZ}wralF?}@l+8IM-! 
zkBd1(AdBUl2|ZFX$T1u`_jqALmQm9_f9%BO9MyO%^}gx9Acr=hg%~y*?b9jl`o;JA zSX~ka@UuahmP18c8J&KA^pZrNj2MY^1&)yh*nq%L+`n+8RID%@JpMzk2?Hs~BE+h| zf+-CIF9MLjFct*ffZ3zJz|;c;E!eJ7`|Sqpl0+TABgs@bZ%Ft-#esapiCV9-f)d$| zvm=&qu3E6^u^}rLv?PBnn{ocu>)$ydut*@WoFv0{{N$MQ+qEmoK;LR2f5mEu z&@;bobPGTVTIAt8nHh(Ig90Wcup_}%5*Zxow5Qxr%QxzGgiT)|2QcVC+yu<=%oGOb z6ZnvdvS%$~0uKUI)EBFAs4XOp=WqQV9{M9wfCg(5OSCr%o5(VPFV-9UrH6D+gXov+ zh%J;kqslLRM*HfmRE6@zoxoe7eUq=<9ALyMc<2r7JrYlaGc+|^Fx&;W&i$VR(?JLxOhZ(#Va$+@E$sPdjAf?&}3A8hJ}4j z9Ot*o;5uLS8T?{Nc9~tlSZdhMhw!`>ofsK#2jrtjc&xp{GUgTk<2ql*{ZM7rH^OaS zYPB$|_HA!dk82_hM`4jN8d2H%8)KUO3um#`cYSA!(JLY+;k9wvs)FU_lsz+*5m5YQ zy(JHhpGk_-<0p7SdFW*w6SLVY=pAzTq{SUF>vSX$zqZ)jd9v@=2KRVlM5LuQW^=m2 z{gnM8z1KhL#?D4e=0S2e&72)=M&*$6=4E(8T_3JNGv)G;4#2H}9qfNc3{GS|`uF#D znn`G6@g1K#KBcX$8YK;H*~cpb`EuvdW5}IN_!ACY5SAQ^7SHeKOUpc!51_^w)m9ISQQ&y@6 z6KjioXJKGa#~7HcsDAbI&fKX$MTN(Msr0LEoK$oLHZ_Jr$L?`pPYap)btXGZ6%YPo zes<;tA8>Z*2&|{TrG69Lou0fam_*0EGh{S6goZ%aB!V=YNZ_htHqMf;`K8Xd51Dr?g5N(8~ z2-3bjLqrt4R2h3C-XuESPnHh3xdp6GiV4wMbn@K)Z=NJbi~Llmy*tM9M`z*$@>u^V zzWcXeOQr_g;KAItPqtVAHHh$EDHQp^V9k#6(N}IB$_tmD?{cdMIF6x%sJra8Y_J=7 zD$Iyl82%Q`L<@fXnY3uzx6}g6UCLO*;Nak~{mZe>s{9@^WDe~IH;6R%3aFmZ3OuR0 ztNLickEm?1cu}3c}>F4;7QcZk}fVT4Bgy#w%+HegaJ_i>=|k5fw{c= z>0MoR`08Th1XSL-0?$_}D7z@_fZsiYrqW+R30BF! zs-=b#e`3$c>L#vI!7U!Pc>n%AkH-Za#PD#wMu}nj*pfA*3#w`kHM}RF;v;LUv=Yz} zfQ=eVdD+z{mWct}fbQ$uxS>r;OG`i5cWq?sylDyt%WWBLRW)v>#tGlQzj<&4;>O+m zy%9(9%4IL9_4)a^vbs7~kyM%}k*Ac~T%{Qui-J$7a2yc$W^1hQ9(;t5k&(w{XX%_` zIiO8YL1$mCNhy>0j_}pbpFi^spJms$g}!LzBuP4+{T?u{8s0bjP@?>;~{an5FX(Iz#Q_uv#fK)~e;AGy515Gq=Kn1`M)? 
zv~6MXn6sy*Wq6d9&sgl*q1lAWt`B+J3U;YKPgx%^dj$4}AB{?L^)#Il?mSa1UkBa93UA)A{CU#x2t zUQK58HOli`}~?2sXX@3 zdU|><^-I4@Q99(J03dsHP0eXE;?Uq=x#f74`*i^e>3Gp_1r$-AJso|RP-gM+adZO_ zpSn}Kw2x2gSkX@7RU!L%)-CtF{Us7`zYk~}aal=bu?{r#bm`Z3TzCzkE4PRe2vV zH8T@JB^7n@xb^$@ZzVlF<_j8W2GCA}8%iwTh5={@$QlTnO*@yCkXWG_a+(4{i+)C| zYZ@@Pq5b<2-yu&+Q{v#^by;!(38-WL68W#9Yb(Fk4I4;QC~@eTygVA@$+*y3>(t(K zCTlJoNmr*I($o*@qaYR+S69ZsYm8tA>IZoAn{t3O7b%+=tsj*FvKDYrA>x0ffgWNi z>6*0lwnUvsqa7D5q-{AG$bk5f zz4eZ@!Ck9K3tK=By1cy9a-~eCGsFh@%)mfcmzJH?a0<-OX{YLF7v31z+;fi@SQ#OZ zue`jxc1??*ZoMuZZXeH<0y&C8&Ve8C-;7oO<>7P9&$g-r8<4NS7r{Rdo!oUgbM*sa zTVC@zHY)u1-@lz-_yLCX_V&8JuXJGn2Fl+4K6pC7jwhnFmdlcpaQ5&jNh-~;A&_@j zVnMbR?wtCatgP(V+#C`BnCQOvn!c>$7v%Bq@PLRn4lDio!3UP(r(P61Y#unV3tEH0 zK}^>1YV6$H-1R#ZUSc#JVl6eiX%{FCpP~VWLseY4viKyjoPy0H+nc z$mzfjT(ljnk9B>Eqd3A30#F2+$67Dq*Ic6k34O@GARN-M_MfL;d|Bas@+M(ge9kDvzz@i z{AXZhnOC`Ye)}I=QNcTL7}b@2(Llgygehamfz-^B$jEqK^1;G-q&6xF&B@iZulY;eLWR{# zY0ieVoQA(J#M|2&drOkvIWM%cQxxoP3SzYEni`zQ$Vg!KVSC`fVlK7;|5sfQo~BS) zMI`{6Rsj+!fShZ{tPWgtvI6?o$ei!qDgUCu0B4mhn(EwiI3?-W6~_u^$DODXVZs&5 zpD+T&?Gx|9-`K8IVX$@UKmE%48CX+LfC${ub(q>f1axz=c3f2WRfh)17?o0-u0uoW zakzDbX1U#}Xl8fS}61xpIkXz+}@RDp&S>7bpAX z&71N$>yulr@`Xbuc`0hVtyy84o^Lp>3y>nfi3mNc@71ifwl+X(p0e5jU2zz_sCCO{N*N7Z-3XJu zD1UbN@SeC!8(ZtY4#<;#5#X%eQYQ}GI=$vfiHxH+elR__y}bn{eFkUEfUaY0E=tNU zeAV2aW%`J?0aH3==Clb;B@>uhCAuIGw5-BHdPW86aega??+lC+>oJpYvbir-{MZ-< zNTAQ3KXa7oNH#}En#li>j24^#XRG=(Zh2$KypRUsrI6pX))T(YH#-dW&X{QC%V$8L zTrOjfEm0RMo5{|}p{78%Ut2Q{>N2XXtLuZcP8Q(fm~VmnlKMsM47M#OB0<9BezCE! 
z#l*W`IFfHVa5<7uKYaL*K*tlQRL+Gm&WQq2%(jl-D!)@^!}Q>E!reK%~ z)YQHW$#D480m4NPGvr=;pzKBfZ5h-#Q>XSVJDwXTf;l_9*qo58EJ`0f5fD1(j-Laj zou})4FYvrMlxT72&a9^lPL;o6(z32y*g`w^EQoK8#;6;;f||6lQ=XlO*s%L1|`ZWAl>;$IZ354i_gPDv-%l z0r_pV$;-U>a+-#Q27sA}u+SiB=olHX8>Q*7TZSf8En`+a%Iz)Ix6AclI;yP~@iV2xJS)*73d z!U}H$&5w=6Toje3rV<0|z86(mT5_aT^?q1Fh8hcy?=C|&M2{YV7P?S)Yrr!8)CXEQ zQPVAcq$*U`)I_Y%{Btcgd|_BpSBKXZMFJ!2Kkp?BYBlIz07-Rg(ZBI()r!7LVf|~w z;Swg^DfA?-Hd4C_kD9X(I$2rC0n&t+-vt%i@%!uJ(WJm)FnfdWTd7=9iZ$eu?-iU$ zwXv-u0w@K%hfNq*a))+>Yuzuv$cbPl7V;6uhOVs$3z~3Vi2tLmqS86aXI!I={TH2k z@#6CGKSYVrUz^4yzh4}{d7O61mK6G&b#26}X>g-&H;RR+l?qCL(5cy+ykQ-VHcG~C zX>{NK$e0R;AvtE&?PC_yfL4yd*tLC^=w~w@>}J;cwOh-XTwJu>#zq z|EcF~YD#T#Uy?j*-l_=GXAcf50Y;t1dVVSi@GfN4J@yTbj!J21%{8ZHOd7KJv3~_X z+J1a}?mD|=)sDLuy;DkvDeE*HyR@V)cs%b_?-nlQ>+3833k;S?`_Bu+k&@VN6TGW!@SrNXFQj6scgXg##Nun?{&^!U1@q=Y>XzIV$MIQlQno8avR zHlQHxB-R>gYQKK}{$^12cz@jmsFhwZLFU6}YQ2rk%~P0OCfx9`p5!V(>Rv{hBG?6$ zvc0XJGW&ix{+jySVNWbXXm5Q~0LBK4kJ@j z(|?xFVC;|%^kx6*6y9+u0eS>_7h!$<^syews`iQsq}=`pZ!qy-$Cox{$yrcb90Ul+ zqHV~ct(2MBW^ulO?7PIF;Fmm&lK1fu1mXWNNdUdYaF=Av1p^<@VJJURTY(i@^?{Cr zgalZf@EGos{95f?oSX?~-L1+}(WDXp5CD||)gC+mT?EQtOh!gD;G80L6clJ7FrHTj zP!O<tqoeQCo(gw3-q3d#upO|dRZ z$mn(#faTcIYfdmx3pinK?hyeY;kSltN&oo=sg8M6aC2VZ0W?JZ!iMvs`9n^;04l&# zY={~|&)|1P6X@t2#7uGd-gIm59`mYX^xs)ID)hkc1I63o($csTP>A6GIFxq$z;VkU z!1ln_EH~~!?l-MW!fXXYrvY$Q@%DUaUQ(9H^6+597wm-mOA`G!USPX|;szeR6iKE2 zKLHqnSah!1$(Qy3$Oh<1NsJ~2FsijR9l8cpSgM^MW@r<{*By#XgFGqy_x8~ToN}bd-xyq^ZzcAJD7}v#g8?3p6QhB*FzM9Fmmi@~SM1R7 zh=t=acjjmK5Ke@7bL?UiH|VkeU^{)8e;^k0!YA%~hgfyC>{N z7c1R8I+0u)wewPN?bl#s<>ZXm@YfnsEGjg?9aQz0sjRKVjbI1DL;Lz^H|%F+$!ttO z#^kh;{cf&N3e|q9BPGdf0nPz(|C~k>1wIh4oiKN148#-H;cVAQI;A0EfT%H|nZjTY zDvC@%39UmVbl|jQA?Qu}euZtj*u9tw;k;h8+Mt1%$CA{?WH(ldF19(rz z!V<-Au}~Pz2vQv+1%Ps6hsZbD!08A1k^iU*WDzinr8;lE>2O=0K(9<0OLx?_8AKf? 
zPATH;{r>7`k;w6%EG?+EC-;x4J~a5KYbzarme$ree#~M@={3CgDHD5Y;7s(#D4ajW zEmOHPVJKaB)k}4gPsGcC%m-SxouCeDFk<@8_PYL4EI>A}dAAoY6gv!K+TObD;WU6& zZqYWwj=x~)^GkYcj%>|#P@MwVk=|RN$@lq5=J;-Ua`FUw3Y#+M>B$EWPsO{ZZ#ayw z1Oy}aU9=fqSl4Mn6`Bz*z4 z4HTyS99NPObw;uR3I6jk{T4z{$b@BN2AYv4{racefGz{v^}gYEpR49*7SYDLQwEZT z-mXO-AqM{KO_T9vPX-c*(SLu$L1d`@9_&YY7l z|DhS6S*!OwXd9Bo#!JQxhi&V!o0^E!i<}2XrS3a&K3DIF-y`(dYgU>LG^0$*nEx&* z2_D+~mRzT9IYFZ67cZjy@gol0X98-+kvQmqp&}FHTENtVM3n9`jwXl zFD53&9*q?|ua%cVC_xi1l0T7IT+C#Yb`9eX0fgT?A-f;+X#1CWi;w_uK4-%YHPD5C z7nC$KLe|#Sf)d{lRs+NXQxk%HOPb^VJ`^HmDYhgj)4eYE{ag0#7>ffN^cw)Ty1+Oi z9GM;4oZdUI`O*|Z1_x!4|8i)G!sEkg;kWo82&_W9tnW-KlKa8ePS0-!Rww6sgE1IY zn$~YZNbE>c+@$N&Qc&FONlo1u$o(~fDay!3vT1^v-yWamF|mr4w&L?r(tpe0RSMre zX1-^`zk6h<@APr&gbHf$i6Bn(@#_D6bR?%X!bCB^L#WRy0cb8&)B*?D#{A3tBmb$D;x`=FQs^;*}cI zu0`iFXsN1dWUP;STcxk~{*~ADU5RaV8`H_2XO7|cVdpDeJMz+gRiTzhmVbqmD7g9a z7`T~pqToA!+{XFYM19R-h!q#aTdA^mueLMiX%S#f7n`=3jRmNgjL^Za-y6(P?<&Js>RW>zT`l zm(Q5SGZ>F_{01gUKd*PHXQ)E^yAM?*h)pdTNheX4)GCUiKuq>O)<6cPuS|bLMtJ%= zDf=7j9=bRrY%*P%otu*P|feJ-)kM@ zuZFR?G@6Xa(x*69P+nPVXj}CaLVd5=EPZTT+53zF^N=-g-ccCDHRu_{(H|Kyqkh%+ zie^$&^J`xQ5|9!>0S#H=Ho9PA*pW`;b+#Y7td~w8~}E4vd44M)F}*5&byy z3V{aIi*hzG!)_=$Vx9`bR87=aP1sNgjIf?|RI<$5PKt{=5=&u! 
z*Gn<#L{a59p>>lY415?#QGKU-$&;|Ih$p{zUq$-36}jNc`W!=c@lVeO)GggO#qOO@ z4_Ov?R}GZ(1U!)ZB1Sv=4y;KA%clxeXIIy3&=Z@r;i1C6rFjf~ztQF5iC3$=u!1OY z*HPHb;nntlI=P*Is9ws3|71EBJrfdrQpT)~yDEg7n8hx|-2`EdWbcg=Tq~)ndYg;A zFpBU9MY}}68^s+GP02Q2lcSs0t9SP{GakvcV-4~4)N&N9x`9d&mB;QTwH6s~lU1}v z;PL3`O+4sZG?^6U`Zv`M!UvotO?s@lq{@fv9sIf(KJvEe)S`WJ|EKtNDi1WpFt!4g zOK91waaMUMr9H25i^|oKZ!oQrja!d(ciRch*1rTYyLoQl<5ND8wJJ9fZd>BzG&8l@ zVYD4A29Ku3j;Fa}!IjeMM_vA8Xyi(TnOCHl*>*LqGGo1p(kCzK;rLf-5X}b_)zsoc zOUmKvr|-LgHXteiWy_=|fwyNWslAB%(CUjW|0Ml9$>2_U#E}{%DzgXw`HzJvyj8j# zL}=G15q)y{_r;CN9ACc#7@`qJgd8{|otQ57odk=mK>lgBqFEHEy3WM;uVS7n9JokI z&Z!U9)XVwOe9M{Rc8JZvg*h2i;@8LL0Yjp>(gF7jqY}9StY&6*L0&iZ*KK3mgav7t zrR~gSRut3OVPLMg(7(!j~}-wq!Rv>@+S*>y?l3=f1HS-_vAbs{Z{Zb zh8&0+M3&>)$CSH1aB% zRdF6YRVa(uXp<$1SusJUTWig-LRhI$`aI5E{kop{Z!o3O`pE!Lb#*G^xM(oNOR^w! zyQ6+d>nkWBoSl|b)>Ox1iM-oNSZj>?4uM#WMJ9&CH8~<^BKlSzz(5Lm9R3w9vz5jFh`T+E z3wXohS>3iJ;wmKQk`#C6xhP1qM9K>f(frNSJ0quo(t;!nzuLaKU^ZMp=$-SdxKDu! z8<(@b{D$!t$p@yphxVtOk=NyOm3S1rlrWx7`68*g(Sn5~ zw9&fM9lpEjfiNo;6c3*aGhXJW*00(H+v1NOT%%l|aPy2*dgYY?XQ^BAklQi-LqzNc zU``Yk4GOncr%NjSjGqFxmnfT_@Au|yK!orx{N$4bjTCjy_nt`bO?a`sJW=VK^(Bhx zMrQE+r47~qAJRX0F-HX@ipA>6p`)LQpv2+hB@ilOPJ7|377l5*_nWGJDeRoZ(gVS* zgBDMr=CixtPX{?D2>-!AvV=)n5qeY%I>b)Ph~&)|dDXeur_phi6@;0Ca@SG&PuSrs z8n9>ve+(!u7_d!UblQpy>7S3IyZEXMZjh5#nFqAs|f9m=UXsZAJ|Epw&sO(ij*`tV%n~VlZR`%YR85LJt zD`aIQWE4vFigN9d6&YFCdnbF`|MT7F|NDNwzw`V3&T$UMx$paay!Ow@FO%FZjXjPUhFfj@jW5oWa1X>-)XscIkJbPG=j>W^>%F>&h(E@>S^VCN%6z? 
zmA6jV;OB#n2ma_KLPb{DZgqvckT*8^d3ErJ$@z9<+_)$W;3od>JmVDv1!x-1P_A)*v>PGLI?ODy@nn^dkD7Vytw5=^1U?t)UYM+2=&0Zx83r&w zkOIk44t@||0!8L$rKF??P_}(Ja{Br#3kwU5plN2Or#J`DLJF^7XNRJTskP$%vIOn3 zGc(LZR8!{jVmv)ZKN8WL;~R$;Dh{P;2Od+tElp-^EWNxRZz^>4BA0fE3Nr=!$D#*X z|MGthc7Awp2_WkuhmXCC8%G*i`u@v9W$Af&RG^S=0D*NUj+>_8(d56apFkCZVyy|V zE+HWyRQ8`KSLTFg{%7*6BpEQiQX@o788@2yynF^Rdy_-5x9#?+utkf^*X^aHm6|e$ z$+6LKQ5G_-2Vb4e#`5X-igsS3MLe20n!IqNaa$fd^J7=kruTz}2bhNlP=IfBUfZC+ z8vt_m3xoH9)7RA06uYdf?0Z3ACF;iO0(Wu1G3I+8J{f+t(N51C`sb!_Vmr93#j(FQ zIYZR;xd(g)B@+YzrmYVVtRhLc6951MnAj&?O08Vy5tEr1Ew-kGUKr5(QET!WZ(hAW zok25h7R(U)%Vp^R#=|Mz1JJ70o#LE0Ocurk4UG5Enrk^eH8f-r+*A}}xXITwBPThJpkXYfDQI?WWYMSzq zuH*&!z@gWZR5y3Pp=n#{TnjHw*ITlKp2bs9Npd&O6p!VfsypFXQeNPyQ5(D8wrmUDB5rTo)*=l+ty&fyk&B{IjIr zAaUJY=GPXhkJ3J7aZ`zr)CP|waj2EDz3s3|v%nZjs$W`e_cNa$@rwQHwXXoyoCODf z8qaBA+7$(T&*_*k34s~hocz~N;bFHSdpM|*0=Nrc&DvTW)y8PjiWdq#JQ-`KrPYKR z?dp1dTi`PxOqe`;eD#R!79qnlWZ0v;_PcKj-ZU^^1@(eTUz+U$=X z>!58exR!8_bn8}O{aa&6^h3+P%R~%%u`}^~oxZKDX$X-#6X@lX4uS!rTBA?DCpNRP zUhjS%0RDZALiAhbjMVxie*T6e_g}%_VX*J+9EKTf&EwhurZrA*wsqQ7ay?M?fB;7E zPH&#KKQXHZDpB0w<)RmRl!Vh#VLx~>d2`EsF?cXF0Bm~`P|m@jA>*QfwPgp4RPJw( zb?i>ZrN4cP0zh#UhnG3}lPNn6=CaVmLyvDMq!vR}mjn&M3%@0(n%&(;Egzr1zwL6* zbN$H!O|5rkURj5hn)|B#SEN@9?g13BWQo~l8Txr7{{GYD{MoXqW_r_{_wNIw57z2( zzyt-nlpDAUM2ZHO4;cXrG~OO@XS03u$RZ{tW@)Y~eaTXGp8@$lkaoy*hv45dHci?~ za=`2lEkVD3x!VTLi%u{YeKYXd)Yz)0Rn2e7F~lVwLbQXWy5~|sleUY?MtOf;`2?98 z7;1_13ZG0{==&_sI?>vnM5AykSOV=>?aVm@v*u8^4J_BUi@~@lD za34G$9ndo;o?xyly0`L|2?2Ws{HZozHkDxZrZ)|RXDM}GH8^qsV~9!HRxXq}TxO3) z_p|c_^-D2JX(0k8njd3EV8y(X-j#oX{A@nvaMAE!wz9Hv0L&cUKr8k4+h&uWY75a% zjrQA=EJ-=s99+P-SBZ|VCJk~;0$J2~#o>?%M|JIBTXwBA-HW~IO4R&5$~3Z~5`VNm z01Nz2aIg&wSf;eYYKA{@U3O2~68Yg}*1p^?q+xzBdX|UV&JbxB!~|4nkUsi(+uw3$ z3%`vy@)Hy+!S4(zzAwYKbPyETAMZM}x;H|jS?I>FKe4Y`KxMBF0THa=DYg)zj|V#> z_}7+@O~LTytKbPnqcyN;vf`oMyFkqX2PZwR&0gVE9JuIymfAA`%BrTW?g!BsUqFfu z>-Nsbs(0Wj2EV@p>mT=FhxGQapMIrVYI?dK2LE|BxeliIa9GKybxZA(ALj4%r~?8i zWHeIoSePMG^$Z2kIyje1z>Zq<1Yd?b80G3szJ`UwKU*YV%yJ^zBe0g 
z)pMmcsRV%4sPGk_MNQK-Vb@aXxiNF`d~Fz$cp(__)-kGol90cju6zy@KKofDro}9$ ztjvAT%0)m+11aDB&SW?eZSfN}e+dsivCAYl>4iH{_e%nwe< zzpp4#_*@T zxT;8DIb{&jBR}giQ2Kn%+iYuKARu$a%=7lgfxuFMK7i%H*7p=9y@b~3*QF=T{K{8y zG&GZ!aKNYs4G-%9rEydOp33-62QY?%yAJ5X3y3!yHr8v=y((Kth$$WeJI5OOzmXK= z0_7;lB!TSb3sBG?Xe1Lrl*_@U=Lie;q$+0bM|r2$>eJD>CN@kc*z=ByB-k~s;wRbg zK1!Um=5hLsIIt}{)tsO+0#h}pTIg9>6=16c&i0A&Z`?G&YtWnPObXsY+Ku7C1f5f_ zh8D0Ka-TF+_m5nsF~CvmQho}~3M|-6N=#(ha0dw<3K1Nci69*QrQVk84v>G5M7M}7 zj@~95B(`Fs|Cp5(gfF|FjjbC;`IHZ>$}fxl@*+e@{u&H+&)Xwi(RctlhOH1&Cdrgvd`D=Pg}t;-t5#s&SMD6SGn1bKwAx zQXV~yw0ia|?|$~!=969AC(Xs2#cG&^936!$m|R_5tBoWE1`OZbuMypQDiY~YJ1SM; z)o}ZoMI4Fn-)hBC-MNswMc!8Bbu4mg^A{Cs-S5iW8o@35#A2`Z0JA%veNAH(|D${I z$da$)V?)lFS1x?`nD4dk23(=>eD}NZqrFA-U##8AZJJs^oCi4l)acIQ6psl+oKiEp zF%Gc>wX-tYpPh(PcpyO``@stkNx_`KB8lxc#CYxb2FdRX_E!b0Lcaq6M>)jzpiuvE zlj4{_xdqRM2e~Z@^r@huglhH`CIYc1ZnP6)xiQl};z)-{F@a4H8t0$r1UUwWy zo^c&9nu0%`4G!##uv<)QOJyNu-A`kgwqjI|NeR`{s9C{1X8iR^tAtH=iSWt<)58* zTMpCI{~f(Dv;FYb2mvah;?QO9tNO)y(`^D-yaUoAL5kOw-E)Z)0&0}&zw`CnxDeGa zQZ#9+%ef8OEo&|X{t>63l0+8ne!7->SL7Ei^M4M)a4ArN+ zJWIhZ2Q;IGxT_1372`6NhEL!BhUq zi5I*(_s&Oq1qwGxN=nii7rf%2ZZMQzru#_6eYLcoJB;5G3y1Kf2^_UTz;WOyRiX(6!SPB*G zh|gaFLfdpU$(^;ewbk_Dr&zC}jTB2_O6DL~5I1=*T{00#2>tZ1>ertQ; z%<`du(!T1d_2JIl=`NY;`_*QTx3`8IieMo-TAd$Um47T7?KKrKR2%EHAsb&& z5z22^W>LF!c}T`KvFs>GmiXFQ|LAY?^exq8S(`MMYQ3TAzM3m;eXE$Z2Qly3SpO7` zRJs@aP>~AD?zKTm)giNQ*9WKB^sP765{)LSw}17mb3Hkr?72j?hSNw3I zNmA~JOz$1;?o2q#1MKZ$%Q~vOkC7_*2M=sUqgD zSq_AV2ZW>BoIEOK?pxLW#_}gVACH~PGnSZ>Q-5^P+f0J@-J-&>tkhxtqP?p};609h z6ZK6zI6JJZhwElHUn?ng1336+FGc&cBO;L`crC z#~y)B)n^i3PE$O-ARq6gxZJ|m*(JvtTX**NcuJpJjyhA+Zz)q~yovSXPkvNy)9B6w zml~)C_XmWt13M8o=ybQ`MMfT734Up_aFZH}=BN-g=*mS;mlJbKL*nkcrbn3!s{@iw z-UcVnULvz7S}G52m{3;4QPVV_4t^EVQBd4h!IE#-0vu=Zvd(`uWWB7$Z|={@$&pCP zXEF}P_xr=(Fvy}M;e$KLOo%#t+oeE7dSm|kM1k5ppS?FU0MB{2Owo6=e|&E6K6&3a z^E%W&%cge0$S+cql_rzgx%M*^O4hcu!sY$1a(rIV`aK^{IO;SB=%qRL%D%42v7WZ& zhl_Fz&8u4)E_xGFRoeOV@6w51;wV|`uUGT`S%~J_R&15|si0_Op#FtfM8MPQB2KfA 
z#^mLVApi9{qnX0ib|lJ#h7v)eTNomu>y5$dy%T2;;qqO2r~;Su2Hy(+CnkB1?32zU z*91lkNR2Htu@P?_Ly=v$TK>hz`aVYWTd>pe+$5V~b6{Zm&^RgaY^fUBV!kA`Zl{c2 zt=G}6TKpdX{onPBl`M4m*@{6<*Fu5_t^ zyQ7(cp4Yi`?OpAZ!81!=r);Ul-`L|j%G;Gfdv++Fn9t@rk9OyUgK0^X-|JJ-CNJ&N zS2>a}`{=gzkB6RqNpE80QU8UuQjv-;#+iSmCq-H7Ip_R2zb}L9wbgQv{dtpSSj$rY zc(04Re&?r}!YfDRMy)J-M0?JZzq7Ny&sG!FHcd{vF2CAzu-?P|W9;a|M#T9jGX3IW zadS|T`w@tE+FRMR&_2?;{=R?b%kxN^j|fSJ|>F zTYE_!%?AW2->g&5v5Cj#qCcgn&9%$VL~53uz^#rRj~=B&Wkt=73)Cg>yyEc}82ln+ zpHC#el7EQsRAH1w>{z|j`KI-mwR?mLQk#qBHpAJnw30f%iF)odo%%i5mCR!0 zLWK&-&;cHb@gE+_aIRgU`}A?gwtvVcTQOO7RE}oMBw#xi=R2Yi;RH7Ue{L6}W@V*JL^BTAa^pitR)fjqG#S$z!n;U$X?Fvc z93SgeALCxs;yiT&NMuiI8TqJzFiP6zH*&RKpp^xBOcK6-ylP895$nSQOh&Tf^|^_R zGi6=J97d_xC)DzvAUUACFq<<+Qe5aB_8h@*SJ%#MNR5#akA;W^KarrumN}5fw32A= zOf^(#UsPQ26;X1Yr)+!1Im_96ruA*~4El~fnud#`hOvbb9C&_}8z1 za2d{5^`#@VA(77qw1#l7g806Ru1-PKfU?7Z9dz=wroLh^xBNkAa+*&4Jpo$fGDdJ( z>6;UCU^h5iV`FP=O42~W0}d!y3_%(QU;vn@IOMN`=n&90 z5N*-#vnzh=P5bbHYSV{Fa1DZ_A%X++!Ez;O5TGRxujL&SO3A~cGqcUhh8bDFTBPbZIynl*> z5U7|F(Jk`UvLp$C@Z_++9CLj0#@?rUhdK&$jFGP-E-x$oVaM1EQa#Z_D?KTC_-Srw z3CWH$3?Z@xI5DDS#=+SZ-F@bSE$8%v^1fVp1Bo(Q^o9EOD-_raqGf$~p;ude5r58} z>_)zP-D9W!qSATJ5-Y*;z9H`qL9XOGVE4Y54NJl&DU=1>!W)-@(HDpVH>|IM3vHB% z7~`u97I2w>)$rDSB!`vQ+<~x8gu*^6M{Sx>SjMLo73M!A7T&2w{D*gA=^|HbXrox? z?kIL^iYdwPsMoGgum)FKp=UPE(cNJlCmHK5F%*Uu!% zuXp<70(8N9`3f^vfm|G=9j1}Fi*-73mCINOm= zm&|v2tBo>6wZBo}hAcU-OHgk-fgTAOytkn36FA4gNeNG5b{d|;+e*x8A#9mTNE1wk z(2uemxH&c`pbpR98w~#=SMzWIx)mHk!5|cm37eDvYZGY3$!@*7KX;A>ZSlNAlkSl#&U@U2Qe=g3WNt<{y4fT86XLpreP3ko9GjPwLI zXWe(!yM-O{p9fKDgH%y{X>t$$M0%gztmEX|=t{m@s41p3lzlY|>PeZqt40I{pG-

sjkqHHJ(&NEGiyI?lvdsqzoJvrFokO;G<2VgZ-)6oeA*H3DQ63Hp#>b7e67Da;B zwoW;nGW!%NnWDL!xq6>PY`IUR3**Aibo27$AlCy!p2=V*Vhs}) zC)Xc-1*RXTSDGJhf>s!yFf7tX+QApn%zVdfLgQmE3{{o;M&{kTQARRXExvO>vsJ zsY|aI8^Mrn6c)i5&$q1TYQz53fC{3o-~%9qHJR;5ieFkQG^#lUfCQO-TLyxxSz21Q zvA?p4P&gk*%26Z-&8jTcn=u5)15D?jB1hASUIa;Fqx4>g4dglk8(0xUY7T;4n36z7 zjEgQ{bwxzvSb_GNzVjqt&4xT@2GOx&C~(n&=+s}} zR|d@Yu!4c!m7C=>%~#VwCc$ZtEj;LD zW5fCvFJ9!(Ux)XW@BR;o=SOqFka%Y!&V*M|-OoZ#N zUX3_?M()j9X7fxUn|mOmr8Nku~Th<=MjYV7JWdb>MDB3fH69nYW6-*-%zv+kh0+^JPWup=kTUB$D?M2n8(O6_+u(l~IW{Vz|paiN#^mUxrl%vsF9>#xp z6t80Cb0v1vsQUZ6I5g<`fB$;IYdv(^KtC+XHdjvN#*N|8V0Tt^aG%0L+uhxr5t9P1 zhe5t_!>88GAm_Qt(?T|MMx6m>(THGA8*^I{eW-D&5tUMF(oeemU! zl{Iz`-wF?Xdybp<1nDi)Q(GQmtIPh2)db~3gFeDuhdu4>JfBx2mEMy+1S}S@04zJS zOgkeHrySE<*oXn7V|q^PtWOFh-1*`C5{wP{6|UX);@VafSAmTwaN)+k1}x~bo*Tof z+p#ERk1zcL1HoXv%J{QNMoCq-wTdyFOTo)~h=m1k%ChYZIRvJay49}MGiBWb%d;vQ ziWaRDg-j#Ys40lR(<*>PrqYnuR~H&7%<_&0g#IcFfBo7>UKYF=J@4h9m^WW z?GXmLECQdcVHe|_ReXSw05Q4$_l=*XYl-beWIf;l2e1?-QWJ|y?AVA0X;QY>jH|GD z0CNNk3-H+>gJZNtLeY`&z?%@5YE;Bf58)vdhD4xFbXm%68Qm%9O&~UIuy5+4J4?B2K@r_6Pu&*^Zi6%*f7To1yrc|Y`Z}@VBvBz5GfZEf~FKcfU-`*@` zLtF4@!1tH?sDEZIgL^(IB60Nvnr(I6f-{y zj$h|Y88jm`DSB>k_ypxMY8T)Ta8ZqfL*5Em8dBp&sXDEG>uSWR#$~V;9j)|friR=g z;Sfc`h++6P`lGHhAzM2ijvpkq?t?#4;W%oo{hTEW2bLrlDZzv0`6)bqODo`Rz9EnN z^6NNU@4?*_pu=$9D7fx}7w3*7(Pyr45@A5GU$dyBMmK|@l97Yc9LV4`NKg*j7l%OU z7TCieS<_LCaN)l-4m2|mIw%k7g?|S#LpmLrBw;(NUzmf_7$K1$mJ9YAV@TRy1D#q^sQY?k&zlS(aYUK6e{u# z*MGD@u*M}A2a%`X1x@91SE$1V1mL0BkiA4eRBQ}ETC>2)@*Fj^9vRqD2bNK&&o%6> zPXB7c;3;PwTU!EDhkm6GU$Sy_fh`?=^^JKe#KUl4+elz*WG1w^2bq2tL$)tX;pEWN zKY?@}%iTDxh8t(Xhzj+QDp@R$_z6iX(;B~J(c!65~^W$^)b_WqjBbTz`x2rAO z1dZ$+?fyr^vSa%zr=aw8!m$mLyQD4W_^5r>GiCi}=kB4dtC*hu=1_TZVpuOVBvuVV z+;8HGI(_}oWA!HVD^*Na!@|&XFzPMBU?3GL{jXCsdM>S92d3%cm?P08wr42zC`*;? 
zPY!Y;!+fq?&`2uCIBONcv3*sQKGD>XbYKgdvYO+7xtf5^F-LgG*qg|)H6c6vk$Aw- zt}0fV73G7hBT_3qY3~*0CZOnVVB{%XG5L=r&^4<#i{wLz?LTWE+Z!I~%94QLM}$wF zj!Sw_uk`iB4>0sB_N7i$EV5tZp{|IBVM~oPaRZu?mES;9_!M#Gn8XU_3wUka1=pL}u4cxr=8~q)=I{$8A|xWjFD%S2BKAO7SW-+xQuH#v zkc6a=kpI4?=6}1u-oeb;!t?*W;LrP-1-L*Sxr2+FrKE|mi#h*eXKR#&wVk=7prD|m zvxAMfsjG`1+}hg3&fL(^*+$UP+Evir*v;C-(8Suo;=~r Mriy$n+Q|3+0Xl!95dZ)H literal 0 HcmV?d00001 diff --git a/docs/images/snakemake.cluster.mini-tutorial.png b/docs/images/snakemake.cluster.mini-tutorial.png new file mode 100644 index 0000000000000000000000000000000000000000..98565be61cef5e3b1dac0fb66b7aa28aea9c5c8f GIT binary patch literal 12934 zcmchebwHF~x9=ai8>AbFp}Rqb6c7*r5$R6pkXE`uQcz$-r9nWH?vO?i>5}g5x|`p7 z&OPV8=iYy>HN!m5vuDTJYk${gEu*wFl<}~suptP-d#IwI4M9jS@O~2$9sE0cJ{1jK zP;KNM$w5#>0?w5g8u%H>U0WFrl?~FYgC7_yRJ0#KkS{9)1&2b=1^6g<4T3!RAZWuB zf+R8^h}`8}-D4?m0o`0pSpmAe`<>gEpA0_1a#4Ba4qjFM`K#39AOk`CoDUV`bbKba zXI}a|v6__K+wJ-F`O%bz8hM>)*tf3?q=a8FU+f7)&KFnqu&%Lh^s@^xuNCiVDp;c` zzI{b$MW}?7`cy%VkW`V7l#~`RPP>- zr#n48FLxp=V)+T-k+dXOv=C$HQ5ZHEW5>=z#|k18!92rqxvGg^J!|3%;Uu~rVpvP& zqNr+(3@c)YcE*o$W%XL4)X(=qKBL+Qcm8lF42#n66vyRYQ~HokTsWIlGa-qa%~Uwd zNc4;IMK0TjL+rrWmkp*tk+c?Em`1!tZxsK)(=ZH_WQP3Lid#C(IH^(m{(4z)A=Tox z!qn%N@-q$mt(>@zcd*%61P~?!>tS#Ce^X{X9yW+@&(sF*&aGt3y}5MayWU4rAlg|z z#sreVN+*Jwjih5ts>#wYkW`MNI-~=c2{{kRFN6~}TUrtei`!mhI$Ey5&MtR#I_7`Q zyArw|N#j(|2iz-C4Y^1Cdk hY3Y`opwgglXm!_V$(`iB;^R%ZxyM^SIc2QDhIf2 zIc?hOc?=cQjrWRJlj(Tx1%(JSqCY?>2TprUMT|%tVv+U9#TPrZ=!G{Q9 zUM2Og(D;%_+BJThNByy)<=M+Iza4LVqy)G0VA>kJ&$q(KnLe9-@D_s@>T9tjIR|lQ z?=eEY96Gm8-@j5fx(}IFYLe1v&!W~Hbk#l-=MSWpAYz`B`IBS&qx(A^8azT#TSr=?9l*?E+Xq43mqmojJDeV|8uu9) zkxT;5KMk85*}d@=Q|gn4pze*sVRdC)g$G91quz0f5Z9RMz)1qRgiOAfNEY}LhHvp; z=a?D^;^QN^8S&6vbBV;Mg6ZJ4Omvw;4;mtfRSON*&MrQE{*0a=b%5E{-j2{>=X-fz zDI{W0b_wI)XniN{PDn@?ytZarVpM;Rb)@>$h^wosesiGoi~4PmSQ>F+?`c;IRW&sv z$hqm9b*JG_qj>dVK7=4KF%g0aii_K$$v6q`-3w1lB$V3kAQ@iw(3=WePJPN}_lu>Y zqrQIDIo}7UJmA!+*{>5gwJX2CIfB5BYGd%)<&?+81Tgh_XTZo|lyBl;{Kf0JeD;dt) 
zbcX!w*|W%mgl;w7MpQ`ut1VH}?TuebO3L0Qcyp;=X^91yw!TK=ql){{+Y5j3g6Frr zu(AKXVzCpwj6X$pJgxlW$3*e8lIW?bVMg^{h%a9VsYM(KeaDVfQSn9n!jQfgmbKVA zvJqi#&cjD^MR-@4@BXHh7MsU2u(3r%DKP1on9Nx{Q&K`tR$)=k`?+_zJzbj|MPe<3Bq20MNPkB~P^86`^X`nVc&>Yj+y zrKqT=1xwXpCpAhd9JNP9M{n!vi)rQ+5xHkyJ!#+A(&EtmN=iy9_owaIlv+AZXSTE- z1o@rrDB9b@K79C~rLT_(W$>9H6B84I^$k8hTy3`%YBR0z`ulSHiy3_P1NC3drZf!N z!;nOTk}O$>`|fH#Y2rgoXDzi*j?Jg@x-|eMy@^ z8mXJJ#k1vflP@+ybW+E^pFqKpk(eqh#2-I>l4IShEpUHN^*c%-zQSb6vq7ly!6Txm z2Q-`IpQ*vfYqq~leQ{e>{r&rQi>1IqH+@j3Mt=K`BxZi!Bij<=Myi2-uNtF}mwmtnA z85w^J43PPLC;F>J)1-ye0@}wF6%}Q%KbJIie{mox9CzR7VO-s6F6x27{iSzIZjY$5Q3t!&eCj1fL&eg>P+b8Gk>%Ivld- zOXJ=7WuVBg(F`TU)ew{&x@X2@nQe13mny(=iLC)aLcV^Vf$EvuWfBTY=LD5BFVO{k$j$Srf8W^*B{zwI|rY{Ta5mlnI zJ|T0nM&2`>D}(R*ii(Ode%e~v3}ssxl%@rhl{epHT3K6H0yq23Y3iGdvd8Jy#+!~0 zz%MNIB%crBC&=E2`7I?WQ&^1E36YYMA8rn1@Q0L|G-va7jUTO#OuV_TrXLU>U0hyn zQ)NA1UaVJIIe8L_MRFETY+Y45RfgB-)D1>kVbNXLsVUt>r+yneaCU`vk60u&j#~7E zit+$S8-<0QlW|kw(CTyFY0*=pZ+0WBfzLFJf-nNT!~io&n!H)=O#CFWpOx>eIr)!z(pqufGnB|7+zQYHHf- z?CfkAS|R0-GzYojk-?ZSZ)iqut}iRRMvB?ld6313YPzgh3c^ocDtrsN4ZJgy`}gk? zMky2&7Ros~azPCI{6tgVox8!_`C|K%Nj+PNGE2mXYB=gF-bs7Ot#;H&Q?rjS=kw>! 
zGSBOd{@{O}0oySS4BdS2op?Z(FzL3H@ylg0sncn9y=|+en+pdh?L=U^-&%0aSP7Qu zHQ_w7KK}W0{dP)P8k3L^semRy?w?920TC*j%j-cRb4jL_gNSOMeetPLp4;fN`;NadnhDt16-U?y>;)Kb`Sr=jBy4H)l%YHIZTG{b-?< z?nV2&&chL7BpEEZT>gkz&zt*ttI2gEQzsYEpJZ41-=b6-417-w=#$w^VJ&F)g$QjQ&urifQHA%F$;B~lbDr}A*w^w z{bTZgpwyY0nvMr8}CqWZ(HG${M@n4h4pnTSy@>Xma!&VDc>XZ*}zMu ztcJuUgMX&)eR?{1Je_oOWu*X@!BX=9*2`_Rwc+oOYY*Rbuy@ zMy!dLLV4tE+ot{b^ys2-CIx89hrHMAX4>1^&&&6X1`;xfEjM6D&v`Ycg0?#zG_Rq^_2Q|*o%)p~hw-zu(MQZgYl%8TEen%|SKAxUzGkjl;C;G6ZqojoKC>b0w$aVnzuJNm)(l1h&X`7`x`_!-W-!>1IzcR!S zq^R`J71K(N!!<$+x_uw5FGckZnO}MPDC&CcDFPWHu{bLc!N72+`f1-Jg3j}MfSRNs zoUv_&yvh8OC&Vo3nvXwR!a@++U}Nve6ON5ZjSQ4v_*Q&GN}g*ylhPU=*7}FJup%Qc zSM+*D9TQ!|8S6)N?a6ND>W8`utt~7U1ADWnP>5J`H^gcUI}3+7D^qrQKbpvP7# z_LVUY`n;l7bMDu!woKEmETur~6E#h=#2SHlMD$(^KPxr{EPH7E`Se-%){h{C`@L-c?n6c9P1#xGt}m+8R5a&mHp8A<+X7ib~5xVwkI5o=?0s-G(76ypfZ zsdWk_QPI#QJfz`~+gb?wJRKM#@o|&n^vujQfpWjF#B^*^DtcWCjk&uGI){~z05fKwt7M|*#tjuA@lJM4#C?OBm z#ysVx{c#^mR8KLLQ|YX4lm&%&J?MDINcOVr2^DIT3af{hXvbT(0ZKfjvYFxBrSL!S z6rsR`;!O8pN?6#etVsK-5Y18INOfbrn!=w;K`LvBju(>+Gt7i50XS^|+KlPwIJDZ$ zVh&~Eg3nhbJ;{pfjaYwidgmdkwblaoZKZ_2w;4&IOLTs>J^n!~@lG!9h$if@oZ7-7 z9ft?|j9ZwYJDKPk2NakNcbCDHK`qWxVOirTNDS4Nl~F~8%J>B$G4kEwNB5Y)(AOpC zu@sva731RQjGK=h4X>#^o;||Zcz$qxDXtMq(K$m=XlN_Co5Q)BDOl-Qb_~cw_0odj z`lcuZ8r3pccy)Xpq90y4)K#B1>Wu#A$1;d8~MV8hKn3 zX|rBK49rLZYBBoI^6|^)UUDZEaYFG*CTC5O;bxWAsF2N~dXI0$TOt~^u7X^v$m45X zgo$7UNBFyq>mP*^1dt1fpPA~CuTO0lz8z_bL)rX=Tj>~l^kZ}MlTO|?(_irZS~Kl4 zdvMjb7m~p+v*YZ(7xvREbZV&T-eb@5K|{+;Sf|)S314^qhgP zAq>AA$%TTp0a?3(21+!p#j&=g`D&AA=x%jARQ&=+*X-Jrq94psLGC9)S98Lc&`?zW z^W}7S1b@MF$E*sbtt?k_^ugy_+)4MxWyi5@+nFzLQVrbiLGzq3iiuaO2$Ps6{8E-AtzEP4k6ud-AdIE_O{%ipGV!A7^c~D*T(ojBqi(v1S=sO z9ex=6RZlWo9+1m@zD-R{Dr#zr&bxEX^vSG7lDw6!i%JR#3N2$}qbY-I-|)jf!_OalbWrK9MZfU(p8F&BXJ%mYnaDR?0JyjgKot<4lX=&$X zc{6^Lr70d$;Pvs-hLcGLG4~bRjm=Hho2!@8{#90NO~-$pYZdB@S38hnKYU;+(19Vmb(?V=BnzF)(SHk@d^AEzSS>q*hQ=RB3Ow#eO>>LE@S=W@;WzJ+1xp?h9#; z(;PHLqQzun=)TW*QR|f%MuFBd)LF027QsL{}gzKGz 
zC$?`R-JS10Jgky9B_t+(4VLZ33l2xte)6OO-~*kfPf;MivRseW9wWT5C|Ob*ZZ^Irk2(827N9{`VE0AfrJuj;d?m9X? zuK&YQ0GC<>_iig@W@VNB5%*Xl1mi>T_Vx}93p1Pg=5$pFEWAHkni}kBOnm$B&KF;$1?&C%b#Ljx{PTIc5ICB1jH zDz9e!b~~S%1cZU81aJ)+u#bI!Q#)S7qMXZHXwuBKyPU|1Y)jOth0r!rhdxD9v# z8}|YjkqJ<+y6L&+Wk+md7ro1aeTP*J`#)s`>j~a?9)$R!NG1v*BH5mih(pl%1msJG?>?gm}c=_{w{@^Vy zE3@X*D={@QGrPMVkl@PjaP*tnyQmn@3@}q%fSTMz^2aOhy=Tn;3)?cey&!)AO9kBH zw5CiW-=fN9$oh0HNH%EhY=3d0xV&j{`vS;3T!@o{V`10t`grUah+MyK+E`lhC8=}( z_gT+1)$md3FpEb88qOug;TxT2ne$D==)h z$&)V0o8$8oHA{%QVal++44w)s3>zC;-`!pCd6R$bDD@pi1<2!$gMjo()kOcv&P-o) zOw8K}N9(!3OYY>p<*n76pq+hwU0q#u*5rz-EC0av^}9`3jlkhM4rB^DZu~B6WE~o> zuy__2C{tWqYyl7zmxxH(_H1L;`mygftG;{pNJ!edMF2iK0p3RNy+(JLVQpJD0sY{t z&mvx+pZAO>d6uL%&mBTi%1e4L?U!a(ZBV?_)8B6i;zUQVaY}fTSs$4PU5w=DTUan3 z9vxZc_#Zq>_x$!^ruX&h*BLr6D;XJ?Er4}m`MWYRGby@Vw}35h#>B;00X^z?vNcgs zP!I{o&vIWnCG=UZxb%%O3$bU`M?hMHvSb2$zq?MVAwC#VHyDm=9&dfldSN~nbSr)N zH-^`jTj#y|r{>x}Pk6k)Pmiee-W=S}e&f_}Zalu%9O&s;t!t$VOses8cGDAMFiSK6 z(2Sl9yxpq|0D&sfjeVo|tA7BiUC;4yY}DYB z1jhA2F=o8~tzdo0*zeyRfd5GXC+oLW(S3Gn4lEKEq84{!sdrn(zFQ%HS*^ex7X(Y5 z+|vQv*#yWqWat%JrSqCt19@<@xn1z-Q`nv11OMSkUED|t>#cSevf4C!4%pS@wsTM) zg#}0}?YRBty!dxr5jkE)J(~gg+9;cllCtNPj*t+lHH`oDOZ%p2(24bAcP`=gZyg0C zCE%v00483>&^VTB2+)M0qFYp*|VsvEk}wtrH`LK!@0g?3z-25uV3fE zF+Dx)JEOeKE!ttYyJw*20?a*Af<4heH$gbErIdx_VJMdZck_JY2|M?e%`)|$E+W-mIxU+-=OTkj5hQpC+@p1Q9MGujGK8P>QUfg=!T>-NqV@_pIcL)o*_=3Bfj z3IhX!J&-0_JXr24C@=2@!0$`oweQtfkWtp3tBP*A#nYfCPe_ad&#k6?H%l?Gv5}!G z5LB>paLDQEk^#mW7Ibr%qhIe;s>^kLetxG?>AdA^@)VmdcPNg*lz zamsL0yP#pF{e0;sb-hS2MlkSe7QoAa&cGi&#F9Ci4>6w0)h*UD``ZzfA@x!~#Bn?r zXy22~0iz~=@jJ=RW!#YAjO%rAWakeoM&uz~{G!Wz3lb3Jp%D>_CsWSd)3xsV;AlDA zJj#C`VaU@B*r=9~Q5?{1`kv1nzq_w$+`&zdOu7oHy-QXe_6yed9>X-UrS7&7*lx@q zIn>(P>T|Xb;R=L=+2)_pE3a(79dWQlQ5L)6|hp! 
z2?z*^tpWoBue!VAl{FfdRe^ST%HwU^WPa|+>jqmy`#FDEA_=6Aw%`z#Ch{nvwScE>YxbB$<9*C{|FCOv?7|7c`V`)4^) zb%Ac+Hvfqz{;dCt0JGS;k?FCa_6+k2GM9-w({M!h`!wF5f|TOW@j) z_rTDLG=Cl25kf9{!H=)9)LGd{vDs5q{SSKB$n?&c)x3X43*+%8B(iV@kP&cFyhf%EBP!=ZKL|9 zy+3wdj9-Z(o#~G=dZeJ)kHXoZ9M1n6R2YJzLH`4295DW134}M>yM+i{XkQuZ^8}$T z@;nAC#Qv=3FKQH!oiArp%8rY*kRAl6$pmr*;Bj|JK*0)2uNH2qh(6VXy!nA7yxym! zbxVf5kY@3f6`HT%*oe~*BX$BH8wE@2L9|05$0hN z+Ja1BqFPAUtnjrtZ!N39%l7&eBo}q$(%q+gHRN+=O7By+-M7QBb(PVR?@3q)zgw zi_7ON?cum>1&C9iGmkmy6{yS!2nwFRNCs*b6b+C|u>8`D<^PzUpI0_`;L6LygbdDZ zc*L@R?dK)Nl#C2akko}9ynHFHs;U}_A9Xa6%K7w+>5~quPz8@4rXvrM*o+`kBJdVB z_>-Txl@+MrNRk1E5xoTntmRDs*nKQN$ZVX=t!A{95S zs0c<%N(z1Hr+N48-L!?Ki-1po7FNs12+6~T59h`3t*a=(FQd;+Y;7a9LO*(XkWB3` zlEJO3mK2sQS_tCE@^W6KSPG;T0S6T&C8dOci=JTzJ3BdFUkQ-)5{thJY(e5|*sPWd zrh|+P4p#*QR7h=T2WYsAffed$SnXq_JfijW^{Wo7_6UNA9D2MNB4RSwtIn>jgC|D9 zLP89pqEFWn4XG?}?-Hj%x*+9`iAOuhL;R5wrsKsRBa_|ihyX$a9Fp*erAjw<_s1qC z1TAiqa1W2Kpj=4{QHjHpgb9$=-rg77UtC;V3W{iW52@^FPwU}`2XOf7Va6m8or0}U z9hjnxe83J zYd`bOC*DK_vJQ+!!0I^j~w$3BZK2Fv&QMcU`rN* z&f+Zwf5dHybyY4BaaCx7ZtGu)lW1zHIq&`bYo3ytibb2&zByj~VNqPpytzPYLOe?0 zHCX!nIXF1c-tNXIpqE+De~F)~B)(MIO+-oTv7`{TPXU8ADAim0LZbm<93G=O)Z9%^ zhmNMl2sgsn+1abqU#!Vkz!Cs~2iD04L=-UNR*Fpl;^1*vm=3-|q0U|49lTr_cDJzV zLP<&{N50a(Mn=v>Bxt+!h`S$z^|k_0YYwV8Ag;KX%Bm{4L-!+)CkJyx0yqv-Qm;$c zEf>~{1Tvia*GcyF_G^`4#-N1aFrCq3>sFBlhYW`CyLQnMoZGVJP%SGjZ$eJ?U1oi^ zK$V67TpX>9I0gGd>&X*l4}Gt^k`i2y6$oAy2n909MP7W7vw{P3wPdk`J(eG|n^#!) 
zaP14~)924)`qFt(zVu%Rf|`yx+}2hwYL~lF|9stJ2?v%8&fYM5H9n9~zkQ9k7C`m( z&h(o33GPBhLPEOgZv+VnE8gd8{*-W_@fsQ%!|-FqAu<@-UA@MaFMQAgAcHC^E9(ru znF_p3%*xsv=C%NBCZ(s3Z98oG2vL%gM*!uHXumH459^HrY8w?575XBhiJ~7b!eiVJ z38>ah7N7F(z&8zxk1bXY-9d2U|C4p_6?Q7gf*5{OR20ZMAe)}&@o;m8=@oYZ$Z=O` zb4g2NfxEoC3~_~>0Rasb4-`Z#Ht!#uo_6mow1?rRj#G56Ra)ndOio4vf|grdO-9o# z2Le8E8_&Z5QORlj?*Iw{_w;;*ElK6fT^qj~57EZ(c=6o8JFKZ1G$Uuiq=*p8M3>alM?Ir&$rIHkJz(I_x|0>Hh1 z>ZyRiZ_{J+O^t;G1#8t`1iD<#53H6}S8-@zl&Crd?;=}GTY&O>tgAa(eoVW*XcE*6 z-DyZYy)=1~*K&2>YZUse4};6v%1TQ;Ba6p3H#g5G%rDwpX@D9p&=RM-_%noM2b65B zTbQ%+uPjgtQJ?ebzNBPUPAozc$RI#p{hwm}l>+xxL`P$T?>?QZ)Mrh;6NT`IN#?wt zk@SIQh%sgsAe=y|;X$DeHZ2b&i_pjEC^QTVW{~{>&IeS8&1A>E0L~9oam+;y&Fbjr z9O-{#A;w~2W(KC9!EN}C23Q5#y%-{D>X_kSE#P`R{5~|6l=K6ZuE};45)uNg^qnp; z4aUh_)MnJxNq|11NxYA5PC()78X_Q;l!X1vRR3Xb`=w43p#^GV;^|ox3{hww*HW19 z@^%QKdp^V}3VGc!J<|#;(TXDveHB8qSN-zorhQ<}1S%ItnF`AA&2Db4`hRwK4ilxu zn4+U>fV}sBO-}2ymr&>Q4B{;xQr(iJ3Kca`jW@3;+j`4KZ5^KH=v}Uf`T6ae=_W0d zxDXW8EQ<`v<;*v9S_b2hd**ZD%gA4W0sWovHY+HY-zDi=-pk}es$6I{J2+O1GEUGU zAn=)b*f(4BM^A_fmx47Fu45FpvgIiBUY6j`0=4>E)v$SG`mmBxI|f%{u;`Q)1>!Ey z=;g`2vg*#ypP2shNI$yc(b!TAN?-6%Fl>9|Xw}7ZPEX?w)Vdc;G}DKF<3$rIBxU4$ z+GcZLiI8UviNsD#HfJU2{q7{cxIO~X>RPV|q*3Q2cTpAMr5RqW9~Yc&3I3*&!n77b zrilo~pnEVsT?>y;LdODYq`N;TSpe7{#DIsdbe1XX;$B&nK9J6X@EBpum~5id;W5U& z_1h(r;1a;3xisNi*V{(O`zO8W>}1^F4RWnxLWAg-l^8ZspE5YrGc|665DXvX++xz+ z`^Tcy-8JxwWUdH7>)7zKjZs3$eRZy1^QNJt&rp&gFj^>jzr6@+edX7dyaRR#tF<6( zHh|m|mk9vUQe(UdIHQM7$e=V6a(nhm^m(z2=LTBGtr+kK1;<$64+>>nkxRqEAn5k? 
z#<@mW0h9JuVp0MnO3u^^6v%ckJB)`@Z7nep546nWnG=NHa^U(S=nog7ozksWF@}pC z3F$Yj43~#kM>yCHQE8k{bDcL|uZy}{r#pN!eAO{VF7xCw1brpvZTl@EP2uVf7E59A ztsn$xaD@+Xf5)VaH}%(kf}~%@Jn~MG4YZ>K+QBliW`$aq`G4Nm$UW__=)hE&H2Cv% zm*npmxQNn~%gc=h{V+EVjaSmA!8rXEFx;dWxtIrEM(yY3PD8MKd6)0%xqaF4`!|aR z`#%l7IssGsC3yz}PF(GIZP|Df%@z-8oheE0GFJ`TZc*{gByQ6GRIB~(WOZ2|$TOTh zy!ZF--!B1|UZC7TBX+Ri%_cD@4^3v%WCGA2Uo)rcu4pVJg*294)_83=lH>z3%*_ZC zh1}H)y3j;`@ar!K&04{0D1M9>hr|b_F{|5~>v4b`fkK!D_mljj%5pA zOn@cCg7*KWeRBnlnFrOgnl&U$<{}Bn!O`(fbC^#p7D*F8|L6}3a6n5w$1RS=gi!ysGMAarj>H)>ND9x_DflY%{g2tQ~SV$-r5B>`T0t&9nQ*OmYN61sTbzS%h1)kDEeWbe{ z@x3p#fH?*_jnq)XU|O56pJYC5yp;h;Rqyh(7zSv*<8)ARfsPgrDvmS{l%|tRwRif?+#Ic=&9}5*`@F@Kiy_CM zC9VL}^?4*b(IZ?hR7JRGK zz%p`?aIXag^PS|=wc#HO2>5b8 zh5W!Pl!Dps1h#%zGkG5f0i#GvpvBREkx8^6@r#PeyYJB0pX{zCe7JLXno_MM%1ZA@ z$YA8TjV?&3yNwtDFU5m$RCppzL$MO^=DjD+C~zY|B9$vu}+gCK+i2!bm@MFw9vt1aRH9}wS( zzYvF@iYSa510?Vm&R$Jc0xBJ%*ai=1jGn8#fFQR=5ajI-L08~Q-oGHokrRS`>qC%0 z5(E+3q}8hmfo~uiD#%Jf_ptx6nhIjUSI}&pzqSV-pa1nwrq@CQ{5m8jCH~T7_RoT= z3;9N!=*_-arE!1E2iK+5jz}XxT7OIbhioHiO0Tu66Y_Uel(J90i#?0WIBR6h-{)XE z>HavY=BD<`;5|lApcQLC0&a{%U(BaZgEQ&VF`B*FY`wFax9y`+kxyr|Wxh3M?)-iw z8zWoO_4G;omg}Y`qAGzjF5X{ngP$b!py0pW7#k|eAJW0za{LQps%Sr&p`Lw0Sq!F5 zdz0#Vgh=QRD)jPwX!e6RyP#OyO>I-7Qq_9SDaN-+7IEwYyz+@J*Jr~q6!7R~n@k&b zW`jOtwA%zW+3iz@`l+d8slEx|4PfIhwt)-(0$(SY^1z-=kn4YcnRt^dUxC~HMUvGq z6<+JdQm00<1z(>mg*^{(G!?N{b4iv`HbHU*=BD|T@EqUvJ0|?6EDIeRvZy2m zTMej_*>3x1l@DVq`cSCd4}SFO9#q>uy({pdhr|j-miJN=Y&=I}zV5%2rk9^YPP5SQ zzstf;Z`Y`8)7_=`-CBV8;eBR(t4`{|f7&rxC-r;zJg)<$=pR=?Iw^wU6PLm}GS-pF zo?BO@-Z)=)7L3%DC`6l)B`oLZAo@Fzue{$SXi?vX_|zqk|HqS=e6HhLSx6ch!M$tQ zYoTG4`Bb2?)|}F>n#A#qS ztyyV z2dlkRTmc(F8IofX1s3RUHtsO-zZ4d>x^AThE!~~fK;F&GB9`;DcvEkkDZQgbwcfs^ zL=*d z{vdSNfzL@60p49MR)P-OFgSX^N?(2E({C;%gAOtpgcr{zyla;PHa)ix*g5HCWnKIOF=FrFB1k{HVN_%3ocq5>AC!T3qZWpWil= zE|@)OzOiUB&i%AZ=W6CbIm4}3C$OFLe-A8Jl)Z?zF#6o)*5(O`*~Y|j6FLY`HlO5k ztbAO$rkb6bJIv?sK!cX;>asM7M`s1Nb(R(vBd2sfe(3wdrc;XxT<>Co!wU_v5lDCW 
zT~v^_wKijAV#0t;!B2DBke`oa@$>7CKQ&9(#gpjz>%Y5!qqCpyrK+haV%N4e=UHdg zb?Jotmm~e>d{daGx(E`)xiBFW4EZ{vsD7-c^WIOF-0>52-Iy?u5umJ+5-jH>kEcUI z$BJLyHW2OnENKrQ*5%)*2Cmr8(Gm{Qq41*lVZF@XtHJ8KkY?m0K?xs2ZR5A`RrjaM!$3z(aNNzUc^q-ix)3k zc1mhuuBgcQEL)E@`ZpG_;*dd?@vlPqcNvV;)F4h+x?VglH&e9})XP5-&T#w9l;*mb zJYrs@Rb~E4zzL5*j&CD{F|-!srV9NZ@V^`cc=-5MyG8G8>zxQ07*p-ANZ#j)ST72h zRcXJ})Lh%qyBBb8UN6g)=}r7lN5sp+<6mdLB~dnf!+UBbRlP9_R){5zD|2hTHI)8- zT0O4NvcGCZgS>DZn+9)6$Nq&zldQBf(q6;9h~->07Vvm^*WEawnK%m&ZB$Vx_s635 zejtroE;UnXzI7z*>+iQ6fAKW`#$Ltn8`BiDjG=b{Z`E|Fk)-(O3m%FL9w~NJy*CN>hD*$e3#W>@}_jK zdX}bGmenXIr=h;yrk5eH)tEawI~xv1f%*@Q|i;g@mz}DpR zAGBOemX*7mn#Uz3uK8lHw@MSg^)Go-)7M1KgbjWIf_;yNmp8k%mc)K*Nc#0_Qfx|r zhv!y#{Kjue;aX*>J^X8yJjr2Mcw^Cn2WuwDf;k>tyF}wElLpmpUnW!%*xs@j>sx(! z_ycLYL_N=EU%q_N=S!`yU6Z-lb8cz(x}EZha9b$zy0y5u9QKm2w&nmoVe_YIN@+*9 zw00Gp_4PMI1Ox$ZP|gTcWV@440u_blswycbs}_w=a?sXxUq(&lx;t)qvG<(a8Z4$;RRb9)y_~3+P}y zRyn7-ny6T#q)lB}nBlxM_kG`Iozxem(gtC!<|L6eV)S2z#ld!Ohx5qy5FE=^ATl&x>|yiQG|GYN%Oonk$@w6~+C=KBrH8?%E?m%^h&??}p;ub4}lugoW=ruv(%&B(qnOnKg{ zQ5$Xa#cNfXM$9*an=-z!`|~-~s*U>oLe_HjJDT&nEy`Zq78U3(W8dJNF2!rk4Sw@BuW_WZRFrYUrEbnwgGDJf(Q&dzIBJ7wJoY;V>V0|*K% zd#e0Ua13v+PA#)yHonn<$T-ftZ#+4^xjeSLxv;(2eA@D(oHU%yCV3DVQk$ABb?`lU*~q`wOFLCy9#VaUof{9CAL`R`wDoWqAOV+pI{y5>;Z$4FlA= z?GTx4e-F&F*84a!&4zr%hQzal8r1F_#W^* zf4(oSZ6xY-1!9rgY$m^3{Zd@o7O$!M>^JY5YRez@cl-BBwGrLaUK9f9*sw>DIHEVW z;PYb5vV?#3H)F{MT$bwN*{-;wBQGfX;GMl;m;3LKSWl72ZUz2TZaPTz^5x5An8)WQ z>V{_C?!NmwsC#(Rm4ZNnyzSN5*@=q?7at#=wwWpYm-k(6Wp>9fD5$)C9n!GhOn$!L zObrW>zn;2!9hRB^uI`*h0fkW**;gseNXK80nd6gSvs_hm^(xF$ZSF3&`YHRq^58nG zlFQ0kX|&`X!KWE1?)g>@@P_{Xb`0N~I>xMO+JPxya6`o`;Xlwf@n*`+w{PV)zjC8E zo%eL+-F8aMtgMvG-i#tr?0eOH-@}ZE`ZwhGH*Qa+Wl1D|U15ksfi0_6{p!t|Fr)6s z-rHzK$^U@n81`&Wj5mV+soy4%Xc+y}Y4#|(9;dIMR)DbbGE-Ah4t~C|?xl2=S#Exg zy*WEk%Ms&O$H%)Dwyt0LBI26vS(1hN+TY)@^U&+yHN0%v_rM7wVyJ0<$=(342@NVd z=*_x?&KLGQiH|o2{WOZD!67!^otCw49Y?qg>fId?!&11bt1AfQ=%2%I-XL<;09v2} z^-D!n)dz*j1*(Yfs?ugd6el@t0jO{LvLM3^9pulo^CMN@6as0I963g>2Oz(Ni 
zzJtKZ@nzn9iB<(XXdU8D-2Z7b=0R@xwoARKeZ)ESs9Q&YYIAlSY^+@;mBOCQHOV3t zY;5@D|Mm@}#}~$R50pz^#P?SWTf3|&+kAI6+Q;{ze%e)w-aox!1x;5Np(u&oIuH;N zu7GMT=ImT^qNA>k?+MU{{^XZeNrKMokyOH=jypfIi;B?Pny5g_rq$wkTlgpQN)SYN zjSUEnNH+ikkIBDRRl)kYzxxNH59a1=Yf+++`3i}2qN3EqZyoTEQBV#~%e*K-#{mw$ zea^att*N=V7$Z@~4hn~idTMcCa9*C!G{*V$trUbx(>Arn2V!9xU`0JEx`8=w?ywro0^&m zzkF${n$i{geW|JIie%k<`6!ZFG_tEphUdRRZ2Y?+1RNo6fD@@HDJjQ}hefVgt(q@Q zVHjP#`pN^OqQj$pj@*U@a`5C@APEc!?67K;Xuy2w;8+&zBmO4P5w?26fhKq}oJ<#vZtnULsIO>m0gHU*{^YKqr?PiJvC+McZ@=XRT zJnxP(b8~Xw!I6VD279l5haIAqT%N134q^;7v$XW@HEv@_)bSk}Qi48}Zld~fK59OU zInivnA{-LjZ?rA*OiMv0s82QjpBta=+?8eUcF+tTfy;4;;wG`cpeIMD2rT55lE{xf^yx7 z!O>_lup>kjWyY7<36P}v&6L-z<{^mT9L7*XP0d79F-r>zd>m>K*4aN!hy8kY-)ikP zk-eJvx2?D9O43qNyb5+l!r&t zNoB9Ggt#~yD^b{qmZRfo;gt1sIUGRW;vOEtu%rMQqIPpiVSl&Z`ydM0TtLd938l;M zx^qqQy1Oj=`c+EEu;!JD3g_=7(2qhgn@>^)P2&<0BJ0td78_kTQwJd#D>x3o}WVPRdg5jT5V zl-3~w`Y0eEAa`*M$Lj)t0KK!YR+*RLnJYu?rj+hO{J*)ER5I|f3nY!05FTZZK`vV`B!G%ISf4*7^@5?-QMi21(2!S z<K3!Iy)sg^*-(1Yt+my=Oh z+g~HJ{1n)hi;bipNKx-L^zNi*hhQD5{Phy(VX}j$+I~+KtLo}v>#uJvmX@+(Vqz`> z60{RvS;z+{4mK}YWuvG%Kfg%LyIijT?A?v0=x)2!MH`%dkR<<;GA5?$>zDr|1wh6o z0h%dUA2=`2Fhr+dOmPw1Fsv2oz*iP47&iboL#x@n?t4kEoyj+yROUpZOi_LtiZ?Zl zVIIoO0r8oypWI)#8Acq2>?aPR8}pw{ zjBecnkQp_*=ykW{6*@RLxSGOi(LruiE8YmIR>!{kmSF5%B*=`ivO$cfzAk|4m5cJ& z5?sOqMRFHyi#fBt1i%cPMpzo~Lh<{ddfXgEEz0f*x;b10dHIg|dV$01rTbI#LmFHX{3#@29hybZyckjK z@9z^=X_wRj>cZB=MXM3`Md!C~+^V^w!5}@X3ZKu@yMX{Z%~pAS15^OE6vamswY6G} zyWjeyDrZ=Q??Im*3UGK1D4&V!Z<(7xJ886d+^}?8TI+WD+xC1ZH%f#V9`wND2^s94 z(f`RHuoL)8-T3P$1gDgEM}-sAH~OUhZbQn{iaUH$3@a`-mBUbw1-m^s#0fgNV)LV} zJ|0qcdm-S_Y5d+=o8wJ?GPL9nR$v0tpQ8$0QaSg_;P+SVWHfL9! 
z`N%b82Mfmks~J)NBEp_P!twWb{_a{-=pcIwchQ;p`85hi6R_x#Ss0NoBF2@}S~`Cu z6%1I?c@-zCRq}M8_&CE>{;!tmf1XBsR$46Amj6 zY&Rf@kb(7l09`8GQw^xnc|ZPm7n*w~0=0HBu&5LLM=Y7+**TQ|Jha(JL10N$#FQ_cni|o2n^Zy{~1K*v2Q2ypZ1Cap|wafK*DD zew>@PusT8tD4W_=`v0a&!t~bU&Dp#F254YXgGM>Ki^}@3uOZ2P0}c%?nn80ZImP{9l?O4f+rjEg!FsL$byTI98_5s)bw_QiynKD zCb))^G>z;ZCZAtj81~9D()3U=Lz~)s|ABX5YV&t=1^soIp>v%-E~E8G2;zzJ*AI)$ zq!{2OwDve3qN8dVnRROR4atPab2#%GaZZ`y{393tOGB08nfm-zOC|(&bdPsU*jZ5I zh$8R_&%^WP9xY~7pN$N2o-2ie>#?S9M2Yg^K94>SY7kd1e{DnJ?BjguRi9+x{V1p7 z#sELXonhLNA33Ox!}+QFKH(v3)<8=H|n4|zXU6SfE9+rnsrc8CT zGWtl*3^{*wsTh+OV&B_#J+VC1FKkDLdRSPb#3irhxVO-l2^gyiRftV)BPIcuHSoe6 zTA&|Sbl{a9c}3hyIp&{aF(VlsTy9NW_e`{E7!#_a z&cMQ-%S|m~(rhXCICm1EA-#c%pFem7?&+rHfKP7+ua_zFV|zC0=~CymwL`IT6dUbv ziaPo}G|(pQZ=D(f^6Sc*tmJZlrOw!*6eJc*85wq^0WOVf?iRJZPiE+VR06~;38pLr z4Nlq{^ldqRwu~#P+dRHJ$UY{4vg*H~A7gCwQI36${;>>i&yb6k14QoYifug6*aZCHI10VmY+O8eEcMn z4kWNJgp`EFMj=pAAXZi;*PH;8hYx&)w_*h;`nX5i)QTr1iU#(se=t4)y_Xan{kVSa zQgU!iK4<_|4xZz`h}3vQjnU7OD(WS|lM*d0gF`9czp`RbCz-?gK)Cnn*0Xk@vf-L8 zUv_s=TPY}iJ9HDX;D_2bP}d;{P?q5HG3kOIza0eP;s%CHjqD5!l1)K31Bxf)4M=eW z6zm?b<7hzEfgSfIGU6d2A>sN*cSgO|8gmHFVT`b=EYCAW4Uib)nI0Qc+n$z+!_+7y z^t|t}`%qO?Wf$~>rJ_8}r`I@UFR^m&&#&$6XmBbIK>sVV))Rx9hON+>rNRi1HC(t= zhXfBHIF*IO{v6qeUy=XG}ej+ zciBNpUTgQ(7WjnjE~-K=!R;z_!*?Q8Za|kSRN78|e_W-d)u))Pk_B{0UmBVRA^P`s zw-i1(Or1{sM)=h7FFid)1lz~Hca(nYdtV+`I31QpQIco;Kw8y2rf6!%b>AwqqI>{C z_JebDbcCR&Za=`O^@FqKkC$gCX)&(ScIEPY@Ze(<1%^VXtYwBxN6Z7lk3Ky0x=&QO zf%`nS=(7LLeX6Bp8XOpi{Oi;W5(6+|-l}GvRy8mz!`<%(&8}0!{EA9~k%8R!4{)MP zv%yLQrncUt6ws!>%rG{!3}8X>0}0GSv_b z@Y19w@36h2W1T>=2bP4a&Ky7HFzw?5ryHE*U+#nRLe`;!`|M69CpkDBx&NAlDLE%M z-j@bQ81j2IcJLVenCg0Z91B{Oe5v`o>A_w2!e~M527vSEqr^6;m~M+1H+aun5x}5I zm-EC5=01u#e;w?NCSJ_um{#sgN#y~ch!1N1%7yx<#SrS?{V*pvmFq(|~=fp%0rP7eTL9!2EF*k?OG4$6L z{a~sDI48yc3Aor^Lut*H z)Zv|*z=H?dOYk4$e&D>U*g=v4GLsV%4bpsFdp2w5P&p4NJFi49uCOfFt?i1N zd?>)nU`hp|udH=(k3j)D83T+BrFRs?GrAY_wD2-YBcn{?6`Fn_pXmkA2{DoJP!ZR$ zgSZvHmxJ`lifSg4rt}y9HYid@VDB-nKzM8=gUnEGTbO1nfF|wnm 
zP7^sh&69_JZ1}qfkdz8y&$q_nKASH;S>y^bEjYaUIm$FfftBSl{pXBpNiN3&GPGWy z3Du;*^*$34{%sC&SNpa@8xRVst2Nx3K@EeU z$uF;p^Bz&e3HJ5%=~rmwZI=UV1B(8yogJWI!IT&NKC#$PHWEuda&Q3O7t^}eeynze z0VuqI9XqBEgi1ou41#BSK0g3}mR7mElt$@fKdqok{ z{)2_@S&tbHP-AIuQI~t7>4T-{ax=EvlpMZ*N(u9IFYw&HG-3qO@1wW1ET>}xfyxP@ zt&5VjT)PAWliDcAynsS>4bSG1QuQC-2Jaow^s;%;Hjx@*ic$p)SMDMdBJL#JQPhdZ zY9MbUl0=h&W#y?bjsS)lXmfNR(GO<`gK*0g@BySAQNpv1ozomZF9DN{5lpLx$>$s< z4VqGjdJ2_Jfx6B}(C9~}spy19i;JocYATtBTG30bIy;8B>v6^cU9;eHT@S!$16lw8 zr60!Jhtm?a&m7GicON_lMH&=OP7+MJpyW#6uuNc@1PHd^*|#5EXsr#37RNwjzZ>@TVhog2Bl7Ry6@q(w+rDX!cw-vjBAkl^FW&MKH_nI1`snWM#w9pVVoRsEhPaAg)BLTWu)hv_VQ=k+m8RhwZ|Nb3# zaW5sZiI0n>NKb!%tcF_F$x1R!n7;%qOacJZmTgl;fQc1Q48b(m?wS0PN2u^1PYZ7B z@DuW#D>`z4a)%3?&n`%glsO|T8O&}ND5HkX8LJ!Qg7%AuLziMIfJ>eo1mVN_NRvh+ zM2!CV>`}9kpFjCFsa#raEjI{n{x z3IP&6V2I|>odF*_Qeht%ZiP&{|O3S%dYy4UB)QZ4L0uvGvm|A|>PH@*uOib8d zJx<{Q6N!MVv&~$aT$<-mH6YzTdEfT|pD+SF03L&w-T1LVN_EKRHgsR_pFcI?;^KNJ z4V8=xk~_iVNV&uIm|PiSW%+>c@z!wWNk7e~@gwt&wl3j1xq|VCkMlt9SnWhksX)RY z5-S6)f^3o zJWE?!8w44AzS^_jbFEq5_fer{T&pc#Xh35iSivs0GT*O$u~&) zH##Gt$twAPDjpZ+$8zF!0}l8cuT|nrxGr-x3Zl%DPo2&XGgsBME*!%VfdI)w{sE=z4f_q{MzX ze-?hodK$~im~nd!+gyoCGL8(XjX{axcrW88-}g}XbruW*75gtKX{#b$4|`R986$}2 zYs0k*Aw^!FDPFi`h<=l0j(UcV=F*;fSHdyqmt$u(zlIm}>FSJe1!IHT0_$y7dZ4Zt0uBp6+CxlK@lboS`#h|T$O~UC?$cFBO0}VdWfsI zck}*S1 z7g*7L=xkZlglRn;pKv}A5Ncb0O6D@DvCDVs*+tBv6ko=pU$u>%rh4+sb4J0(I8F4y z0Q~Q9>6>Fq8los`H z;SuFY!06nb#3>zpT*}0fHf1=b5$8z6+M+N&-*<94CB9GQV;t?S8&BJNREzRZWdaXz zb`+-fE?DSRdt)tKbrCjq65Hc6V@Kk?HQ%zEVVRQ@J3d+LGIlq3YS~UiabziZf55eE z8<5rUEeGSEXpH6Ai!*q98mc&#>o6*n{rZnK?yw|}>qC@&qp=W)V%F9#F$xtu zHhkzWyU@Ip38aA%LqKFYbc;?W<={GZc(+VF%Jp5r%{}_i(!F_AFPp&b{A{V5q=UR^ zr=q(`+Rf7Hhv8S|I!wpQ5?szoWTD&JTQH~G)330fc#{1I|D#Uoq4Hd2UeS|Fc?9<2 zOFnZ)w)YVx)rbDOE;C0)Cs$GMBurgOPd$kg!tRmsQ)UrsJXeKBdytG}u6IE$p9gf>8+rA~ySdU#7eDdqe_%nr&&tw*gdJ^_4=!O0hdOSiv z_}CmwO^o{x&;eoBi?)5N#v;LVKp^Q|IPSAz3_&FJ&OX-0S2iGe{f!&%*8=T1|Lkgc`9H-mI$XMsXl2I12h>LbhcuDf^3 znAi{mwpE+6xs?5CbCFE*t{jEq4OPHQ4K<|yGI6ExD0L)HkKNe}?Ktr5H1`3Wh;1{L 
zn7^9JT3g@TsHqljHaIrRwXK;_(SFOs2UNQah%wb@V0OeY2SBkCU zf>(}2YMLs1T@j(H*%*f(QyH18kRUFRAOHF#D&Z`0oY|KRY>whuUxbUdQ<4Y%fInf7 z-dc(IVx4dKl4GHNf}k*5%lL+;%@7s%IwGA2g^iatQoy^-d8C3tgqC*MvngJ?Y% zms=_#f($fivd>&y^3vFhh)CuR7{JDXAcJvI9Oi(P8gTi8bTM{y`HgazI*v8jZa@x+ z%3duhz$B2IWUpg?@E*m03pGL_sQC9C!iy^tyP$Xn<^bdmfxP7?ZU$HmKM23I5q`^l z-6F9!GPZfT_}I3m5(^s9t|d#Hx)D4gIO4#MwKQ>cEF89OhdW0>4HKszo%MNiJ0@3P zKQ7mfM}0SSgTU{53wZ;Z2P-G)rYBtIkI8v^gH>g{BFZxQh40@12{e5iHN7Bs`M$A0 zR;eK^^f+&87xbt5S10uxbZL`y^{Rbq7XFG9i++?bU zQcc^vewXb1=Pr?{k7L5d03+SK!zGebL6z6hz1@FSQC*<=Hn9K8et?7u9cy5D#XTwP zY5rOytI*~@i;QD2Zol^PLgpMJw3Y6awhM~m6EL9_dE<%i9i5N5Sk?5<`8m^R=^uuo z@qJJc215fC54Bb8xR{ma5l-%wGgr_75yE5?61-f}{0<5pVf@bDvidXaJs*u&>m{LH zfJ`|+9e2hSRK9ZPjH!)3CyaJ_35lNp@iNF)wBt}$d`XvR|D*& ztx+OV$_EjCTU9Q-_Z=x?EbG{h|NM;eBt`m?6YXKJ^?X9gq4`^Ij zxiV0oUuTrQ!XqreG9mwFnUDYzWbSM1=N1ChM5WP!6f+YWHFCM!*YlOf3dYwV*Bj`s z9&N8Dh0Q9h*Rhxd%s2zekXz|F=JKa^>#Ke4x9CN+eY~R#DA4k!EqebB?|E8=h37wn zjyCucr+!&HNfSE%<9rYJVD1OQ)buWQ&oQH(@gL5(4#*)BC6E@5H|%#P!-KTM+*nmEv}buI(U3{j=kOL{J$&G2Qe z$IrkS(7b8I)X#hKihUtiG$XXJ4)Xp);(y#x?@<{*KmGWH(HEh^XZG<*uTepN0qJ9D zfkR!>3K|?6uFGJl*a@F8H7J!5Ro`HJJuQMqI2)~#8R4}hb$DY7wi>d+we^t7{y}#e zH^OBb736)_5z$fimBy4>>b&0)@7fjJ6a>KG7>n%Bn4{ruE%w|OvK#Wz^D8~fXDx(! 
z(+tl1a>T;y>sL#p&`Dpk$FlI~X&hL-5l@DwX=Ha=tjmBL1t9MTL~IGUiGSv#gvyUR z7aguf`qXat{3_YYLM)%;^38D9_2vV@)N6ThHeg@1PWL+%V<$>{@1!+R7scRBHsX(ys8N@@q-Rw2N~>It0nHuq_gz$4R@V120Er!dNr=1~=jXaNg6fS)EOkC!??Q>~ z17dt!O*HjxNOr;b~9O-*9Np-eh1)nZgLXWP$KllA#CG8kI{ zbAWp69qRcm!0zq&6#op!*>(Rkns z8w!)5!#hlOUc|#dNACc0{?FZ-D=JiV8k|bN1Q;0m>MZm4W3dVbMaia)qn}IZfT4(s zsxprkfB{)uX%FhWJl+Zuz3f%aThc4n_ql+rSDP#1TyQJXtJ&rDbSexGqz?QPtDVGp ztczfdZ|t}`=qz;SOpmE_&MV)=-K=chDpNtURE!5B!Z8Bd!T&+1ALbr+asqdDIjB>CsWjlSIWwLF0ndZco6kR*8s7)I8r|8m(a zm$-(FjV<2u=e7f&0d0k?`BGK2wV5dqA^pE6y^<7#xH<*?+ub)uD~@95U|;djsB}HD z=#1rl2=O3Ethe0=yE81ngorzJuojFbmcJi=l;-&JHT2iuRu1DMlWQPs0YdMwB(kg1Q7N7@`v8g~V64`_pr^v@+1&qgI4Dk69gHslu33=<_DVqe`L{c1 zP`MT>%o{cqJ4yeJ7Y66NBkRGI{F5d&aCz{lASJwpMb*NKNE)*cFOgU&72Z*RaTo3yPh7wpzb7vXqO z3j)*vZes#Uv%+Cp4QQC|d-j=ECZ z;&uN!#`RBv1T)g-;sMsQ^G$FheSzQ#42`pv9Nw*9x95<~CYm4|Bw&&~pKVO>L9x_D z0-IzKbh{5zr|&fkK?2WK|Gk1X!$siqu(8B)>sbYkiy}$b-+5FAh{kh&0{b_77p#KX zS3dppU=<@p_ci%@2`~XkF(o>{AYGir-N0GtvCEPT3Y2AThUS)r@avff6d~gWbWD&I z|DwG6hIrW3G{qPS@gDg7qTFkxOJj3>{_QYe!3Ih8X9D+ob681=+f3JCt+%eFFTm&* zx#!y6nn_}L^a4wWR?zKMb^}}5$>T11t%b?@>^2?0^SE0M*oYyN4(Nj~rRsIqrRbu$ z%J324+Fh|3r5K`~f!at47*m9FOLF#A?`wp*_UH&yJd_1$3DVKYgql4N>MuSBDtrJy=WnQ{aSoKA1mZD3R-Wja6FbrJ;cKrF(;4#EqO z%Il7#cm`jh5eCiz97Jwj9-@4{+Fle_kw!ZiVC@10jU^Lngx^ik&2706L;5O-IT}-} z&itgZ3}z!=^?Za&+4$FI0>wITA|zOxwcZnlW3GLAM-J(GuJpZjrB*C9zyi%WJdI)I zPE*=jO{>fu;q9%2wn}iE^oKWGbehA|KxcJopj0Zr2mkjG)1RN}^yC1ft*>?o)u~X_ zGkUN+Y|uiAh9ky5Pu~Xt;D&Wwf)QnVSr+49@2jHTGqM}L)d3B_T9uZsuXw3tz zn1S;b78d^X>sKofr#Lw|VbcMC9!Alu(#8UE1W@uo`;=8tfoEc30&Yo!H=rOzJrmk| zhyi-T_=29GU{rvXL8{7DHB-%f12WwE6^w?%puJ3e{?!1b*TFDgXC9g_0|aH5_5%cS zFj)z1oQUVE(SlhElw}o0MDUV8tOr^Xf4UMPJY3=AZ!UGou_Eoc@V7%Xcn=T&;AoR* zFEHKUMqj(MIUwA1`~U}i<$CLbUO`Ijb;AYJLcr+*0geW>veG4q9S$BasTFf|qy}7w zfOIbjoUE*_4P}t%C>9p&ZnYtX;1Yk%3m21nULG-^Y)!nL%Cb+3{g(IJ3y!aWTG&jfWuTbeNM$^0?+XN${s`@yuQj$`8!Q% zT+mDM?=ASrF9-od0r-IYHbdN&r)9BLSt4btk0z8Vvmzr+6XN24no0!vQ{a8T+d$L~ zn3;lWJ3J|9;qy_Sh74u$M`)Cz0cA{W1kB&*jEjMl=_~QcONuu>(nX|8@8dnWcQ6Q{ 
zW|i9)6oPKBff*p^g9Zntz|x=sWwj?5&$Nzo(xe=vfs1oK;D;CvSP2$VK`*67-v_D- zN+-c3I8N?`57LJ)5N<~}>pH#RPL9VF4tq`U6=t*e%8za-Px!rw8>qx-R>9weSvgAi z7+uEQN2wtLx;B2hbyRRE7_neH1VjfAUsL+De;tPldA;<)N2jg)$ebqs7u|Vse!d3o zOu=q3x_$iEO%Ma-HGsYb^l|X_F2ulu0th<;9Rg!Mi+o-eJj|v+?Lf69p`h>wL?#Wcn7Fw3Pj-%2RxG42TVpf9Mg!u3W`q%D z43&~p4reW<%E3Eb9G;}k=jJaz0L5r{cHzyT+i)yu2plS|jq6|&2qNkx4mvzAYLr!3 zX+M$3B#x`1tsPolUmtGS6~|t>3}x)xPz6fRhxu|a3E*V1w3ujkL=)isNbn&BWOycA zIIEUNdm44l^S89lX78(9mz~E%fQn!%J?K$5F%&EKHIEF?i^%;^c~z*I@j9WE{eX&K zof%la=suP;4IK&(z2i7r?9r-7f3u;Rnc7Vu55!s4is4gs-Thmb?|6HUM{KPEK8r^0 z{U^ci{*-^S%z4E!j`bQky7nC3#AYX1XgvZAfyn^bnM zX+OZ?NdmgKwP&LH|Frkr@mTin-=`TVD?1UwC3_|#;mVAx+sc+bvI)rwWmQC0R#usj zQ7Ahqo064;?2+v99Ou>j{r>LXc>a3+e15%p-MN$VI?vDf`5edlIF9%6J|O=XzRMW+ zH75DoQZqcp+8xgm2$hwpOlq<|R$w}R0P0kq#v6S%s)^vPQ=9`l~ zc*q6~k>oos9y-QuCHbTer1We#??&Ucj~qtDWt;X-7cllTz1AB`qB>FrlnUTmHM`an zc#DbhMXV`J`Y*Y=Cb6VlEoXpar=n?))9|T-2nEe0cb9fKms`zl*GQ0iWny2Rrv?;w z9~O5^hm1HY-MV#H_!=6b|E7C*tX#N!B|0p;q7oFF37^?fHu{6Nc^O~dS`giw;zzJHD$(uFY@dkj%KLJZ&K3w$Z!jxUEqC735cool^r zBhBKGr9scd)4}lG4f&FU`9GpY0=kHtk|({$0GC06)w%NUuRVMOnYrN!xqae~zKO=1lQEb;4Ol2_H19&Yu`}SGK1y2J@M#^zm5k(_?Cv zY-fWcsl!FZkm%LZiNo8<_yMCJr~8LGc5zss-RoPl?+Al+MHv=PA(LM(yT)AHVq2%E8l|Q+y_wTv zRWjdIyl`O_I`N(*7BPR=jEr1^CxWzW)IEmutyQ}R{q@Fy`AZQ`#+g~kBl|{2qf1Lm zwFhFMvkMqY(61DKa4=GtKce>^rCGddE0(fy*Ay-ahu>?KYvLsoqvLC`>$-kXmAs^r zz6@R=ic^QSo{I(8$V6NeA4{FkHt!PEXzULb&*v?F^4*j1>bEm#n7cYZ=#rf_CwoA> z19DJpZJmr6bGsbwASeP54SowGJ2n4k00N=Xnaa9*%)YFxH{N#=*8xgD2hUsDaf@&n zYTvy-vc2N1@^wia2WM+?O5Bd8k{Q_k3T>~@wRDykEtHtj?t7Npr{MW=z#@Xf4E9yj z&6RD7t!ldu$%IhG`0#fbzFOF3)w*(|cVr}GL(p;L%|9#h8H~DyB$QgRRW#x&Gv!Y4 zQ(Ap#!aY1%j}7AKS{$orOOss0*we7BLQ?gaz!{KbKJ^*zmY<5hfSlpoOVzwzbSb`; zuk-Kd`P8*Mw0}tS$iR*G^D!r(bBCMhyB^$gw}l?deWhO>W0EaUPWCzf5TN$rPZUSN z(TP!-E`CjG{oGoU>)nuboQa`fH#?N|O7GGBE~lpoKQU?Xz=4q8OHDRrVZ&&jwHk95 z5J!LF$esMoZ*z0GAWOo@$)WCic`lrWs>O|Xe~K)ClJb}Pq1j7JR=9o&Mc7lZ<>QZl zI>$#xR;<5R_^hk*k#SNixL>FlF~l9SX-)17;b8GQqAXCK*geo 
z9=x|@NR$CVClJ|E1R5UbCxLj5=+l#5oqIvl0d1zesHjLV`@XY5^JPl#hjtMIXSR7@ z_bsO&*M_N2V@E=P%F5JM>6)*U%Q!?B#a)>KO|;*amv4_avxr|hKe6>IF%XE|0o!rK zFARXr2d!7{fOZdE*l53WR1OJHf}pc|c97&I7dSgeMU{^#UypaQFK^{sfohO197UAoB(&_Kz0)@S?uHBlqCbR9e3F$3}>K*Y|S%SG9^_nw=b zp>C>n8U6BO7r5d;Akv|sxpqad%DH`7-IkO%3`oGB*p%So`q#6ee%S}aVIhYNl$>~E z>N&@#q-E{g+$g8kKJ8z#z7o!rz%#|E;c*|5ByCoV179;aj?SMrdb`|ZR06vC6pQq% zMxNXB&Wm`aJaKm4S+g|c135Yn)Y39A;PzFiG!tS#!JDt|cHw+tI@Nu%x@tlaZ7%d> zgyisd7f5N(&dsUo+SU$K|mGXoH&x|?~TJp#W zkn@^vd}H%2$({AeKnTH+-qBGq1VxiL61;;x)g>y}7q*oti{J9K>(ztHJ$bclEn!J! ztdhklMx1kBW-7U>Cd|W^-jy*Lzy=}!j zpM#E$4t_#sYu|hKawBEOOYBsE(D%&1#1wj(#IV|#@#^7~?1qL@K$t*+L0bYyyHJSJ zw-zlMw3^e;83X<_CK9`P^^OGIUH%6-wNE-*Lz_-RgaV0FWsUQ*8~AsA_{r?-Ovkg` zW#9Cd1!1nPzAvP}^j#W%MgQjQ<`QuCI#1~gKmYsOEa$S!Z7ANk<0IiKl>xgOw)Tt* z6SndQY-wiNJJ>I7lCT<4O-=GUfsMjtRD^iYOpIn8Ac+A1^{x%838m~`XiG0}`U%i3 z-7w(jLj_e5EClP#O)9uOU>_z4ns-Ob4YVExC9Do|UQE}VtECqhse5Se13Mavd|hm& zfAxfP)tAw-X4JC6u$@o-4ORU7iAz+_X&MaKMpzt>@jFHwHfCv)2;J$;jxX>LsJCe8 z>vQ?47;!KhL#e*7{!JEI{^_FRwp@=rm8$NEF&qQl35JI!M5icG+KXL{iY^H&rnI?G z3F>uUzI+MaZfTXk>-s5ptX`S-e7%`q_4Ua(l{29S2Z4N7t7G<;!zU)p&{kqZ3o-{l zFoXv?TCXg1-ik~$vPJl*WPIDK8@4VQ#KtsufRXR=W!Y!H)*O?w-33&{tlC;hyRm+em7|iXo z3gA&+pBsMgfI4oEdPWF+1ebL_2kLzr4*E32{ZxG`s`msw8gT;w#N5I{6-WTS2-tQ7 zNQ;lq@f)}D44;5_{qM8WtEmgd1@gB8^XXNdPR?QH0K8PvQ{A!WgG(pg`y{aeuwPF! 
z#t>e0fAW1`p{FzPfT6{msc{ErBL38PQyL5as{AHs{ zdQG8w)18!(T}=YLs@}Q;o(Q~+l*{n@`~cxw969@q4GEK~uZdsQD~yH*t;jmRZw|Xj%FKG3;D<*+shpE4PXk)EN|Pwa$>UO= z({Yl7ilv?(EifetlLxXohzWt(kxIPhZdihq@{&i1g9a4yK#Ali0-7-Ly1EHtY@OX; z#jGk+=uhU<*7rpD*~TDCV`|tUFsD)1rnmPkD@TBh&1Iq`U9H{f?1JA)G|K&PU?2?I zy6_Q*_ksP%4vYc%S4(#_3tFDrytDnK@*BG|+0kkR=Gm))T+&EI>EzQ7?b8^l&Um^J38i|=EBdw!W~ptxbX1Jrt~Dq@V-**+>?+gcjoSWbWxF=Tn*fOkdZM4giM zQ8_%0$_hXp6!J;}|2icBgVf4(GC8c$iyRZ27ojP^bM}G{`vZD1b@ncm0SA1VQykhOz9k~cZRY)yC#oeJT;NF$HHJ;CmOCp*q zf)ctF0s@N2O5>z%Berd@3 zPk%UZpgm{vi(asF;G7lY(HR&R5CoXjS)}K=iq&( z;Me~OfhzZ|GcJDb%LUacd<3$V7TxLR!BdLV1!i9Z-HX%-sXnp8PYNPZ4dMoA=cu%| z(2uB8gK`RJTz&a+ddX+MP~bsDPHqYaLje`u=*G{HFBtYLh$UYEhN-qZ)lvRd|OK_H{Ax>Lnvt33A_NL zbw1OVG|=$m5t*u2HyCpkb`R`V+kL*H?CintDrWA%t7r?FdYAt~6uPqh_$4$2-eEVr zaSXJ?U@x_Ry@PZqszj8bdBoyd^sj8iNB~>CpFf}VEu@B&94aUW)!vhTk+4d6kwDG} z2?kRMDiB?#9Q&c{1yBjPLA2w0@@G7AwgVwbt2HTAm99}LnDzvNlNt+Ed+qL{xMWDb z{Vdk0nHFjQe+|T#7->zo;<);V($R+lSL9AXCj*+|Q^=+iV>*ti)6}&jg0Y^4vN59c z^0UObb1l>BF}ue`E)e-t&zS?}XcdS|j1^^zipoc=ejn+2aYl;EfbFu6G#}l&N1V*I z@1nGq>@&^EB0HuTe;%OF*w%P{q9FjbS>CN@LymATwJ>6~gW@N_+`yy2$Hl*l7aCQ5 zKbnBCXR+5 z>(%_r{Ir*-uGYAo+O`7b6DT0&=PRuU*s@ZKp-YTTCdhbUV@ZpTJ ztHyuRU4q3X006^^5mcCCkKn3H9ravWEW8PwiQ0=BHv3YckVt|a*+*_qMAQKEg0{wU zdPz{sgOvZH4e!7H`U|=R-hUD&yQCcsqIx6F-3#7&zk=kz0Rl*Pl5y3VtP>niaou+X z?3p9rNwB1#%m_Ld;G+6%SyH4vgQ-R1MDg1S=H{$htCOtTi&y@rN_8^*;LZ%hk8Gj( z=mcyCv>hNORUa0Y-6|PLDb_Y{R<`$X5U0;GR= z%nd+kY<6}whN0+vwe1xsZ86>Vg3zPRWZKN@3&c>^q82#Dm{_aVh}~Y*2s8`>P=O>X zyw~~l%f=VjX^u`o`y5!nM{DAtr!9Ui3_Yi9fi_l1Q$1e00r@oOd4O)E%Sk&{AZYim z6J_^#LripeER|H005%OTx-24H5e}6jO%XR%_NSKLpo%*(Dr{6&nR`&E9nyoV!GthOx9~y@G0q zzWOEGmnUOBE&HI^9`(zY?Xd4=Jern$xNnXtK~)BduuUM#cBAy=OD5>xxH1}a_b71~ zlt&k9dxnOFELXP$L8BT{DI25I00R@8_!n8~dyiu`2x3?W4VKOXgH-HRtruv^fcj1@!O zlXgNHnXsIGJ|-ItslL^Tu4a z6F;b1y-qCf@Zz5ECI<6gn+P{MJcGSi2U>60v>THBv>28fk2@|h|e=*OwtO~c;wtjHC zP)rwJP=G=Y=;qQwQ~*8c?`?W%(IgYARXyOiw) zt#~_GZ>}m(y%buvcW?<}Da|y9;N&D4@TaI5jfEDi_?OB${)aj25ZP$oZqzBlg4CU( 
zlap?v{%ldk)l=jhwd%p(^B7{=wHH;Nd-8(I#@RYDB8Zo}mawr`a?HQ$ii>>nl7g58 z6hEnYit?+fXrRT(y(E4|Kb}C4rLy^yOdVGt&z0K`^KRSs&Se|shT?NaCj<|Wo0l=R z6BXon)WnN8FDk!rP?rBCgVEzIAfQlY|Gq_B-Z9nY+CgdM^wow-*433yIYnP~W(y3I z&+3Hi%FDunRC`Ackq=&^l(~J6w4JP|srufdXDoH*>#6f+?xzo9-^>66!)N>;0UO>)VoS_*%S#V-qHKUd%)Ym%SKq2BF%?a3`Pkeh^=- z^RVg9PXgfqsM~^n1O(7d_W&*ZoaE$W&v}nxaZE>>=tAQ6su7UfwY0a#tb9Cqj5r;F zg#NeshBWIfEg^7~xsi1wJz$mgci3dfGS zc9fdF%0CrtR@Q;`lIlNOZw+B_*jrq9t}r&DLo~$avi5v(zU8H??M^e*_r#+z^-oxS z65Rkh)(*%hzd`nod*CSj*gjhKSBZYSV|5Gacw6E5qPS@I#i0HS$<=TUeN+%PaYQ$* zV(Ie>GM`sBl~yw|Gb!wB2!e)$f%z-Gu=cCXyRy14X3x)!Bz{GlgB_pNSbU{OZ^}=( zBc{f;ozd=%1%q^n#Dr@z8Y@NRqhK-Wt3-lBC0HxyO$HB=(ih-Q$oV^l+P=#8AX9WU z%PmRcR>oW2S^0z6?}&aBOL4GW|3{_OY?+^lX22jr876de3IzinYCU-&5yYNTgl^Lu zMygBhgTH)F#QH$fBAcAtt$uBFNbI3P*x6dp_IjG(7(4Wl#t+?*FJq;Wf0L6$WT#R+S1G74MgKGBsSrW^19uJqYUX5Lo)Lt)3PVC2YEWdi-jpKKHir ztiLleGAQv(j15AaW*<8@iwX$^LlCD~+j>MuM@Z8vRS<(=5NE&PNN)y1P3FHS=wPvl zG$3mMdW4|%?$JVxjS(X?U7FwZTy>cT6`?O8?Zb%LyjmZ2*45OX<}ZD z$H5lQ<|tzbrp}k4HICkx`7Qa}NY^atrlw$Vi~Q0;mBrgYW~l$KOqM-;ssN!I?p-9} z7KKMVA+8+Yxkff#R4;qkHFA$Sm-i`ow|ADaS<;WwGoNZ0_EvO1ldk;gXZc z#LS$&zSd;&2lXMBssXlF?xkVxOY^=(Yht$g(cm;j>znIv|0HAZb3C_!=E zjMXWENHRh#@4@d|Dm3M7QsKc_Z*Dx~0#vMW=7DF1+7V*nUI>(f6>e)uh6hG5%+sHI z4JsoLet@t=og6t8)dwr|m|<)nvjo;!nKrgU?%B|S`$y%mxwWr~SLu;2)Vhq9&-=&Q zR5A8v`gmJrwnSIw1Rf|maYDsv^Y!-)z7w)^{3}Ml+S5M!=V|)v#h|W#PddiIsLGxS zr75BcbxJ)2#!E~|>rhPp(q#oN))D>WD64->{jfa<%-1gRd&^bS+NW%{gdJa)FQFlnvpz`Htf)%jp=e`L(thBps)rio9mkcSEe5a ztf0wnuCZ=Z5D%Rf0Nvrr*4ERH<|YGwXdHlkDms+{-NG_!ZwL@2sjsN=80yW@2^*u6 zzv)mp(U}Q;ENoEJ{zlp6Nw3rrK*)#$-#XmB)d9Mi1eGH(&&nE^0DM6+vt8y*2^^Uh zPGa-&7);54y^aXE9vOvydICgW5jT|myT7%J1cTrCf)aYaXGg+m?;9J70daJVm2o)S ziPZHSxR1jL<^UiCnGis=^@qRr{UE{GH#9^9A{J>MB5wSc)u_!_SXfYQuJKu%+R7h1>be92(()Nq!{s3GV!tE>{>{&Ir!`&Vby+(6dAdgCs2~v$vazP*C z^XH?8{q^e|PGt>A54O!=|G^lI^*00*U;)1Zh~F713*y{%WrI^hiZV_6ioIZN!PkCu z!8XXmlsgWcMfKY*xw!EEi)xGE8cIVkiH9Umi_i2_rXydy`a-6B8re}& zr(Gc1_(mQb6=km$ikW?r4jZn^|FITB{PK&eU)bb92p7(YCt2623PMk$CsB)Ewl$LG 
z55BjM7T@)HY3pZ?u(51-{x&p$OJx`cPK>4Ni$r&>I}fWR9YZ%%vuuAY}OlIbCo z65FrV3N%kfC{Q86@+GYeWKQM5PWtfH=}-WVHL|mgKOx9`l3|Fm`V1iRe1-QzbJ;4n zp*nBhiww}mWp&WMVTq&%+CihW4(qE1JPvP$YFQbE8*U#{*MUuo!*7^00P%4=>}DP5&Gsx2|qmKbtF%b zd$Pljp+mp5Mjc-4?ax&!qOfzX;dXANk|9msDn;{&V=b(j2b;5tm2We;83 zd5BJ%=V6QVi5k>+wpU;c)tDZR;la7%jMkyl${Rpb7Xm!r7qzdSkjvCh7u@dPfMqr` zL2V=Yiy8;${y+aJ=Ycc=t9=;~2Ot(Lwjlg^Qt+2^&8V_$f!ks5GpsU&HW0FDLgKLefcnEjtP@r}#y4Po@{Q)N(I zuPE%Z#|etm!r@{>^Ldal1Y}4_$mURnz8P8^pwNWA45+C+-tY$59xxhAC}hfE(4~zv zE=d}<0HdXebEX(2NSD*xy>lfVc!(a6aAFhPaUazNdbLnjIdQ=pt%pUS9~O!GFdnW{ z@@GI^LCODq|M+W=RLasRYM1HmQ-e=h$4%vvx~-56`S|%q9V5r#x$FaR_tx(2NCmWI zbUqS#crhGB)RTnZ8f__CEC~YwsJ8ss_I{DN@KOXDQ&Q4gLN01!2v=9e?+_L0DP#}G zn;IXrvCJNOh5o1mp|*^BJIw#P8_of*5XH!?c< zWI}k!hbG{no<}eo%kll?Ut8^bpM}@3o`EyjG~!@b|03ms-;Yu7q@nSl_Fep$v){a$ zU)&INzO{FPT~QHS1jyT5zjg9uQqthE3cP_cxRY^Z#K_*9t`NgXWeMjx-j<6XmMfJ@VcHo4+BNWmFZs2&ffod95QG~}!40yqVHT!&5Y&C! zo3|jR*B(KvPM`M0z;~x{{u;i(lHykQKfXE>24!q08RGDgD$aU5?}`uF}4(fHqVdEMbUFt`!Ow{~5E?H2&mI4_K*#DM&Cp2pwbRv1$wZ!Qqb^ zMpct`*5hx*Y)=GGkVg`rWP1Hllk+7msLV2oTKmSwVIXNIn$xeB|U#||zKP#EvkWo20mzJ?AUJmm4C zpX>!c_RvX|P(~>%D^muZmD#Od-jFCDon%E^L17=AMggV%9=SuAZn?uiJ$?PIhG*bF z^8cWwho~a_Axz8xYEAU;@UZrP(ocU`&^p_^p7tTBx)x?0mcCk%lG*xX4OPxow9>oY zMz2WIrVTArmV>Y5{yxt+Ipb+s6GtWstH*J2ipT|F)+`88~x#XCH1Jz*bs^ zPEyeH^OEWi>rv%Qen<4eI2H&M$i;vFX)_qzz;o+Lwia$&j;ZfB5d>-wfJ=S|cnF6G z*RqRo@@p^~3s`Ny@bjaSy(C;dhr`?zHUHnx_}PL+w!j;jf;Z9*!z^~I+Th_L&>1@D zT$bTd(`e}3oq`#OF^j%1*TNP}kfHMz2OYgXZ}Wof%pqJKJTC?3N94!KXk=+53qCR^ zyM}Do?QWa|sWouzXjc}9Xvxg(!goV`Cog=fd6B&ma^xF%GQ7)UxE{bx6`D(^Aa@J! zt=jp|!n!;UIlHa_@T~7Lko*5bA{s&tI0lS3tAFMM=s-C-prMO%+n>ND{E=pvl@JDu zMKmFBUFCD{OVhx&-EpyeOzz-fUZx^>Bs4M#EP3H0iMMZiL_!Swcd+3;NJ~qp$@fZl ztd9+@aP-wUiE#;n;B;ou01d5MQrR%e7loJR5g}iGuPL66G-&s@wHC6uYq$TL7tTZj zZf7cq6dMsBBqPpi)iuzj?`*Q#2#V3kNeUmwpcM?(>W}$hEOHw7cQ?Uk~H7No2Oa$t4AgboHbAkM0dJ)*D~!*^gKE_wNfts@e_Z{c|{@-5vF0! 
z5xMAQfq>TuV>pWTi`=;In}I;E9LV~zmAHQ1~$*S~e zJ`u?Mp6LIn6D*w#-X9Cqszq{h@ncKfjmk~Krmt<7 zx+RfsHPyD`9?Wb7CrUr0S?&*HQZh2(BbejFgJS`LUSxD{PqDy;)wD}?L{d2wodY3LyCaTr7FdMYaDP|QMf!88bWk*4IN2O;zSx|XbQP13rS@W?wN|L<`B*HpkNX z#m)T_6OY^G6l;2pAcopH5B!4FI1_hwwH_uZ6v2SGSFlYUD1rVN+*>C*=3%eiUz{!h z3)USltO@7-x&AlqLD4Dt>mL!{|HeJ&%&Y(BxTj4hbiyoL(Jaamh5>gGWa*QVL|`HA zktdmGg185R$<~_B8it)JkQm^LhAgqC@8)29DPgmU(vlu;8b4$+xw+aj_lZK7`ts*$ z)4MO$&e9P?Wyu^c)8w3U7}a-rmnCRUs@TS4e>_%X={y+zZ-4L-4>MEf@^^JNLkX@I zk78(P&RXc2g~(K#Xqz(7-7?xZ>ZmPo=uh(~>B$nLX`*!Q;0*n#$Qgda`1&ZfUI=-* zM6=5SIX8};%<896qd%K<`SwI?Y#`aSphZ7}9; zqJze`kZ0+S8HivY&G9|-ul281iKk8EQ&MfQc`N8hM94P3dIM--pxYSp>Odt&0gu~6 zHqBCciyNzeMm+7osRND2O|{h*e9nMrdVup~#gyd!79&3~fpNbOr4?;HbGH+ldGB=h z8gnoTD)LQI3zbAB43c~7=}9$sQIs@}desx;iZlNu-nnb}$oGKKY~KtC?*0^$+P1Aw zB;0k%Sf_5kI>&A@fszC7BBfU4&O{=8H)=3|LkpQbvz|uoC-Y^uX`g8i`EV1qEtv?v zgIDsS+dC@!rWVG#(EPX1#^z6t$V=c@i@teDoQ7Nr6|5x$JG3t#*zx!#TAk(JcMeZJ zMCH?wccmRV!#^Qx#(?+@nqE8_8DSy2!5(}+;H(qftpI)chuTDW>e|Dbw8+KYkC9E& zw?Lg7UajgMpKc?)_n_*D%5G>{mL5GW(it>8<}xLNy!o;&$MFqSC;$@+BF=DarMuyQ zum|vw4Ry_T4O|kT!5c*92q-B@DOy~2)5c)S{VPd$#`h4pjYoqZM4GBtc|Ezj05Xl^ zILRHacH3|CHE~Pv6TW!mli(|Cxc)>&CO5C*Ax~vaY(yKQ3C-KIPJlmN(_ZM1uWNyX z9}`LvWbcy9VTkCdU`4bH8>uM?fyJs}xtoG0U3mSm0`zNO^ZlJa)$IMqtr7NtHys!F zJSEE>PQpj<73u!dwWhL^_lzDI0H%1Z>llM5t6>WCqn<3#h7JWLkA-(S?cNG|hU5)Qn2Uw}VUp0` z%AN}+(Xnzc-;xiPur>K)cYw&;V-|2qurmEaB#Ch;S?X<8O?O z&Ok+D8stl=Q)vjKnj)0|dC}r>N*<4ao@-p90KkGO5$B8kd@sJVUo5HI+CPp0`G z*FT|Ur&a~bD`cu}TC`6FxuaVDNeIYaZR|%6+26qViPpT9-Va73*w>9hhF}!LqZ+VOzm5cf?w&$QWKQajnK)Pxdj(wd zZ=Eu0%HyrDEVpy4SstY3^FCP2{Q;mNbJT@ZT>nfx7^?k+r*^W87$N*cP`>1U%<9Y; z=&PNp2#N{e5JVW}?_Cv{i~eV3qwlU`cUZ*&z)!eS+5N}C45ZMY+_l5J-&Z+T5X=yw zP^9?bYd~I}0o;%XE$+AUCaB^hAn#k%zpfQ9-ZU9lNud^KWYHZ4o=s(`Qkom0>8lW$9_d@9p7$*g81dNC^rGx_Ml8w6XT`6oij<@N~8@ck^%* tv~%zhbg}et@HDq_xNa*hBy1^o&C*T)E?~Ep;S1M8R4-{_OBF0a{||S?lgt1B literal 0 HcmV?d00001 diff --git a/docs/inputs/support.md b/docs/inputs/support.md index 89c6c482..af3a5e90 100644 --- a/docs/inputs/support.md +++ 
b/docs/inputs/support.md @@ -11,28 +11,7 @@ to be compatible with MAVIS. ## Job Schedulers -MAVIS can be run locally without a job scheduler -(`MAVIS_SCHEDULER=LOCAL`) however, due to the computational resources -generally required, it is recommended that you use one of the supported -schedulers listed below. - -| Name | Version(s) | Environment Setting | -| -------------------------------- | ----------- | ------------------------ | -| [TORQUE](../../glossary/#torque) | `6.1.2` | `MAVIS_SCHEDULER=TORQUE` | -| [SGE](../../glossary/#sge) | `8.1.8` | `MAVIS_SCHEDULER=SGE` | -| [SLURM](../../glossary/#slurm) | `17.02.1-2` | `MAVIS_SCHEDULER=SLURM` | - -Users requiring support for other schedulers may make a request by -[submitting an issue to our github -page](https://github.com/bcgsc/mavis/issues). Additionally, developers -looking to extend the functionality may submit a pull request (Please -see the -[guidelines for contributors](../../development/) - -MAVIS running locally uses the python -`concurrent.futures` library to manage -jobs. -## +MAVIS v3 uses [snakemake](https://snakemake.readthedocs.io/en/stable/) to handle job scheduling ## Aligners diff --git a/docs/tutorials/mini.md b/docs/tutorials/mini.md index 929a3dad..bb7f00f2 100644 --- a/docs/tutorials/mini.md +++ b/docs/tutorials/mini.md @@ -14,112 +14,24 @@ installed ```bash git clone https://github.com/bcgsc/mavis.git -git checkout v2.0.0 +git checkout mv mavis/tests . rm -r mavis ``` -Now you should have a folder called `tests` in your current directory. -You will need to specify the scheduler if you want to test one that is -not the default. For example - -```bash -export MAVIS_SCHEDULER=LOCAL -``` - -Since this is a trivial example, it can easily be run locally. By -default MAVIS in local mode will run a maximum of 1 less than the -current cpu count processes. If you are running other things on the same -machine you may find it useful to set this directly. 
- -```bash -export MAVIS_CONCURRENCY_LIMIT=2 -``` - -The above will limit mavis to running 2 processes concurrently. - -Now you are ready to run MAVIS itself. This can be done in two commands -(since the config file we are going to use is already built). First set -up the pipeline - -```bash -mavis setup tests/data/pipeline_config.cfg -o output_dir -``` - -Now if you run the schedule step (without the submit flag, schedule acts -as a checker) you should see something like - -```bash -mavis schedule -o output_dir/ -``` +Now you should have a folder called `tests` in your current directory. Since this is a trivial +example, it can easily be run locally. However in order to run the snakemake file you will need +to have a copy of the config schema definition file which is included in MAVIS by default. ```text - MAVIS: 1.8.4 - hostname: gphost08.bcgsc.ca -[2018-06-01 12:19:31] arguments - command = 'schedule' - log = None - log_level = 'INFO' - output = 'output_dir/' - resubmit = False - submit = False -[2018-06-01 12:19:31] validate - MV_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-1 is NOT SUBMITTED - MV_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-2 is NOT SUBMITTED - MV_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-1 is NOT SUBMITTED - MV_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-2 is NOT SUBMITTED - MV_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-3 is NOT SUBMITTED -[2018-06-01 12:19:31] annotate - MA_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-1 is NOT SUBMITTED - MA_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-2 is NOT SUBMITTED - MA_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-1 is NOT SUBMITTED - MA_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-2 is NOT SUBMITTED - MA_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-3 is NOT SUBMITTED -[2018-06-01 12:19:31] pairing - MP_batch-s4W2Go4tinn49nkhSuusrE is NOT SUBMITTED -[2018-06-01 12:19:31] summary - MS_batch-s4W2Go4tinn49nkhSuusrE is NOT SUBMITTED - rewriting: output_dir/build.cfg +mavis/schemas/config.json ``` -Adding the submit argument will start the 
pipeline +Now you are ready to run MAVIS. This can be done in a single command using snakemake. ```bash -mavis schedule -o output_dir/ --submit -``` - -After this completes, run schedule without the submit flag again and you -should see something like - -```text - MAVIS: 1.8.4 - hostname: gphost08.bcgsc.ca -[2018-06-01 13:15:28] arguments - command = 'schedule' - log = None - log_level = 'INFO' - output = 'output_dir/' - resubmit = False - submit = False -[2018-06-01 13:15:28] validate - MV_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-1 (zQJYndSMimaoALwcSSiYwi) is COMPLETED - MV_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-2 (BHFVf3BmXVrDUA5X4GGSki) is COMPLETED - MV_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-1 (tUpx3iabCrpR9iKu9rJtES) is COMPLETED - MV_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-2 (hgmH7nqPXZ49a8yTsxSUWZ) is COMPLETED - MV_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-3 (cEoRN582An3eAGALaSKmpJ) is COMPLETED -[2018-06-01 13:15:28] annotate - MA_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-1 (tMHiVR8ueNokhBDnghXYo6) is COMPLETED - MA_mock-A36971_batch-s4W2Go4tinn49nkhSuusrE-2 (AsNpNdvUyhNtKmRZqRSPpR) is COMPLETED - MA_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-1 (k7qQiAzxfC2dnZwsGH7BzD) is COMPLETED - MA_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-2 (dqAuhhcVKejDvHGBXn22xb) is COMPLETED - MA_mock-A47933_batch-s4W2Go4tinn49nkhSuusrE-3 (eB69Ghed2xAdp2VRdaCJBf) is COMPLETED -[2018-06-01 13:15:28] pairing - MP_batch-s4W2Go4tinn49nkhSuusrE (6LfEgBtBsmGhQpLQp9rXmi) is COMPLETED -[2018-06-01 13:15:28] summary - MS_batch-s4W2Go4tinn49nkhSuusrE (HDJhXgKjRmseahcQ7mgNoD) is COMPLETED - rewriting: output_dir/build.cfg - run time (hh/mm/ss): 0:00:00 - run time (s): 0 +snakemake -j 1 --configfile tests/mini-tutorial.config.json ``` -If you see the above, then MAVIS has completed correctly! 
+Which will run the mini tutorial version and output files into a folder called `output_dir` in the +current directory diff --git a/mavis/annotate/file_io.py b/mavis/annotate/file_io.py index fb7ec50d..2683806f 100644 --- a/mavis/annotate/file_io.py +++ b/mavis/annotate/file_io.py @@ -2,20 +2,19 @@ module which holds all functions relating to loading reference files """ import json +import os import re import warnings -import os -from Bio import SeqIO import tab +from Bio import SeqIO -from .base import BioInterval, ReferenceName -from .genomic import Exon, Gene, Template, Transcript, PreTranscript -from .protein import Domain, Translation from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, STRAND, translate from ..interval import Interval -from ..util import DEVNULL, LOG, filepath, WeakMavisNamespace - +from ..util import DEVNULL, LOG, WeakMavisNamespace, filepath +from .base import BioInterval, ReferenceName +from .genomic import Exon, Gene, PreTranscript, Template, Transcript +from .protein import Domain, Translation REFERENCE_DEFAULTS = WeakMavisNamespace() REFERENCE_DEFAULTS.add( @@ -552,3 +551,7 @@ def load(self, ignore_cache=False, verbose=True): message = 'Error in loading files: {}. 
{}'.format(', '.join(self.name), err) raise err.__class__(message) return self + + @classmethod + def load_from_config(cls, config, file_type: str, **kwargs): + return ReferenceFile(file_type, *config[f'reference.{file_type}'], **kwargs) diff --git a/mavis/annotate/main.py b/mavis/annotate/main.py index b968b85d..61c7fd57 100644 --- a/mavis/annotate/main.py +++ b/mavis/annotate/main.py @@ -1,29 +1,27 @@ +import hashlib import json import os -import re import time -import warnings -import hashlib +from typing import Dict, List -from .constants import DEFAULTS, PASS_FILENAME +from ..constants import COLUMNS, PRIME, PROTOCOL, sort_columns +from ..error import DrawingFitError, NotSpecifiedError +from ..illustrate.constants import DEFAULTS as ILLUSTRATION_DEFAULTS +from ..illustrate.constants import DiagramSettings +from ..illustrate.diagram import draw_sv_summary_diagram +from ..util import LOG, generate_complete_stamp, mkdirp, read_inputs +from .constants import PASS_FILENAME +from .file_io import ReferenceFile +from .fusion import determine_prime from .genomic import PreTranscript from .variant import ( annotate_events, + call_protein_indel, choose_more_annotated, choose_transcripts_by_priority, - call_protein_indel, flatten_fusion_transcript, flatten_fusion_translation, ) -from .fusion import determine_prime -from ..cluster.constants import DEFAULTS as CLUSTER_DEFAULTS -from ..constants import COLUMNS, PRIME, PROTOCOL, sort_columns -from ..error import DrawingFitError, NotSpecifiedError -from ..illustrate.constants import DEFAULTS as ILLUSTRATION_DEFAULTS -from ..illustrate.constants import DiagramSettings -from ..illustrate.diagram import draw_sv_summary_diagram -from ..util import LOG, mkdirp, read_inputs - ACCEPTED_FILTERS = { 'choose_more_annotated': choose_more_annotated, @@ -114,54 +112,38 @@ def draw(drawing_config, ann, reference_genome, template_metadata, drawings_dire def main( - inputs, - output, - library, - protocol, - reference_genome, - annotations, - 
template_metadata, - min_domain_mapping_match=DEFAULTS.min_domain_mapping_match, - min_orf_size=DEFAULTS.min_orf_size, - max_orf_cap=DEFAULTS.max_orf_cap, - annotation_filters=DEFAULTS.annotation_filters, + inputs: List[str], + output: str, + library: str, + config: Dict, start_time=int(time.time()), - draw_fusions_only=DEFAULTS.draw_fusions_only, - draw_non_synonymous_cdna_only=DEFAULTS.draw_non_synonymous_cdna_only, - max_proximity=CLUSTER_DEFAULTS.max_proximity, **kwargs ): """ Args: inputs (List[str]): list of input files to read output (str): path to the output directory - reference_genome (mavis.annotate.file_io.ReferenceFile): see :func:`mavis.annotate.file_io.load_reference_genome` - annotations (mavis.annotate.file_io.ReferenceFile): see :func:`mavis.annotate.file_io.load_reference_genes` - template_metadata (mavis.annotate.file_io.ReferenceFile): see :func:`mavis.annotate.file_io.load_templates` - min_domain_mapping_match (float): min mapping match percent (0-1) to count a domain as mapped - min_orf_size (int): minimum size of an [open reading frame](/glossary/#open-reading-frame) to keep as a putative translation - max_orf_cap (int): the maximum number of [open reading frame](/glossary/#open-reading-frame) s to collect for any given event """ - # error early on missing input files - annotations.files_exist() - reference_genome.files_exist() - template_metadata.files_exist() - if not template_metadata.is_loaded(): - template_metadata.load() + reference_genome = ReferenceFile.load_from_config(config, 'reference_genome') + annotations = ReferenceFile.load_from_config(config, 'annotations') + template_metadata = ReferenceFile.load_from_config(config, 'template_metadata', eager_load=True) drawings_directory = os.path.join(output, 'drawings') tabbed_output_file = os.path.join(output, PASS_FILENAME) fa_output_file = os.path.join(output, 'annotations.fusion-cdna.fa') - annotation_filters = [] if not annotation_filters else annotation_filters.split(',') - 
annotation_filters = [ACCEPTED_FILTERS[a] for a in annotation_filters] + annotation_filters = [ACCEPTED_FILTERS[a] for a in config['annotate.annotation_filters']] mkdirp(drawings_directory) # test that the sequence makes sense for a random transcript bpps = read_inputs( inputs, in_={COLUMNS.protocol: PROTOCOL.values()}, - add_default={COLUMNS.protocol: protocol, COLUMNS.library: library, COLUMNS.stranded: False}, + add_default={ + COLUMNS.protocol: config['libraries'][library]['protocol'], + COLUMNS.library: library, + COLUMNS.stranded: False, + }, require=[COLUMNS.protocol, COLUMNS.library], expand_strand=False, expand_orient=True, @@ -171,14 +153,15 @@ def main( annotations.load() reference_genome.load() + annotated_events = annotate_events( bpps, reference_genome=reference_genome.content, annotations=annotations.content, - min_orf_size=min_orf_size, - min_domain_mapping_match=min_domain_mapping_match, - max_proximity=max_proximity, - max_orf_cap=max_orf_cap, + min_orf_size=config['annotate.min_orf_size'], + min_domain_mapping_match=config['annotate.min_domain_mapping_match'], + max_proximity=config['cluster.max_proximity'], + max_orf_cap=config['annotate.max_orf_cap'], log=LOG, filters=annotation_filters, ) @@ -288,9 +271,11 @@ def main( # draw the annotation and add the path to all applicable rows (one drawing for multiple annotated_events) if any( [ - not ann.fusion and not draw_fusions_only, - ann.fusion and not draw_non_synonymous_cdna_only, - ann.fusion and draw_non_synonymous_cdna_only and not cdna_synon_all, + not ann.fusion and not config['annotate.draw_fusions_only'], + ann.fusion and not config['annotate.draw_non_synonymous_cdna_only'], + ann.fusion + and config['annotate.draw_non_synonymous_cdna_only'] + and not cdna_synon_all, ] ): drawing, legend = draw( @@ -307,6 +292,7 @@ def main( rows = [ann_row] for row in rows: tabbed_fh.write('\t'.join([str(row.get(k, None)) for k in header]) + '\n') + generate_complete_stamp(output, LOG, 
start_time=start_time) finally: LOG('closing:', tabbed_output_file) tabbed_fh.close() diff --git a/mavis/cluster/main.py b/mavis/cluster/main.py index c1628ea6..3ea459fd 100644 --- a/mavis/cluster/main.py +++ b/mavis/cluster/main.py @@ -1,27 +1,34 @@ -import inspect import itertools import os -from shortuuid import uuid import time +from typing import Dict, List -from .cluster import merge_breakpoint_pairs -from .constants import DEFAULTS -from ..constants import COLUMNS +from shortuuid import uuid + +from ..annotate.file_io import ReferenceFile +from ..breakpoint import BreakpointPair +from ..constants import COLUMNS, SUBCOMMAND from ..util import ( + LOG, filter_on_overlap, filter_uninformative, generate_complete_stamp, - LOG, - log_arguments, mkdirp, output_tabbed_file, read_inputs, write_bed_file, ) +from .cluster import merge_breakpoint_pairs + +SECTION = SUBCOMMAND.CLUSTER def split_clusters( - clusters, outputdir, batch_id, min_clusters_per_file=0, max_files=1, write_bed_summary=True + clusters: List[BreakpointPair], + outputdir: str, + total_batches: int, + min_clusters_per_file: int = 0, + write_bed_summary: bool = True, ): """ For a set of clusters creates a bed file representation of all clusters. 
@@ -36,13 +43,7 @@ def split_clusters( bedfile, itertools.chain.from_iterable([b.get_bed_repesentation() for b in clusters]) ) - number_of_jobs = len(clusters) // min_clusters_per_file - if number_of_jobs > max_files: - number_of_jobs = max_files - elif number_of_jobs == 0: - number_of_jobs = 1 - - jobs = [[] for j in range(0, number_of_jobs)] + jobs: List[List[BreakpointPair]] = [[] for j in range(0, total_batches)] clusters = sorted( clusters, key=lambda x: (x.break1.chr, x.break1.start, x.break2.chr, x.break2.start) ) @@ -55,70 +56,57 @@ def split_clusters( output_files = [] for i, job in enumerate(jobs): # generate an output file - filename = os.path.join(outputdir, '{}-{}.tab'.format(batch_id, i + 1)) + filename = os.path.join(outputdir, 'batch-{}.tab'.format(i + 1)) output_files.append(filename) output_tabbed_file(job, filename) return output_files def main( - inputs, - output, - strand_specific, - library, - protocol, - disease_status, - masking, - annotations, - limit_to_chr=DEFAULTS.limit_to_chr, - cluster_initial_size_limit=DEFAULTS.cluster_initial_size_limit, - cluster_radius=DEFAULTS.cluster_radius, - uninformative_filter=DEFAULTS.uninformative_filter, - max_proximity=DEFAULTS.max_proximity, - min_clusters_per_file=DEFAULTS.min_clusters_per_file, - max_files=DEFAULTS.max_files, - batch_id=None, - split_only=False, + inputs: List[str], + output: str, + library: str, + config: Dict, start_time=int(time.time()), - **kwargs + **kwargs, ): """ Args: - inputs (List[str]): list of input files to read - output (str): path to the output directory - strand_specific (bool): is the bam using a strand specific protocol - library (str): the library to look for in each of the input files - protocol (PROTOCOL): the sequence protocol (genome or transcriptome) - masking (object): see :func:`mavis.annotate.file_io.load_masking_regions` - cluster_clique_size (int): the maximum size of cliques to search for using the exact algorithm - cluster_radius (int): distance (in 
breakpoint pairs) used in deciding to join bpps in a cluster - uninformative_filter (bool): if True then clusters should be filtered out if they are not - within a specified (max_proximity) distance to any annotation - max_proximity (int): the maximum distance away an annotation can be before the uninformative_filter - is applied + inputs: list of input files to read + output: path to the output directory + library: the library to look for in each of the input files + masking (ReferenceFile): see :func:`mavis.annotate.file_io.load_masking_regions` annotations (ReferenceFile): see :func:`mavis.annotate.file_io.load_reference_genes` - min_clusters_per_file (int): the minimum number of clusters to output to a file - max_files (int): the maximum number of files to split clusters into """ - if uninformative_filter: + masking = ReferenceFile.load_from_config(config, 'masking', eager_load=True) + annotations = ReferenceFile.load_from_config(config, 'annotations') + + if config[f'{SECTION}.uninformative_filter'] and not annotations.is_empty(): annotations.load() - if masking: + if not masking.is_empty(): masking.load() + lib_config = config['libraries'][library] + # output files - batch_id = 'batch-' + str(uuid()) if batch_id is None else batch_id filtered_output = os.path.join(output, 'filtered_pairs.tab') cluster_assign_output = os.path.join(output, 'cluster_assignment.tab') # load the input files breakpoint_pairs = read_inputs( inputs, - cast={COLUMNS.tools: lambda x: set(x.split(';')) if x else set() if not split_only else x}, + cast={ + COLUMNS.tools: lambda x: set(x.split(';')) + if x + else set() + if not config[f'{SECTION}.split_only'] + else x + }, add_default={ COLUMNS.library: library, - COLUMNS.protocol: protocol, + COLUMNS.protocol: lib_config['protocol'], COLUMNS.tools: '', - COLUMNS.disease_status: disease_status, + COLUMNS.disease_status: lib_config['disease_status'], COLUMNS.stranded: False, COLUMNS.tracking_id: '', }, @@ -139,15 +127,17 @@ def main( 
other_libs.add(bpp.library) bpp.data[COLUMNS.filter_comment] = 'Not the target library name' filtered_pairs.append(bpp) - elif None in limit_to_chr or ( - bpp.break1.chr in limit_to_chr and bpp.break2.chr in limit_to_chr + elif not config[f'{SECTION}.limit_to_chr'] or ( + bpp.break1.chr in config[f'{SECTION}.limit_to_chr'] + and bpp.break2.chr in config[f'{SECTION}.limit_to_chr'] ): unfiltered_breakpoint_pairs.append(bpp) else: other_chr.update({bpp.break1.chr, bpp.break2.chr}) bpp.data[COLUMNS.filter_comment] = 'Non standard chromosome name' filtered_pairs.append(bpp) - other_chr -= set(limit_to_chr) + if config[f'{SECTION}.limit_to_chr']: + other_chr -= set(config[f'{SECTION}.limit_to_chr']) breakpoint_pairs = unfiltered_breakpoint_pairs if other_libs: LOG( @@ -161,10 +151,10 @@ def main( for bpp in masked_pairs: filtered_pairs.append(bpp) # filter by informative - if uninformative_filter: + if config[f'{SECTION}.uninformative_filter']: LOG('filtering from', len(breakpoint_pairs), 'breakpoint pairs using informative filter') pass_clusters, uninformative_clusters = filter_uninformative( - annotations.content, breakpoint_pairs, max_proximity=max_proximity + annotations.content, breakpoint_pairs, max_proximity=config[f'{SECTION}.max_proximity'] ) LOG( 'filtered from', @@ -180,19 +170,20 @@ def main( else: LOG('did not apply uninformative filter') - output_tabbed_file(filtered_pairs, filtered_output) mkdirp(output) + output_tabbed_file(filtered_pairs, filtered_output) - if not split_only: + if not config[f'{SECTION}.split_only']: LOG('computing clusters') clusters = merge_breakpoint_pairs( breakpoint_pairs, - cluster_radius=cluster_radius, - cluster_initial_size_limit=cluster_initial_size_limit, + cluster_radius=config[f'{SECTION}.cluster_radius'], + cluster_initial_size_limit=config[f'{SECTION}.cluster_initial_size_limit'], ) - hist = {} - length_hist = {} + hist: Dict[int, int] = {} + length_hist: Dict[float, int] = {} + for cluster in clusters: input_pairs = 
clusters[cluster] hist[len(input_pairs)] = hist.get(len(input_pairs), 0) + 1 @@ -242,11 +233,10 @@ def main( output_files = split_clusters( breakpoint_pairs, output, - batch_id, - min_clusters_per_file=min_clusters_per_file, - max_files=max_files, + total_batches=lib_config['total_batches'], + min_clusters_per_file=config[f'{SECTION}.min_clusters_per_file'], write_bed_summary=True, ) - generate_complete_stamp(output, LOG, start_time=start_time, prefix='MAVIS-{}.'.format(batch_id)) + generate_complete_stamp(output, LOG, start_time=start_time) return output_files diff --git a/mavis/config.py b/mavis/config.py index bfeb7a52..f38483bb 100644 --- a/mavis/config.py +++ b/mavis/config.py @@ -1,39 +1,16 @@ import argparse -from configparser import ConfigParser, ExtendedInterpolation -from copy import copy as _copy -import logging import os -import re -import sys -import warnings +from copy import copy as _copy +from typing import Dict, Optional +import snakemake import tab -from . import __version__ -from .align import SUPPORTED_ALIGNER -from .annotate.constants import DEFAULTS as ANNOTATION_DEFAULTS -from .annotate.file_io import REFERENCE_DEFAULTS -from .bam.cache import BamCache +from .annotate.file_io import ReferenceFile from .bam import stats -from .cluster.constants import DEFAULTS as CLUSTER_DEFAULTS -from .constants import DISEASE_STATUS, SUBCOMMAND, PROTOCOL, float_fraction -from .illustrate.constants import DEFAULTS as ILLUSTRATION_DEFAULTS -from .pairing.constants import DEFAULTS as PAIRING_DEFAULTS -from .schedule.constants import OPTIONS as SUBMIT_OPTIONS -from .schedule.constants import SCHEDULER -from .summary.constants import DEFAULTS as SUMMARY_DEFAULTS -from .tools import SUPPORTED_TOOL -from .util import ( - bash_expands, - cast, - DEVNULL, - MavisNamespace, - WeakMavisNamespace, - filepath, - NullableType, -) -from .validate.constants import DEFAULTS as VALIDATION_DEFAULTS - +from .bam.cache import BamCache +from .constants import PROTOCOL, 
SUBCOMMAND, float_fraction +from .util import WeakMavisNamespace, bash_expands, filepath CONVERT_OPTIONS = WeakMavisNamespace() CONVERT_OPTIONS.add( @@ -43,6 +20,49 @@ ) +def calculate_bam_stats(config: Dict, library_name: str) -> Dict: + """ + Calculate the read stats for a library from a given bam file + """ + library = config['libraries'][library_name] + annotations = ReferenceFile('annotations', *config['reference.annotations']) + + if library['protocol'] == PROTOCOL.TRANS: + if annotations is None or annotations.is_empty(): + raise AttributeError( + 'missing required attribute: annotations. Annotations must be given for transcriptomes' + ) + annotations.load() + bam = BamCache(library['bam_file'], stranded=library['strand_specific']) + if library['protocol'] == PROTOCOL.TRANS: + bam_stats = stats.compute_transcriptome_bam_stats( + bam, + annotations=annotations.content, + sample_size=config['bam_stats.sample_size'], + sample_cap=config['bam_stats.sample_cap'], + distribution_fraction=config['bam_stats.distribution_fraction'], + ) + return { + 'median_fragment_size': int(bam_stats.median_fragment_size), + 'read_length': int(bam_stats.read_length), + 'stdev_fragment_size': int(bam_stats.stdev_fragment_size), + 'strand_specific': bam_stats.stranded, + 'strand_determining_read': bam_stats.strand_determining_read, + } + bam_stats = stats.compute_genome_bam_stats( + bam, + sample_size=config['bam_stats.sample_size'], + sample_bin_size=config['bam_stats.sample_bin_size'], + sample_cap=config['bam_stats.sample_cap'], + distribution_fraction=config['bam_stats.distribution_fraction'], + ) + return { + 'median_fragment_size': int(bam_stats.median_fragment_size), + 'read_length': int(bam_stats.read_length), + 'stdev_fragment_size': int(bam_stats.stdev_fragment_size), + } + + class CustomHelpFormatter(argparse.ArgumentDefaultsHelpFormatter): """ subclass the default help formatter to stop default printing for required arguments @@ -96,385 +116,89 @@ def __call__(self, 
parser, namespace, values, option_string=None): setattr(namespace, self.dest, items) -class LibraryConfig(MavisNamespace): +def validate_config(config: Dict, bam_stats: Optional[bool] = False, stage: str = '') -> None: """ - holds library specific configuration information + Check that the input JSON config conforms to the expected schema as well + as the other relevant checks such as file exsts """ + schema = 'config' if stage != SUBCOMMAND.OVERLAY else 'overlay' - def __init__( - self, - library, - protocol, - disease_status, - bam_file=None, - inputs=None, - read_length=None, - median_fragment_size=None, - stdev_fragment_size=None, - strand_specific=False, - strand_determining_read=2, - **kwargs - ): - MavisNamespace.__init__(self) - self.library = library - self.protocol = PROTOCOL.enforce(protocol) - self.bam_file = bam_file - self.read_length = NullableType(int)(read_length) - self.median_fragment_size = NullableType(int)(median_fragment_size) - self.stdev_fragment_size = NullableType(int)(stdev_fragment_size) - self.strand_specific = cast(strand_specific, bool) - self.strand_determining_read = int(strand_determining_read) - self.disease_status = DISEASE_STATUS.enforce(disease_status) - try: - self.inputs = [f for f in re.split(r'[;\s]+', inputs) if f] - except TypeError: - self.inputs = inputs if inputs is not None else [] - - for attr, value in kwargs.items(): - for namespace in [CLUSTER_DEFAULTS, VALIDATION_DEFAULTS, ANNOTATION_DEFAULTS]: - if attr not in namespace: - continue - self.add( - attr, - value, - listable=namespace.is_listable(attr), - nullable=namespace.is_nullable(attr), - cast_type=namespace.type(attr), - ) - break - - def flatten(self): - result = MavisNamespace.items(self) - result['inputs'] = '\n'.join(result['inputs']) - return result - - def is_trans(self): - return True if self.protocol == PROTOCOL.TRANS else False - - @staticmethod - def build( - library, - protocol, - bam_file, - inputs, - annotations=None, - log=DEVNULL, - 
distribution_fraction=0.98, - sample_cap=3000, - sample_bin_size=1000, - sample_size=500, - **kwargs - ): - """ - Builds a library config section and gathers the bam stats - """ - PROTOCOL.enforce(protocol) - - if protocol == PROTOCOL.TRANS: - if annotations is None or annotations.is_empty(): - raise AttributeError( - 'missing required attribute: annotations. Annotations must be given for transcriptomes' - ) - annotations.load() - bam = BamCache(bam_file) - if protocol == PROTOCOL.TRANS: - bamstats = stats.compute_transcriptome_bam_stats( - bam, - annotations=annotations.content, - sample_size=sample_size, - sample_cap=sample_cap, - distribution_fraction=distribution_fraction, - ) - elif protocol == PROTOCOL.GENOME: - bamstats = stats.compute_genome_bam_stats( - bam, - sample_size=sample_size, - sample_bin_size=sample_bin_size, - sample_cap=sample_cap, - distribution_fraction=distribution_fraction, - ) - else: - raise ValueError('unrecognized value for protocol', protocol) - log(bamstats) - - return LibraryConfig( - library=library, - protocol=protocol, - bam_file=bam_file, - inputs=inputs, - median_fragment_size=bamstats.median_fragment_size, - stdev_fragment_size=bamstats.stdev_fragment_size, - read_length=bamstats.read_length, - strand_determining_read=bamstats.strand_determining_read, - **kwargs - ) - - @classmethod - def parse_args(cls, *args): - # '', '(genome|transcriptome)', '', '[strand_specific]', '[/path/to/bam/file]' - if len(args) < 4: - return LibraryConfig(args[0], protocol=args[1], disease_status=args[2]) - elif len(args) < 5: - return LibraryConfig( - args[0], protocol=args[1], disease_status=args[2], strand_specific=args[3] - ) - return LibraryConfig( - args[0], - protocol=args[1], - disease_status=args[2], - strand_specific=args[3], - bam_file=args[4], + try: + snakemake.utils.validate( + config, os.path.join(os.path.dirname(__file__), f'schemas/{schema}.json') ) + except Exception as err: + short_msg = '. 
'.join( + [line for line in str(err).split('\n') if line.strip()][:3] + ) # these can get super long + raise snakemake.WorkflowError(short_msg) + + required = [] + if ( + stage not in {SUBCOMMAND.CONVERT} + or stage == SUBCOMMAND.CLUSTER + and not config['cluster.uninformative_filter'] + ): + required.append('reference.annotations') + + if stage == SUBCOMMAND.VALIDATE: + required.extend(['reference.aligner_reference', 'reference.reference_genome']) + + for req in required: + if req not in config: + raise snakemake.WorkflowError(f'missing required property: {req}') + + if schema == 'config': + conversion_dir = os.path.join(config['output_dir'], 'converted_outputs') + # check all assignments are conversions aliases or existing files + for libname, library in config['libraries'].items(): + assignments = [] + for i, assignment in enumerate(library['assign']): + if assignment in config.get('convert', {}): + # replace the alias with the expected output path + converted_output = os.path.join(conversion_dir, f'{assignment}.tab') + assignments.append(converted_output) + elif ( + not os.path.exists(assignment) and os.path.dirname(assignment) != conversion_dir + ): + raise FileNotFoundError(f'cannot find the expected input file {assignment}') + else: + assignments.append(assignment) + library['assign'] = assignments - -class MavisConfig(MavisNamespace): - def __init__(self, **kwargs): - # section can be named schedule or qsub to support older versions - MavisNamespace.__init__(self) - try: - content = validate_section( - kwargs.pop('schedule', kwargs.pop('qsub', {})), SUBMIT_OPTIONS, True - ) - self.schedule = content - except Exception as err: - err.args = [ - 'Error in validating the schedule section in the config. 
' - + ' '.join([str(a) for a in err.args]) - ] - raise err - - # set the global defaults - for sec, defaults in [ - ('pairing', PAIRING_DEFAULTS), - ('summary', SUMMARY_DEFAULTS), - ('validate', VALIDATION_DEFAULTS), - ('annotate', ANNOTATION_DEFAULTS), - ('illustrate', ILLUSTRATION_DEFAULTS), - ('cluster', CLUSTER_DEFAULTS), - ('reference', REFERENCE_DEFAULTS), - ]: - try: - self[sec] = validate_section(kwargs.pop(sec, {}), defaults, True) - except Exception as err: - err.args = [ - 'Error in validating the {} section in the config. '.format(sec) - + ' '.join([str(a) for a in err.args]) - ] - - raise err - - SUPPORTED_ALIGNER.enforce(self.validate.aligner) - for attr, fnames in self.reference.items(): - if attr != 'aligner_reference': - self.reference[attr] = [f for f in [NullableType(filepath)(v) for v in fnames] if f] - if not self.reference[attr] and attr not in { - 'dgv_annotation', - 'masking', - 'template_metadata', + if not config['skip_stage.validate'] and stage in { + SUBCOMMAND.VALIDATE, + SUBCOMMAND.SETUP, }: - raise FileNotFoundError( - 'Error in validating the convert section of the config for tag={}. ' - 'Required reference file does not exist'.format(attr) - ) - - # set the conversion section - self.convert = kwargs.pop('convert', {}) - for attr, val in self.convert.items(): - if attr in CONVERT_OPTIONS: - self.convert[attr] = CONVERT_OPTIONS.type(attr)(val) - continue - val = [v for v in re.split(r'[;\s]+', val) if v] - if not val: - raise UserWarning( - 'Error in validating convert section of the config for tag={}. Tag requires arguments'.format( - attr - ) - ) - if val[0] == 'convert_tool_output': - try: - val[-1] = tab.cast_boolean(val[-1]) - except TypeError: - val.append(False) - if len(val) < 4 or val[-2] not in SUPPORTED_TOOL.values(): - raise UserWarning( - 'Error in validating the convert section of the config for tag={}. 
'.format( - attr - ), - 'Conversion using the built-in convert_tool_output requires specifying the input file(s) and ' - 'tool name. Currently supported tools include:', - SUPPORTED_TOOL.values(), - 'given', - val, - ) - expanded_inputs = [] - for file_expr in val[1:-2]: - expanded = bash_expands(file_expr) - if not expanded: - raise FileNotFoundError( - 'Error in validating the config for tag={}. ' - 'Input file(s) do not exist'.format(attr), - val[1:-2], - ) - expanded_inputs.extend(expanded) - val = [val[0]] + expanded_inputs + val[-2:] - self.convert[attr] = val - self.convert = MavisNamespace(**self.convert) - - # now add the library specific sections - self.libraries = {} - - for libname, val in kwargs.items(): # all other sections already popped - libname = nameable_string(libname) - d = {} - d.update(self.cluster.items()) - d.update(self.validate.items()) - d.update(self.annotate.items()) - d.update(val) - d['library'] = libname - val['library'] = libname - self.libraries[libname] = LibraryConfig(**val) - # now try building the LibraryConfig object - try: - lc = LibraryConfig(**d) - self.libraries[libname] = lc - except TypeError as terr: # missing required argument - try: - lc = LibraryConfig.build(**d) - self.libraries[libname] = lc - except Exception as err: - raise UserWarning( - 'Error in validating the library section of the config.', libname, err, terr - ) - for inputfile in lc.inputs: - if inputfile not in self.convert and not os.path.exists(inputfile): + if not library.get('bam_file', None) or not os.path.exists(library['bam_file']): raise FileNotFoundError( - 'Error in validating the library section of the config. 
Input file does not exist', - libname, - inputfile, + f'missing bam file for library ({libname}), it is a required input when the validate stage is not skipped' ) - - def has_transcriptome(self): - return any([lib.is_trans() for lib in self.libraries.values()]) - - @staticmethod - def read(filepath): - """ - reads the configuration settings from the configuration file - - Args: - filepath (str): path to the input configuration file - - Returns: - List[Namespace]: namespace arguments for each library - """ - if not os.path.exists(filepath): - raise FileNotFoundError('File does not exist: {}'.format(filepath)) - parser = ConfigParser(interpolation=ExtendedInterpolation()) - parser.read(filepath) - config_dict = {} - - # get the library sections and add the default settings - for sec in parser.sections(): - config_dict.setdefault(sec, {}).update(parser[sec].items()) - return MavisConfig(**config_dict) - - -def write_config(filename, include_defaults=False, libraries=[], conversions={}, log=DEVNULL): - """ - Args: - filename (str): path to the output file - include_defaults (bool): True if default parameters should be written to the config, False otherwise - libraries (List[LibraryConfig]): library configuration sections - conversions (Dict[str,List]): conversion commands by alias name - log (Callable): function to pass output logging to - """ - config = {} - - config['reference'] = REFERENCE_DEFAULTS.to_dict() - for filetype, fname in REFERENCE_DEFAULTS.items(): - if fname is None: - warnings.warn( - 'filetype {} has not been set. 
This must be done manually before the configuration file is used'.format( - filetype - ) - ) - - if libraries: - for lib in libraries: - config[lib.library] = lib.to_dict() - - if include_defaults: - config['schedule'] = SUBMIT_OPTIONS.to_dict() - config['validate'] = VALIDATION_DEFAULTS.to_dict() - config['cluster'] = CLUSTER_DEFAULTS.to_dict() - config['annotate'] = ANNOTATION_DEFAULTS.to_dict() - config['illustrate'] = ILLUSTRATION_DEFAULTS.to_dict() - config['summary'] = SUMMARY_DEFAULTS.to_dict() - - config['convert'] = CONVERT_OPTIONS.to_dict() - for alias, command in conversions.items(): - if alias in CONVERT_OPTIONS: - raise UserWarning( - 'error in writing config. Alias for conversion product cannot be a setting', - alias, - CONVERT_OPTIONS.keys(), - ) - config['convert'][alias] = '\n'.join(command) - - for sec in config: - for tag, value in config[sec].items(): - if '_regex_' in tag: - config[sec][tag] = re.sub(r'\$', '$$', config[sec][tag]) - continue - elif not isinstance(value, str): - try: - config[sec][tag] = '\n'.join([str(v) for v in value]) - continue - except TypeError: - pass - config[sec][tag] = str(value) - - conf = ConfigParser() - for sec in config: - conf[sec] = {} - for tag, val in config[sec].items(): - conf[sec][tag] = val - log('writing:', filename) - with open(filename, 'w') as configfile: - conf.write(configfile) - - -def validate_section(section, namespace, use_defaults=False): - """ - given a dictionary of values, returns a new dict with the values casted to their appropriate type or set - to a default if the value was not given - """ - new_namespace = MavisNamespace() - if use_defaults: - new_namespace.copy_from(namespace) - - for attr, value in section.items(): - if attr not in namespace: - raise KeyError('tag not recognized', attr) - else: - cast_type = namespace.type(attr) - if namespace.is_listable(attr): - value = MavisNamespace.parse_listable_string( - value, cast_type, namespace.is_nullable(attr) - ) - else: - value = 
cast_type(value) - try: - new_namespace.add( - attr, - value, - cast_type=cast_type, - listable=namespace.is_listable(attr), - nullable=namespace.is_nullable(attr), + # calculate the bam_stats if the have not been given + missing_stats = any( + [ + col not in library + for col in ['median_fragment_size', 'read_length', 'stdev_fragment_size'] + ] ) - except Exception as err: - raise ValueError('failed adding {}. {}'.format(attr, err)) - return new_namespace + if missing_stats and bam_stats: + library.update(calculate_bam_stats(config, libname)) + + # expand and check the input files exist for any conversions + for conversion in config.get('convert', {}).values(): + expanded = [] + for input_file in conversion['inputs']: + expanded.extend(bash_expands(input_file)) + conversion['inputs'] = expanded + + # make sure all the reference files specified exist and overload with environment variables where applicable + for ref_type in list(config.keys()): + if not ref_type.startswith('reference.'): + continue + expanded = [] + for input_file in config[ref_type]: + expanded.extend(bash_expands(input_file)) + config[ref_type] = expanded def get_metavar(arg_type): @@ -496,291 +220,5 @@ def get_metavar(arg_type): return None -def nameable_string(input_string): - """ - A string that can be used for library and/or filenames - """ - input_string = str(input_string) - if re.search(r'[;,_\s]', input_string): - raise TypeError('names cannot contain the reserved characters [;,_\\s]', input_string) - if input_string.lower() == 'none': - raise TypeError('names cannot be none', input_string) - if not input_string: - raise TypeError('names cannot be an empty string', input_string) - if not re.search(r'^[a-zA-Z]', input_string): - raise TypeError('names must start with a letter', input_string) - return input_string - - -def augment_parser(arguments, parser, required=None): - """ - Adds options to the argument parser. 
Separate function to facilitate the pipeline steps - all having a similar look/feel - """ - if required is None: - try: - required = bool(parser.title.startswith('required')) - except AttributeError: - pass - - for arg in arguments: - - if arg == 'help': - parser.add_argument( - '-h', '--help', action='help', help='show this help message and exit' - ) - elif arg == 'version': - parser.add_argument( - '-v', - '--version', - action='version', - version='%(prog)s version ' + __version__, - help='Outputs the version number', - ) - elif arg == 'log': - parser.add_argument('--log', help='redirect stdout to a log file', default=None) - elif arg == 'log_level': - parser.add_argument( - '--log_level', - help='level of logging to output', - choices=['INFO', 'DEBUG'], - default='INFO', - ) - elif arg == 'aligner_reference': - default = REFERENCE_DEFAULTS[arg] - parser.add_argument( - '--{}'.format(arg), - default=default, - required=required if not default else False, - help=REFERENCE_DEFAULTS.define(arg), - type=filepath, - ) - elif arg in REFERENCE_DEFAULTS: - default = REFERENCE_DEFAULTS[arg] - parser.add_argument( - '--{}'.format(arg), - default=default, - required=required if not default else False, - help=REFERENCE_DEFAULTS.define(arg), - type=filepath if required else NullableType(filepath), - nargs='*', - ) - elif arg == 'config': - parser.add_argument('config', help='path to the config file', type=filepath) - elif arg == 'bam_file': - parser.add_argument( - '--bam_file', help='path to the input bam file', required=required, type=filepath - ) - elif arg == 'read_length': - parser.add_argument( - '--read_length', - type=int, - help='the length of the reads in the bam file', - required=required, - ) - elif arg == 'stdev_fragment_size': - parser.add_argument( - '--stdev_fragment_size', - type=int, - help='expected standard deviation in insert sizes', - required=required, - ) - elif arg == 'median_fragment_size': - parser.add_argument( - '--median_fragment_size', - 
type=int, - help='median inset size for pairs in the bam file', - required=required, - ) - elif arg == 'library': - parser.add_argument( - '--library', help='library name', required=required, type=nameable_string - ) - elif arg == 'protocol': - parser.add_argument( - '--protocol', choices=PROTOCOL.values(), help='library protocol', required=required - ) - elif arg == 'disease_status': - parser.add_argument( - '--disease_status', - choices=DISEASE_STATUS.values(), - help='library disease status', - required=required, - ) - elif arg == 'skip_stage': - parser.add_argument( - '--skip_stage', - choices=[SUBCOMMAND.CLUSTER, SUBCOMMAND.VALIDATE], - action='append', - default=[], - help='Use flag once per stage to skip. Can skip clustering or validation or both', - ) - elif arg == 'strand_specific': - parser.add_argument( - '--strand_specific', - type=tab.cast_boolean, - default=False, - help='indicates that the input is strand specific', - ) - else: - value_type = None - help_msg = None - default_value = None - choices = None - nargs = None - if arg == 'aligner': - choices = SUPPORTED_ALIGNER.values() - help_msg = 'aligner to use for aligning contigs' - if arg == 'uninformative_filter': - help_msg = 'If flag is False then the clusters will not be filtered based on lack of annotation' - if arg == 'scheduler': - choices = SCHEDULER.keys() - - # get default values - for nspace in [ - CLUSTER_DEFAULTS, - VALIDATION_DEFAULTS, - ANNOTATION_DEFAULTS, - ILLUSTRATION_DEFAULTS, - PAIRING_DEFAULTS, - SUMMARY_DEFAULTS, - SUBMIT_OPTIONS, - CONVERT_OPTIONS, - ]: - if arg in nspace: - default_value = nspace[arg] - if nspace.is_listable(arg): - nargs = '*' - value_type = nspace.type(arg, None) - if nspace.is_nullable(arg): - value_type = NullableType(value_type) - if not help_msg: - help_msg = nspace.define(arg) - break - - if help_msg is None: - raise KeyError('invalid argument', arg) - parser.add_argument( - '--{}'.format(arg), - choices=choices, - nargs=nargs, - help=help_msg, - 
required=required, - default=default_value, - type=value_type, - ) - - -def generate_config(args, parser, log=DEVNULL): - """ - Args: - parser (argparse.ArgumentParser): the main parser - required: the argparse required arguments group - optional: the argparse optional arguments group - """ - libs = [] - inputs_by_lib = {} - convert = {} - try: - # process the libraries by input argument (--input) - for libconf in [LibraryConfig.parse_args(*a) for a in args.library]: - if not libconf.bam_file and SUBCOMMAND.VALIDATE not in args.skip_stage: - raise KeyError( - 'argument --library: bam file must be given if validation is not being skipped' - ) - libs.append(libconf) - inputs_by_lib[libconf.library] = set() - if ( - SUBCOMMAND.VALIDATE not in args.skip_stage - and libconf.protocol == PROTOCOL.TRANS - and (not args.annotations or args.annotations.is_empty()) - ): - parser.error( - 'argument --annotations is required to build configuration files for transcriptome libraries' - ) - - for arg_list in args.input: - inputfile = arg_list[0] - for lib in arg_list[1:]: - if lib not in inputs_by_lib: - raise KeyError( - 'argument --input: specified a library that was not configured. Please input all libraries using ' - 'the --library flag', - lib, - ) - inputs_by_lib[lib].add(inputfile) - # process the inputs by library argument (--assign) - for arg_list in args.assign: - lib = arg_list[0] - if lib not in inputs_by_lib: - raise KeyError( - 'argument --assign: specified a library that was not configured. 
Please input all libraries using ' - 'the --library flag', - lib, - ) - inputs_by_lib[lib].update(arg_list[1:]) - - for libconf in libs: - if not inputs_by_lib[libconf.library]: - raise KeyError( - 'argument --input: no input was given for the library', libconf.library - ) - libconf.inputs = inputs_by_lib[libconf.library] - - for alias, command in args.external_conversion: - if alias in convert: - raise KeyError('duplicate alias names are not allowed', alias) - convert[alias] = [] - open_option = False - for item in re.split(r'\s+', command): - if convert[alias]: - if open_option: - convert[alias][-1] += ' ' + item - open_option = False - else: - convert[alias].append(item) - if item[0] == '-': - open_option = True - else: - convert[alias].append(item) - - for arg in args.convert: - # should follow the pattern: alias file [file...] toolname [stranded] - alias = arg[0] - if alias in convert: - raise KeyError('duplicate alias names are not allowed: {}'.format(alias)) - if arg[-1] in SUPPORTED_TOOL.values(): - toolname = arg[-1] - stranded = False - inputfiles = arg[1:-1] - else: - toolname, stranded = arg[-2:] - inputfiles = arg[1:-2] - if not inputfiles: - raise KeyError('argument --convert is missing input file path(s): {}'.format(arg)) - stranded = str(tab.cast_boolean(stranded)) - SUPPORTED_TOOL.enforce(toolname) - convert[alias] = ['convert_tool_output'] + inputfiles + [toolname, stranded] - except KeyError as err: - parser.error(' '.join(err.args)) - - if SUBCOMMAND.VALIDATE not in args.skip_stage: - for i, libconf in enumerate(libs): - log('generating the config section for:', libconf.library) - libs[i] = LibraryConfig.build( - library=libconf.library, - protocol=libconf.protocol, - bam_file=libconf.bam_file, - inputs=inputs_by_lib[libconf.library], - strand_specific=libconf.strand_specific, - disease_status=libconf.disease_status, - annotations=args.annotations, - log=log, - sample_size=args.genome_bins - if libconf.protocol == PROTOCOL.GENOME - else 
args.transcriptome_bins, - distribution_fraction=args.distribution_fraction, - ) - write_config( - args.write, include_defaults=args.add_defaults, libraries=libs, conversions=convert, log=log - ) +def get_by_prefix(config, prefix): + return {k.replace(prefix, ''): v for k, v in config.items() if k.startswith(prefix)} diff --git a/mavis/constants.py b/mavis/constants.py index 46a33890..72fccdf4 100644 --- a/mavis/constants.py +++ b/mavis/constants.py @@ -2,8 +2,8 @@ module responsible for small utility functions and constants used throughout the structural_variant package """ import argparse -import re import os +import re from Bio.Alphabet import Gapped from Bio.Alphabet.IUPAC import ambiguous_dna @@ -11,7 +11,6 @@ from Bio.Seq import Seq from tab import cast_boolean, cast_null - PROGNAME = 'mavis' EXIT_OK = 0 EXIT_ERROR = 1 @@ -431,14 +430,12 @@ def float_fraction(num): SUBCOMMAND = MavisNamespace( ANNOTATE='annotate', VALIDATE='validate', - SETUP='setup', - SCHEDULE='schedule', CLUSTER='cluster', PAIR='pairing', SUMMARY='summary', - CONFIG='config', CONVERT='convert', OVERLAY='overlay', + SETUP='setup', ) """MavisNamespace: holds controlled vocabulary for allowed pipeline stage values @@ -448,7 +445,6 @@ def float_fraction(num): - convert - pairing - pipeline -- schedule - summary - validate """ diff --git a/mavis/main.py b/mavis/main.py index f3aec4cc..9c6d678c 100644 --- a/mavis/main.py +++ b/mavis/main.py @@ -1,178 +1,29 @@ #!python import argparse +import json import logging -import platform import os -import time +import platform import sys +import time +from typing import Dict import tab from . import __version__ +from . import config as _config +from . import util as _util from .align import get_aligner_version -from . import annotate as _annotate from .annotate import main as annotate_main -from .cluster.constants import DEFAULTS as CLUSTER_DEFAULTS from .cluster import main as cluster_main -from . 
import config as _config -from .constants import SUBCOMMAND, PROTOCOL, float_fraction, EXIT_OK -from .error import DrawingFitError -from .illustrate.constants import DEFAULTS as ILLUSTRATION_DEFAULTS, DiagramSettings -from .illustrate.diagram import draw_multi_transcript_overlay -from .illustrate.scatter import bam_to_scatter -from .pairing.constants import DEFAULTS as PAIRING_DEFAULTS +from .constants import SUBCOMMAND +from .overlay import check_overlay_args +from .overlay import main as overlay_main from .pairing import main as pairing_main -from .summary.constants import DEFAULTS as SUMMARY_DEFAULTS from .summary import main as summary_main -from .tools import convert_tool_output, SUPPORTED_TOOL -from . import util as _util -from .validate.constants import DEFAULTS as VALIDATION_DEFAULTS +from .tools import SUPPORTED_TOOL, convert_tool_output +from .util import filepath from .validate import main as validate_main -from .schedule import pipeline as _pipeline - - -def check_overlay_args(args, parser): - """ - parse the overlay options and check the formatting - """ - # check complex options - for marker in args.markers: - if len(marker) < 3: - marker.append(marker[-1]) - try: - marker[1] = int(marker[1]) - marker[2] = int(marker[2]) - except ValueError: - parser.error('argument --marker: start and end must be integers: {}'.format(marker)) - - defaults = [None, None, 0.5, None, True] - bam_file, density, ymax, stranded = range(1, 5) - - for plot in args.read_depth_plots: - for i, d in enumerate(defaults): - if i >= len(plot): - plot.append(d) - if not os.path.exists(plot[bam_file]): - parser.error( - 'argument --read_depth_plots: the bam file given does not exist: {}'.format( - plot[bam_file] - ) - ) - try: - plot[density] = float(plot[density]) - if plot[density] < 0 or plot[density] > 1: - raise ValueError() - except ValueError: - parser.error( - 'argument --read_depth_plots: density must be an float between 0 and 1: {}'.format( - plot[density] - ) - ) - try: - 
if str(plot[ymax]).lower() in ['null', 'none']: - plot[ymax] = None - else: - plot[ymax] = int(plot[ymax]) - except ValueError: - parser.error( - 'argument --read_depth_plots: ymax must be an integer: {}'.format(plot[ymax]) - ) - try: - plot[stranded] = tab.cast_boolean(plot[stranded]) - except TypeError: - parser.error( - 'argument --read_depth_plots: stranded must be an boolean: {}'.format( - plot[stranded] - ) - ) - return args - - -def overlay_main( - gene_name, - output, - buffer_length, - read_depth_plots, - markers, - annotations, - drawing_width_iter_increase, - max_drawing_retries, - min_mapping_quality, - ymax_color='#FF0000', - **kwargs -): - """ - generates an overlay diagram - """ - annotations.load() - # check options formatting - gene_to_draw = None - - for chrom in annotations.content: - for gene in annotations.content[chrom]: - if gene_name in gene.aliases or gene_name == gene.name: - gene_to_draw = gene - _util.LOG( - 'Found target gene: {}(aka. {}) {}:{}-{}'.format( - gene.name, gene.aliases, gene.chr, gene.start, gene.end - ) - ) - break - if gene_to_draw is None: - raise KeyError('Could not find gene alias or id in annotations file', gene_name) - - settings = DiagramSettings(**kwargs) - - genomic_min = max(gene_to_draw.start - buffer_length, 1) - genomic_max = gene_to_draw.end + buffer_length - - plots = [] - for axis_name, bam_file, density, ymax, stranded in read_depth_plots: - # one plot per bam - plots.append( - bam_to_scatter( - bam_file, - gene_to_draw.chr, - genomic_min, - genomic_max, - strand=gene_to_draw.get_strand() if stranded else None, - ymax=ymax, - density=density, - axis_name=axis_name, - min_mapping_quality=min_mapping_quality, - ymax_color=ymax_color, - ) - ) - - for i, (marker_name, marker_start, marker_end) in enumerate(markers): - markers[i] = _annotate.base.BioInterval( - gene_to_draw.chr, marker_start, marker_end, name=marker_name - ) - - canvas = None - attempts = 1 - while True: - try: - canvas = 
draw_multi_transcript_overlay( - settings, - gene_to_draw, - vmarkers=markers, - plots=plots, - window_buffer=buffer_length, - log=_util.LOG, - ) - break - except DrawingFitError as err: - if attempts > max_drawing_retries: - raise err - _util.LOG('Drawing fit: extending window', drawing_width_iter_increase) - settings.width += drawing_width_iter_increase - attempts += 1 - - svg_output_file = os.path.join(output, '{}_{}_overlay.svg'.format(gene_to_draw.name, gene_name)) - _util.LOG('writing:', svg_output_file) - - canvas.saveas(svg_output_file) def convert_main(inputs, outputfile, file_type, strand_specific=False, assume_no_untemplated=True): @@ -189,20 +40,15 @@ def convert_main(inputs, outputfile, file_type, strand_specific=False, assume_no _util.output_tabbed_file(bpp_results, outputfile) -def main(argv=None): - """ - sets up the parser and checks the validity of command line args - loads reference files and redirects into subcommand main functions - - Args: - argv (list): List of arguments, defaults to command line arguments - """ - if argv is None: # need to do at run time or patching will not behave as expected - argv = sys.argv[1:] - start_time = int(time.time()) - +def create_parser(argv): parser = argparse.ArgumentParser(formatter_class=_config.CustomHelpFormatter) - _config.augment_parser(['version'], parser) + parser.add_argument( + '-v', + '--version', + action='version', + version='%(prog)s version ' + __version__, + help='Outputs the version number', + ) subp = parser.add_subparsers( dest='command', help='specifies which step/stage in the pipeline or which subprogram to use' ) @@ -215,87 +61,27 @@ def main(argv=None): ) required[command] = subparser.add_argument_group('required arguments') optional[command] = subparser.add_argument_group('optional arguments') - _config.augment_parser(['help', 'version', 'log', 'log_level'], optional[command]) - - # config arguments - required[SUBCOMMAND.CONFIG].add_argument( - '-w', - '--write', - help='path to the 
new configuration file', - required=True, - metavar='FILEPATH', - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--library', - metavar=' {genome,transcriptome} {diseased,normal} [strand_specific] [/path/to/bam/file]', - action=_config.RangeAppendAction, - help='configuration for libraries to be analyzed by mavis', - nmin=3, - nmax=5, - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--input', - help='path to an input file or filter for mavis followed by the library names it ' - 'should be used for', - nmin=2, - action=_config.RangeAppendAction, - metavar='FILEPATH [ ...]', - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--assign', - help='library name followed by path(s) to input file(s) or filter names. This represents the list' - ' of inputs that should be used for the library', - action=_config.RangeAppendAction, - nmin=2, - metavar=' FILEPATH [FILEPATH ...]', - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--genome_bins', - default=_util.get_env_variable('genome_bins', 100), - type=int, - metavar=_config.get_metavar(int), - help='number of bins/samples to use in calculating the fragment size stats for genomes', - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--transcriptome_bins', - default=_util.get_env_variable('transcriptome_bins', 500), - type=int, - metavar=_config.get_metavar(int), - help='number of genes to use in calculating the fragment size stats for genomes', - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--distribution_fraction', - default=_util.get_env_variable('distribution_fraction', 0.97), - type=float_fraction, - metavar=_config.get_metavar(float), - help='the proportion of the distribution of calculated fragment sizes to use in determining the stdev', - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--convert', - nmin=3, - metavar=' FILEPATH [FILEPATH ...] 
{{{}}} [stranded]'.format( - ','.join(SUPPORTED_TOOL.values()) - ), - help='input file conversion for internally supported tools', - action=_config.RangeAppendAction, - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--external_conversion', - metavar=('', '<"command">'), - nargs=2, - default=[], - help='alias for use in inputs and full command (quoted)', - action='append', - ) - optional[SUBCOMMAND.CONFIG].add_argument( - '--add_defaults', - default=False, - action='store_true', - help='write current defaults for all non-specified options to the config output', - ) - _config.augment_parser(['annotations'], optional[SUBCOMMAND.CONFIG]) - # add the optional annotations file (only need this is auto generating bam stats for the transcriptome) - _config.augment_parser(['skip_stage'], optional[SUBCOMMAND.CONFIG]) + optional[command].add_argument( + '-h', '--help', action='help', help='show this help message and exit' + ) + optional[command].add_argument( + '-v', + '--version', + action='version', + version='%(prog)s version ' + __version__, + help='Outputs the version number', + ) + optional[command].add_argument('--log', help='redirect stdout to a log file', default=None) + optional[command].add_argument( + '--log_level', + help='level of logging to output', + choices=['INFO', 'DEBUG'], + default='INFO', + ) + if command not in SUBCOMMAND.CONVERT: + optional[command].add_argument( + '--config', '-c', help='path to the JSON config file', type=filepath, required=True + ) # convert required[SUBCOMMAND.CONVERT].add_argument( @@ -304,42 +90,22 @@ def main(argv=None): required=True, help='Indicates the input file type to be parsed', ) - _config.augment_parser( - ['strand_specific', 'assume_no_untemplated'], optional[SUBCOMMAND.CONVERT] + optional[SUBCOMMAND.CONVERT].add_argument( + '--strand_specific', type=tab.cast_boolean, default=False ) - required[SUBCOMMAND.CONVERT].add_argument( - '--outputfile', '-o', required=True, help='path to the outputfile', metavar='FILEPATH' + 
optional[SUBCOMMAND.CONVERT].add_argument( + '--assume_no_untemplated', type=tab.cast_boolean, default=True ) + for command in [SUBCOMMAND.CONVERT, SUBCOMMAND.SETUP]: + required[command].add_argument( + '--outputfile', '-o', required=True, help='path to the outputfile', metavar='FILEPATH' + ) - for command in set(SUBCOMMAND.values()) - {SUBCOMMAND.CONFIG, SUBCOMMAND.CONVERT}: + for command in set(SUBCOMMAND.values()) - {SUBCOMMAND.CONVERT, SUBCOMMAND.SETUP}: required[command].add_argument( '-o', '--output', help='path to the output directory', required=True ) - # pipeline - _config.augment_parser(['config'], required[SUBCOMMAND.SETUP]) - optional[SUBCOMMAND.SETUP].add_argument( - '--skip_stage', - choices=[SUBCOMMAND.CLUSTER, SUBCOMMAND.VALIDATE], - action='append', - default=[], - help='Use flag once per stage to skip. Can skip clustering or validation or both', - ) - - # schedule arguments - optional[SUBCOMMAND.SCHEDULE].add_argument( - '--submit', - action='store_true', - default=False, - help='submit jobs to the the scheduler specified', - ) - optional[SUBCOMMAND.SCHEDULE].add_argument( - '--resubmit', - action='store_true', - default=False, - help='resubmit jobs in error states to the the scheduler specified', - ) - # add the inputs argument for command in [ SUBCOMMAND.CLUSTER, @@ -358,79 +124,14 @@ def main(argv=None): metavar='FILEPATH', ) - # cluster - _config.augment_parser( - ['library', 'protocol', 'strand_specific', 'disease_status'], required[SUBCOMMAND.CLUSTER] - ) - _config.augment_parser( - list(CLUSTER_DEFAULTS.keys()) + ['masking', 'annotations'], optional[SUBCOMMAND.CLUSTER] - ) - optional[SUBCOMMAND.CLUSTER].add_argument( - '--batch_id', help='batch id to use for prefix of split files', type=_config.nameable_string - ) - optional[SUBCOMMAND.CLUSTER].add_argument( - '--split_only', - help='Cluster the files or simply split them without clustering', - type=tab.cast_boolean, - ) - - # validate - _config.augment_parser( - [ - 'library', - 
'protocol', - 'bam_file', - 'read_length', - 'stdev_fragment_size', - 'median_fragment_size', - 'strand_specific', - 'reference_genome', - 'aligner_reference', - ], - required[SUBCOMMAND.VALIDATE], - ) - _config.augment_parser(VALIDATION_DEFAULTS.keys(), optional[SUBCOMMAND.VALIDATE]) - _config.augment_parser(['masking', 'annotations'], optional[SUBCOMMAND.VALIDATE]) - - # annotate - _config.augment_parser( - ['library', 'protocol', 'annotations', 'reference_genome'], required[SUBCOMMAND.ANNOTATE] - ) - _config.augment_parser( - ['max_proximity', 'masking', 'template_metadata'], optional[SUBCOMMAND.ANNOTATE] - ) - _config.augment_parser( - list(_annotate.constants.DEFAULTS.keys()) + list(ILLUSTRATION_DEFAULTS.keys()), - optional[SUBCOMMAND.ANNOTATE], - ) - - # pair - _config.augment_parser(['annotations'], required[SUBCOMMAND.PAIR], optional[SUBCOMMAND.PAIR]) - _config.augment_parser( - ['max_proximity'] + list(PAIRING_DEFAULTS.keys()), optional[SUBCOMMAND.PAIR] - ) - - # summary - _config.augment_parser( - [ - 'annotations', - 'flanking_call_distance', - 'split_call_distance', - 'contig_call_distance', - 'spanning_call_distance', - ], - required[SUBCOMMAND.SUMMARY], - ) - _config.augment_parser(SUMMARY_DEFAULTS.keys(), optional[SUBCOMMAND.SUMMARY]) - _config.augment_parser(['dgv_annotation'], optional[SUBCOMMAND.SUMMARY]) + # library specific commands + for command in [SUBCOMMAND.CLUSTER, SUBCOMMAND.VALIDATE, SUBCOMMAND.ANNOTATE]: + required[command].add_argument( + '--library', '-l', required=True, help='The library to run the current step on' + ) # overlay arguments required[SUBCOMMAND.OVERLAY].add_argument('gene_name', help='Gene ID or gene alias to be drawn') - _config.augment_parser(['annotations'], required[SUBCOMMAND.OVERLAY]) - _config.augment_parser( - ['drawing_width_iter_increase', 'max_drawing_retries', 'width', 'min_mapping_quality'], - optional[SUBCOMMAND.OVERLAY], - ) optional[SUBCOMMAND.OVERLAY].add_argument( '--buffer_length', default=0, @@ 
-457,13 +158,25 @@ def main(argv=None): action=_config.RangeAppendAction, ) - args = _util.MavisNamespace(**parser.parse_args(argv).__dict__) + return parser, _util.MavisNamespace(**parser.parse_args(argv).__dict__) + + +def main(argv=None): + """ + sets up the parser and checks the validity of command line args + loads reference files and redirects into subcommand main functions + + Args: + argv (list): List of arguments, defaults to command line arguments + """ + if argv is None: # need to do at run time or patching will not behave as expected + argv = sys.argv[1:] + start_time = int(time.time()) + parser, args = create_parser(argv) + if args.command == SUBCOMMAND.OVERLAY: args = check_overlay_args(args, parser) - if args.command == SUBCOMMAND.VALIDATE: - args.aligner_version = get_aligner_version(args.aligner) - log_conf = {'format': '{message}', 'style': '{', 'level': args.log_level} original_logging_handlers = logging.root.handlers[:] @@ -476,16 +189,23 @@ def main(argv=None): _util.LOG('MAVIS: {}'.format(__version__)) _util.LOG('hostname:', platform.node(), time_stamp=False) _util.log_arguments(args) - rfile_args = args - if args.command == SUBCOMMAND.SETUP: # load the configuration file - config = _config.MavisConfig.read(args.config) - config.output = args.output - config.skip_stage = args.skip_stage - config.command = SUBCOMMAND.SETUP - rfile_args = config.reference - args = config + config: Dict = dict() + + try: + if args.command != SUBCOMMAND.CONVERT: + with open(args.config, 'r') as fh: + config = json.load(fh) + _config.validate_config( + config, + args.command in {SUBCOMMAND.SETUP, SUBCOMMAND.VALIDATE}, + args.command, + ) + except AttributeError as err: + raise err + if args.command == SUBCOMMAND.VALIDATE: + args.aligner_version = get_aligner_version(config['validate.aligner']) # try checking the input files exist try: args.inputs = _util.bash_expands(*args.inputs) @@ -494,95 +214,70 @@ def main(argv=None): except FileNotFoundError: 
parser.error('--inputs file(s) for {} {} do not exist'.format(args.command, args.inputs)) - # convert reference files to objects to store both content and name for rewrite - for arg in [f for f in _annotate.file_io.REFERENCE_DEFAULTS.keys() if f != 'aligner_reference']: - try: - rfile_args[arg] = _annotate.file_io.ReferenceFile( - arg, assert_exists=True, *rfile_args[arg] - ) - except AttributeError: - pass - except FileNotFoundError: - parser.error('--{} The file specified does not exist: {}'.format(arg, rfile_args[arg])) - - # throw an error if MAVIS can't find the aligner reference - try: - rfile_args.aligner_reference = _annotate.file_io.ReferenceFile( - 'aligner_reference', rfile_args.aligner_reference, assert_exists=True - ) - except AttributeError: - pass - except FileNotFoundError: - parser.error( - '--aligner_reference file does not exist at: {}'.format(rfile_args.aligner_reference) - ) - - # for specific cases throw an argument error if missing annotations - if any( - [ - args.command == SUBCOMMAND.CLUSTER and args.uninformative_filter, - args.command == SUBCOMMAND.CONFIG - and any([PROTOCOL.TRANS in values for values in args.library]) - and SUBCOMMAND.VALIDATE not in args.skip_stage, - args.command == SUBCOMMAND.VALIDATE and args.protocol == PROTOCOL.TRANS, - args.command - in { - SUBCOMMAND.PAIR, - SUBCOMMAND.ANNOTATE, - SUBCOMMAND.SUMMARY, - SUBCOMMAND.OVERLAY, - SUBCOMMAND.SETUP, - }, - ] - ): - try: - rfile_args.annotations.files_exist(not_empty=True) - except FileNotFoundError: - parser.error('--annotations file(s) are required and do not exist') - # decide which main function to execute - ret_val = EXIT_OK command = args.command - log_to_file = args.get('log', None) - - # discard any arguments needed for redirect/setup only - for init_arg in ['command', 'log', 'log_level']: - args.discard(init_arg) try: if command == SUBCOMMAND.CLUSTER: - ret_val = cluster_main.main(**args, start_time=start_time) + cluster_main.main( + inputs=args.inputs, + 
output=args.output, + start_time=start_time, + config=config, + library=args.library, + ) elif command == SUBCOMMAND.VALIDATE: - validate_main.main(**args, start_time=start_time) + validate_main.main( + inputs=args.inputs, + output=args.output, + start_time=start_time, + config=config, + library=args.library, + ) elif command == SUBCOMMAND.ANNOTATE: - annotate_main.main(**args, start_time=start_time) + annotate_main.main( + inputs=args.inputs, + output=args.output, + start_time=start_time, + config=config, + library=args.library, + ) elif command == SUBCOMMAND.PAIR: - pairing_main.main(**args, start_time=start_time) + pairing_main.main( + inputs=args.inputs, + output=args.output, + start_time=start_time, + config=config, + ) elif command == SUBCOMMAND.SUMMARY: - summary_main.main(**args, start_time=start_time) + summary_main.main( + inputs=args.inputs, + output=args.output, + start_time=start_time, + config=config, + ) elif command == SUBCOMMAND.CONVERT: - convert_main(**args) - elif command == SUBCOMMAND.OVERLAY: - overlay_main(**args) - elif command == SUBCOMMAND.CONFIG: - _config.generate_config(args, parser, log=_util.LOG) - elif command == SUBCOMMAND.SCHEDULE: - build_file = os.path.join(args.output, 'build.cfg') - args.discard('output') - pipeline = _pipeline.Pipeline.read_build_file(build_file) - try: - code = pipeline.check_status(log=_util.LOG, **args) - finally: - _util.LOG('rewriting:', build_file) - pipeline.write_build_file(build_file) - if code != EXIT_OK: - sys.exit(code) # EXIT - else: # PIPELINE - config.reference = rfile_args - pipeline = _pipeline.Pipeline.build(config) - build_file = os.path.join(config.output, 'build.cfg') - _util.LOG('writing:', build_file) - pipeline.write_build_file(build_file) + convert_main( + args.inputs, + args.outputfile, + args.file_type, + args.strand_specific, + args.assume_no_untemplated, + ) + elif command == SUBCOMMAND.SETUP: + _util.LOG(f'writing: {args.outputfile}') + with open(args.outputfile, 'w') as fh: + 
fh.write(json.dumps(config, sort_keys=True, indent=' ')) + else: + print(args) + overlay_main( + buffer_length=args.buffer_length, + gene_name=args.gene_name, + markers=args.markers, + read_depth_plots=args.read_depth_plots, + config=config, + output=args.output, + ) duration = int(time.time()) - start_time hours = duration - duration % 3600 @@ -593,16 +288,16 @@ def main(argv=None): time_stamp=False, ) _util.LOG('run time (s): {}'.format(duration), time_stamp=False) - return ret_val except Exception as err: - if log_to_file: - logging.exception(err) # capture the error in the logging output file raise err finally: - for handler in logging.root.handlers: - logging.root.removeHandler(handler) - for handler in original_logging_handlers: - logging.root.addHandler(handler) + try: + for handler in logging.root.handlers: + logging.root.removeHandler(handler) + for handler in original_logging_handlers: + logging.root.addHandler(handler) + except Exception as err: + print(err) if __name__ == '__main__': diff --git a/mavis/overlay.py b/mavis/overlay.py new file mode 100644 index 00000000..868a90fa --- /dev/null +++ b/mavis/overlay.py @@ -0,0 +1,159 @@ +import os +from typing import Dict, List, Tuple, Union + +import tab + +from . import annotate as _annotate +from . 
import util as _util +from .annotate.file_io import ReferenceFile +from .error import DrawingFitError +from .illustrate.constants import DiagramSettings +from .illustrate.diagram import draw_multi_transcript_overlay +from .illustrate.scatter import bam_to_scatter + + +def check_overlay_args(args, parser): + """ + parse the overlay options and check the formatting + """ + # check complex options + for marker in args.markers: + if len(marker) < 3: + marker.append(marker[-1]) + try: + marker[1] = int(marker[1]) + marker[2] = int(marker[2]) + except ValueError: + parser.error('argument --marker: start and end must be integers: {}'.format(marker)) + + defaults = [None, None, 0.5, None, True] + bam_file, density, ymax, stranded = range(1, 5) + + for plot in args.read_depth_plots: + for i, d in enumerate(defaults): + if i >= len(plot): + plot.append(d) + if not os.path.exists(plot[bam_file]): + parser.error( + 'argument --read_depth_plots: the bam file given does not exist: {}'.format( + plot[bam_file] + ) + ) + try: + plot[density] = float(plot[density]) + if plot[density] < 0 or plot[density] > 1: + raise ValueError() + except ValueError: + parser.error( + 'argument --read_depth_plots: density must be an float between 0 and 1: {}'.format( + plot[density] + ) + ) + try: + if str(plot[ymax]).lower() in ['null', 'none']: + plot[ymax] = None + else: + plot[ymax] = int(plot[ymax]) + except ValueError: + parser.error( + 'argument --read_depth_plots: ymax must be an integer: {}'.format(plot[ymax]) + ) + try: + plot[stranded] = tab.cast_boolean(plot[stranded]) + except TypeError: + parser.error( + 'argument --read_depth_plots: stranded must be an boolean: {}'.format( + plot[stranded] + ) + ) + return args + + +def main( + gene_name: str, + output: str, + config: Dict, + buffer_length: int, + read_depth_plots, + markers: List[Tuple[str, int, int]], + ymax_color='#FF0000', + **kwargs, +): + """ + generates an overlay diagram + """ + annotations = 
ReferenceFile.load_from_config(config, 'annotations') + annotations.load() + drawing_width_iter_increase = config['illustrate.drawing_width_iter_increase'] + max_drawing_retries = config['illustrate.max_drawing_retries'] + min_mapping_quality = config['validate.min_mapping_quality'] + # check options formatting + gene_to_draw = None + + for chrom in annotations.content: + for gene in annotations.content[chrom]: + if gene_name in gene.aliases or gene_name == gene.name: + gene_to_draw = gene + _util.LOG( + 'Found target gene: {}(aka. {}) {}:{}-{}'.format( + gene.name, gene.aliases, gene.chr, gene.start, gene.end + ) + ) + break + if gene_to_draw is None: + raise KeyError('Could not find gene alias or id in annotations file', gene_name) + + settings = DiagramSettings(**kwargs) + + genomic_min = max(gene_to_draw.start - buffer_length, 1) + genomic_max = gene_to_draw.end + buffer_length + + plots = [] + for axis_name, bam_file, density, ymax, stranded in read_depth_plots: + # one plot per bam + plots.append( + bam_to_scatter( + bam_file, + gene_to_draw.chr, + genomic_min, + genomic_max, + strand=gene_to_draw.get_strand() if stranded else None, + ymax=ymax, + density=density, + axis_name=axis_name, + min_mapping_quality=min_mapping_quality, + ymax_color=ymax_color, + ) + ) + + vmarkers = [] + + for i, (marker_name, marker_start, marker_end) in enumerate(markers): + vmarkers.append( + _annotate.base.BioInterval(gene_to_draw.chr, marker_start, marker_end, name=marker_name) + ) + + canvas = None + attempts = 1 + while True: + try: + canvas = draw_multi_transcript_overlay( + settings, + gene_to_draw, + vmarkers=vmarkers, + plots=plots, + window_buffer=buffer_length, + log=_util.LOG, + ) + break + except DrawingFitError as err: + if attempts > max_drawing_retries: + raise err + _util.LOG('Drawing fit: extending window', drawing_width_iter_increase) + settings.width += drawing_width_iter_increase + attempts += 1 + + svg_output_file = os.path.join(output, 
'{}_{}_overlay.svg'.format(gene_to_draw.name, gene_name)) + _util.LOG('writing:', svg_output_file) + + canvas.saveas(svg_output_file) diff --git a/mavis/pairing/main.py b/mavis/pairing/main.py index 1f220d04..debd823d 100644 --- a/mavis/pairing/main.py +++ b/mavis/pairing/main.py @@ -1,40 +1,35 @@ import itertools import os import time +from typing import Dict, List -from .pairing import inferred_equivalent, product_key, pair_by_distance -from .constants import DEFAULTS from ..annotate.constants import SPLICE_TYPE +from ..annotate.file_io import ReferenceFile from ..constants import CALL_METHOD, COLUMNS, PROTOCOL, SVTYPE -from ..util import generate_complete_stamp, LOG, output_tabbed_file, read_inputs +from ..util import LOG, generate_complete_stamp, output_tabbed_file, read_inputs +from .pairing import inferred_equivalent, pair_by_distance, product_key def main( - inputs, - output, - annotations, - flanking_call_distance=DEFAULTS.flanking_call_distance, - split_call_distance=DEFAULTS.split_call_distance, - contig_call_distance=DEFAULTS.contig_call_distance, - spanning_call_distance=DEFAULTS.spanning_call_distance, + inputs: List[str], + output: str, + config: Dict, start_time=int(time.time()), - **kwargs + **kwargs, ): """ Args: inputs (List[str]): list of input files to read output (str): path to the output directory - flanking_call_distance (int): pairing distance for pairing with an event called by [flanking read pair](/glossary/#flanking-read-pair) - split_call_distance (int): pairing distance for pairing with an event called by [split read](/glossary/#split-read) - contig_call_distance (int): pairing distance for pairing with an event called by contig or [spanning read](/glossary/#spanning-read) """ - annotations.load() + annotations = ReferenceFile.load_from_config(config, 'annotations', eager_load=True) + # load the file distances = { - CALL_METHOD.FLANK: flanking_call_distance, - CALL_METHOD.SPLIT: split_call_distance, - CALL_METHOD.CONTIG: 
contig_call_distance, - CALL_METHOD.SPAN: spanning_call_distance, + CALL_METHOD.FLANK: config['pairing.flanking_call_distance'], + CALL_METHOD.SPLIT: config['pairing.split_call_distance'], + CALL_METHOD.CONTIG: config['pairing.contig_call_distance'], + CALL_METHOD.SPAN: config['pairing.spanning_call_distance'], } bpps = [] @@ -93,7 +88,15 @@ def main( bpp.data[COLUMNS.inferred_pairing] = '' if product_key(bpp) in bpp_by_product_key: - raise KeyError('duplicate bpp is not unique within lib', product_key(bpp)) + diffs = {} + other = bpp_by_product_key[product_key(bpp)] + for key in (set(other.data.keys()) | set(bpp.data.keys())) - {'line_no'}: + if bpp.data.get(key) != other.data.get(key): + diffs[key] = (bpp.data.get(key), other.data.get(key)) + if diffs: + raise KeyError( + f'duplicate bpp ({product_key(bpp)}) is not unique within lib (diffs: {diffs})' + ) bpp_by_product_key[product_key(bpp)] = bpp distance_pairings = {} @@ -137,5 +140,6 @@ def main( bpp = bpp_by_product_key[pkey] bpp.data[COLUMNS.inferred_pairing] = ';'.join(sorted(pkeys)) - fname = os.path.join(output, 'mavis_paired_{}.tab'.format('_'.join(sorted(list(libraries))))) + fname = os.path.join(output, 'mavis_paired.tab') output_tabbed_file(bpps, fname) + generate_complete_stamp(output, LOG) diff --git a/mavis/schedule/constants.py b/mavis/schedule/constants.py deleted file mode 100644 index 5ee37a8c..00000000 --- a/mavis/schedule/constants.py +++ /dev/null @@ -1,109 +0,0 @@ -from ..constants import MavisNamespace -from ..util import WeakMavisNamespace - - -JOB_STATUS = MavisNamespace( - 'SUBMITTED', - 'COMPLETED', - 'ERROR', - 'RUNNING', - 'FAILED', - 'PENDING', - 'CANCELLED', - NOT_SUBMITTED='NOT SUBMITTED', - UNKNOWN='UNKNOWN', - __name__='mavis.schedule.constants.JOB_STATUS', -) - - -def cumulative_job_state(states): - """ - Given a set of states, return a single state based on the reporting priority - """ - priority = [ - JOB_STATUS.ERROR, - JOB_STATUS.FAILED, - JOB_STATUS.CANCELLED, - 
JOB_STATUS.NOT_SUBMITTED, - JOB_STATUS.PENDING, - JOB_STATUS.SUBMITTED, - JOB_STATUS.RUNNING, - JOB_STATUS.COMPLETED, - ] - for state in priority: - if state in states: - return state - return JOB_STATUS.NOT_SUBMITTED - - -SCHEDULER = MavisNamespace( - 'SGE', 'SLURM', 'TORQUE', 'LOCAL', __name__='mavis.schedule.constants.SCHEDULER' -) -"""MavisNamespace: scheduler types - -- [LOCAL](/glossary/#LOCAL) -- [SGE](/glossary/#SGE) -- [SLURM](/glossary/#SLURM) -- [TORQUE](/glossary/#TORQUE) -""" - -MAIL_TYPE = MavisNamespace( - 'BEGIN', 'END', 'FAIL', 'ALL', 'NONE', __name__='mavis.schedule.constants.MAIL_TYPE' -) -""" -When the scheduler should notify [mail_user](/configuration/settings/#mail_user) about a job - -- ``ALL`` - All other options (except none) -- ``BEGIN`` - Send an email when the job starts -- ``END`` - Send an email when the job has terminated -- ``FAIL`` - Send an email if the job fails -- ``NONE`` - Do not send email -""" - -STD_OPTIONS = ['memory_limit', 'queue', 'time_limit', 'import_env', 'mail_user', 'mail_type'] - -OPTIONS = WeakMavisNamespace(__name__='mavis.schedule.constants.options') -"""MavisNamespace: submission options - -- [annotation_memory](/configuration/settings/#annotation_memory) -- [concurrency_limit](/configuration/settings/#concurrency_limit) -- [import_env](/configuration/settings/#import_env) -- [mail_type](/configuration/settings/#mail_type) -- [mail_user](/configuration/settings/#mail_user) -- [memory_limit](/configuration/settings/#memory_limit) -- [queue](/configuration/settings/#queue) -- [remote_head_ssh](/configuration/settings/#remote_head_ssh) -- [scheduler](/configuration/settings/#scheduler) -- [time_limit](/configuration/settings/#time_limit) -- [trans_validation_memory](/configuration/settings/#trans_validation_memory) -- [validation_memory](/configuration/settings/#validation_memory) - -""" -OPTIONS.add('annotation_memory', 12000, defn='default memory limit (MB) for the annotation stage') -OPTIONS.add('import_env', 
True, defn='flag to import environment variables') -OPTIONS.add( - 'mail_type', MAIL_TYPE.NONE, cast_type=MAIL_TYPE, defn='When to notify the mail_user (if given)' -) -OPTIONS.add('mail_user', '', defn='User(s) to send notifications to') -OPTIONS.add( - 'memory_limit', 16000, defn='the maximum number of megabytes (MB) any given job is allowed' -) # 16 GB -OPTIONS.add('queue', '', cast_type=str, defn='the queue jobs are to be submitted to') -OPTIONS.add('scheduler', SCHEDULER.SLURM, defn='The scheduler being used', cast_type=SCHEDULER) -OPTIONS.add( - 'time_limit', 16 * 60 * 60, defn='the time in seconds any given jobs is allowed' -) # 16 hours -OPTIONS.add( - 'trans_validation_memory', - 18000, - defn='default memory limit (MB) for the validation stage (for transcriptomes)', -) -OPTIONS.add('validation_memory', 16000, defn='default memory limit (MB) for the validation stage') -OPTIONS.add( - 'concurrency_limit', - None, - nullable=True, - cast_type=int, - defn='The concurrency limit for tasks in any given job array or the number of concurrent processes allowed for a local run', -) -OPTIONS.add('remote_head_ssh', '', cast_type=str, defn='ssh target for remote scheduler commands') diff --git a/mavis/schedule/job.py b/mavis/schedule/job.py deleted file mode 100644 index 76557baf..00000000 --- a/mavis/schedule/job.py +++ /dev/null @@ -1,265 +0,0 @@ -from copy import copy as _copy -import os -import re -import time - -from ..constants import SUBCOMMAND, MavisNamespace -from .constants import JOB_STATUS, OPTIONS, STD_OPTIONS - - -class LogFile: - """ - stores information about the log status - """ - - STATUS = MavisNamespace('EMPTY', 'CRASH', 'INCOMPLETE', 'COMPLETE') - """MavisNamespace: The status of the job based on parsing of the logfile""" - - def __init__(self, filename, status, message=None): - """ - Args: - filename (str): path to the logfile - status (LogFile.STATUS): the status of the logfile - message (str): the message parsed from the logfile. 
Generally this is an error from the log - """ - self.filename = filename - self.status = self.STATUS.enforce(status) - self.message = message.strip() if message is not None else None - - @classmethod - def parse(cls, filename): - """ - given a file parse to see if it looks like a complete log file (contains run time), - was truncated, or reported an error - """ - if not os.path.isfile(filename): - raise FileNotFoundError('Log file does not exist', filename) - log = None - with open(filename, 'r') as fh: - lines = [line.strip() for line in fh.readlines() if line.strip()] - for line in lines[::-1]: - line = line.strip().lower() - if ( - line and line[0] != '\x1b' - ): # ignore lines starting with terminal control characters - if re.search( - r'(\b|^)((\S+)?error|fault|fatal|aborted|core dumped|killed|died|command not found)(\b|$)', - line, - ): - log = LogFile(filename, cls.STATUS.CRASH, line) - elif re.match(r'^\s*run time \(s\): (\d+)\s*$', line): - log = LogFile(filename, cls.STATUS.COMPLETE) - else: - log = LogFile(filename, cls.STATUS.INCOMPLETE, line) - return log - return LogFile(filename, cls.STATUS.EMPTY) - - -class Job: - def __init__( - self, - stage, - output_dir, - stdout=None, - job_ident=None, - name=None, - dependencies=None, - script=None, - created_at=None, - status=JOB_STATUS.NOT_SUBMITTED, - status_comment='', - **options - ): - """ - Args: - stage (str): the mavis pipleine stage this job belongs to - job_ident (int): the job number/id according to the scheduler being used - output_dir (str): path to the output directory where logs/stamps for this job will be written - name (str): the job name according to the scheduler being used - dependencies (List[Job]): list of jobs which must complete for this job to run - stdout (str): basename of the file to write std output to - script (str): path to the script which contains the commands for the job - created_at (int): the time stamp for when the job was created (created != submitted) - status 
(mavis.schedule.constants.JOB_STATUS): The current (since last checked) status of the job - status_comment (str): the comment which describes the status, generally this is used for reporting errors from the log file or failed dependencies (SLURM) - options (**dict): override default options specified by OPTIONS - """ - self.stage = SUBCOMMAND.enforce(stage) - self.job_ident = job_ident - self.name = name - self.dependencies = dependencies if dependencies else [] - self.script = script - self.status = JOB_STATUS.enforce(status) - self.output_dir = output_dir - self.stdout = ( - os.path.join(output_dir, 'job-{name}-{job_ident}.log') if not stdout else stdout - ) - - self.created_at = int(created_at if created_at else time.time()) - self.status = status - self.status_comment = status_comment - - # inputs to the function call should override the default values - for option, value in [(o, OPTIONS[o]) for o in STD_OPTIONS]: - setattr(self, option, OPTIONS.type(option)(options.get(option, value))) - - # check that nothing weird was passed in the kwargs - for option in options: - if option not in STD_OPTIONS: - raise AttributeError('unexpected attribute: {}'.format(option)) - - @property - def display_name(self): - """ - Used for identifying this job in an ini config file - """ - display_name = ( - self.name if self.job_ident is None else '{}_{}'.format(self.name, self.job_ident) - ) - display_name = re.sub(r'[\[\]#;]', '_', display_name) - return display_name - - def flatten(self): - result = {} - for attr, value in self.__dict__.items(): - if attr == 'dependencies': - value = [j.display_name for j in value] - try: - if not isinstance(value, str): - value = '\n'.join([str(v) for v in value]) - except TypeError: - pass - result[attr] = str(value) - return result - - def logfile(self): - """ - returns the path to the logfile with job name and job id substituted into the stdout pattern - """ - return self.stdout.format(name=self.name, job_ident=self.job_ident) - - def 
complete_stamp(self): - """ - returns the path to the expected complete stamp - """ - return os.path.join(self.output_dir, 'MAVIS-{job_ident}.COMPLETE').format( - job_ident=self.job_ident, name=self.name - ) - - def reset(self): - self.status = JOB_STATUS.NOT_SUBMITTED - self.status_comment = '' - self.job_ident = None - - -class ArrayJob(Job): - """ - Class for dealing with array jobs. Jobs with many tasks - """ - - def __init__(self, stage, task_list, **kwargs): - """ - Args: - task_list (Union[List,int]): the ids of tasks in the job array - """ - Job.__init__(self, stage, **kwargs) - self.stdout = ( - os.path.join(self.output_dir, 'job-{name}-{job_ident}-{task_ident}.log') - if 'stdout' not in kwargs - else kwargs['stdout'] - ) - - if isinstance(task_list, int): - task_list = list(range(1, task_list + 1)) - self.task_list = [Task(self, n) for n in task_list] - - @property - def tasks(self): - return len(self.task_list) - - def get_task(self, task_ident): - """ - returns a task by task id - """ - task_ident = int(task_ident) - for task in self.task_list: - if task.task_ident == task_ident: - return task - raise KeyError('task id not found', task_ident, self.task_list) - - def has_task(self, task_ident): - task_ident = int(task_ident) - for task in self.task_list: - if task.task_ident == task_ident: - return True - return False - - def remove_task(self, task_ident): - self.task_list = [task for task in self.task_list if task.task_ident != task_ident] - - def logfile(self, task_ident): - return self.stdout.format(name=self.name, job_ident=self.job_ident, task_ident=task_ident) - - def complete_stamp(self, task_ident): - """ - returns the path to the expected complete stamp - """ - return os.path.join(self.output_dir, 'MAVIS-{job_ident}.COMPLETE').format( - job_ident=self.job_ident, name=self.name, task_ident=task_ident - ) - - def flatten(self): - result = {k: v for k, v in Job.flatten(self).items() if k != 'task_list'} - result['task_list'] = 
'\n'.join([str(t.task_ident) for t in self.task_list]) - return result - - def copy_with_tasks(self, task_list): - copy = _copy(self) - copy.task_list = [Task(self, n) for n in task_list] - copy.dependencies = [] - copy.reset() - return copy - - def reset(self): - Job.reset(self) - for task in self.task_list: - task.reset() - - def __repr__(self): - return '{}(job_ident={}, name={}, stage={}, status={})'.format( - self.__class__.__name__, self.job_ident, self.name, self.stage, self.status - ) - - -class TorqueArrayJob(ArrayJob): - def complete_stamp(self, task_ident): - # example: MAVIS-136[1].torque01.bcgsc.ca.COMPLETE - job_ident = re.sub(r'\[\]', '[{}]'.format(task_ident), self.job_ident) - return os.path.join(self.output_dir, 'MAVIS-{job_ident}.COMPLETE').format( - job_ident=job_ident, name=self.name, task_ident=task_ident - ) - - def logfile(self, task_ident): - # example: job-MV_mock-A47933_batch-B9PE6YAtnHu4cHA2GrsEzX-1-136[1].torque01.bcgsc.ca-1.log-1 - name = '{}-{}'.format(self.name, task_ident) - job_ident = re.sub(r'\[\]', '[{}]'.format(task_ident), self.job_ident) - log = self.stdout.format(name=name, job_ident=job_ident, task_ident=task_ident) - return '{}-{}'.format(log, task_ident) - - -class Task: - def __init__(self, array_job, task_ident): - self.array_job = array_job - self.task_ident = int(task_ident) - self.status = JOB_STATUS.NOT_SUBMITTED - self.status_comment = '' - - def logfile(self): - return self.array_job.logfile(self.task_ident) - - def complete_stamp(self): - return self.array_job.complete_stamp(self.task_ident) - - def reset(self): - self.status = JOB_STATUS.NOT_SUBMITTED - self.status_comment = '' diff --git a/mavis/schedule/local.py b/mavis/schedule/local.py deleted file mode 100644 index 8037c3a1..00000000 --- a/mavis/schedule/local.py +++ /dev/null @@ -1,161 +0,0 @@ -import atexit -from concurrent import futures -from datetime import datetime -import logging -import multiprocessing -import os - -import shortuuid - -from ..util 
import LOG -from ..annotate.file_io import REFERENCE_DEFAULTS, ReferenceFile - -from .job import Job -from .scheduler import Scheduler -from .constants import JOB_STATUS, SCHEDULER - - -class LocalJob(Job): - def __init__(self, args, func, rank=None, response=None, *pos, **kwargs): - """ - Args: - args (list): A list of arguments to passed to the function given - func (Callable): the function to be run - rank (int): rank of the job within the pool - response (concurrent.futures.Future): the result from the subprocess - """ - self.args = args - self.func = func - self.response = response - self.rank = rank - for filetype in REFERENCE_DEFAULTS: - setattr(self, filetype, kwargs.pop(filetype, None)) - Job.__init__(self, *pos, **kwargs) - - def check_complete(self): - """ - check that the complete stamp associated with this job exists - """ - return os.path.exists(self.complete_stamp()) - - def flatten(self): - result = Job.flatten(self) - omit = { - 'script', - 'rank', - 'response', - 'func', - 'queue', - 'import _env', - 'mail_user', - 'mail_type', - } - return {k: v for k, v in result.items() if k not in omit} - - -def write_stamp_callback(response): - if response.exception() or response.cancelled() or response.running(): - return - try: - LOG('writing:', response.complete_stamp, time_stamp=True, indent_level=1) - with open(response.complete_stamp, 'w') as fh: - fh.write('end: {}\n'.format(int(datetime.timestamp(datetime.utcnow())))) - except Exception as err: - LOG('error writing the complete stamp', level=logging.CRITICAL, indent_level=1) - raise err - - -class LocalScheduler(Scheduler): - """ - Scheduler class for dealing with running mavis locally - """ - - NAME = SCHEDULER.LOCAL - """:attr:`mavis.schedule.constants.SCHEDULER`: the type of scheduler""" - - def __init__(self, *pos, **kwargs): - Scheduler.__init__(self, *pos, **kwargs) - self.concurrency_limit = ( - multiprocessing.cpu_count() - 1 - if not self.concurrency_limit - else self.concurrency_limit - ) - 
self.pool = None # set this at the first submission - self.submitted = {} # submitted jobs process response objects by job ID - atexit.register(self.close) # makes the pool 'auto close' on normal python exit - - def submit(self, job): - """ - Add a job to the pool - - Args: - job (LocalJob): the job to be submitted - """ - if self.pool is None: - self.pool = futures.ProcessPoolExecutor(max_workers=self.concurrency_limit) - if not job.job_ident: - job.job_ident = str(shortuuid.uuid()) - job.status = JOB_STATUS.SUBMITTED - args = [arg.format(job_ident=job.job_ident, name=job.name) for arg in job.args] - # if this job exists in the pool, return its response object - if job.job_ident in self.submitted: - return self.submitted[job.job_ident] - - # load any reference files not cached into the parent memory space - for filetype in [f for f in REFERENCE_DEFAULTS.keys() if f != 'aligner_reference']: - if getattr(job, filetype) is not None: - ref = ReferenceFile(filetype, getattr(job, filetype)) - ref.load(verbose=False) - # otherwise add it to the pool - job.response = self.pool.submit( - job.func, args - ) # no arguments, defined all in the job object - setattr(job.response, 'complete_stamp', job.complete_stamp()) - job.response.add_done_callback(write_stamp_callback) - self.submitted[job.job_ident] = job - job.rank = len(self.submitted) - LOG('submitted', job.name, indent_level=1) - return job - - def wait(self): - """ - wait for everything in the current pool to finish - """ - if self.pool is None: - return - self.pool.shutdown(True) - self.pool = None - for job in self.submitted.values(): - self.update_info(job) - - def update_info(self, job): - """ - Args: - job (LocalJob): the job to check and update the status for - """ - # check if the job has been submitted already and completed or partially run - if not job.job_ident: - job.status = JOB_STATUS.NOT_SUBMITTED - elif os.path.exists(job.complete_stamp()): - job.status = JOB_STATUS.COMPLETED - elif 
os.path.exists(job.logfile()) and job.job_ident not in self.submitted: - job.status = JOB_STATUS.UNKNOWN - elif job.job_ident in self.submitted: - if job.response.done(): - excpt = job.response.exception() - if excpt is None: - job.status = JOB_STATUS.COMPLETED - else: - job.status = JOB_STATUS.FAILED - job.status_comment = str(excpt) - elif job.response.running(): - job.status = JOB_STATUS.RUNNING - else: - job.status = JOB_STATUS.PENDING - else: - job.status = JOB_STATUS.UNKNOWN - - def close(self): - if self.pool is not None: - self.pool.shutdown() - self.pool = None diff --git a/mavis/schedule/pipeline.py b/mavis/schedule/pipeline.py deleted file mode 100644 index bdb42f63..00000000 --- a/mavis/schedule/pipeline.py +++ /dev/null @@ -1,945 +0,0 @@ -from configparser import ConfigParser, ExtendedInterpolation -import os -import re -import shutil -import subprocess - -from shortuuid import uuid - -from ..cluster import constants as _CLUSTER -from ..constants import SUBCOMMAND, PROTOCOL, EXIT_ERROR, EXIT_OK, EXIT_INCOMPLETE -from ..tools import convert_tool_output -from ..util import mkdirp, output_tabbed_file, LOG, DEVNULL -from ..validate import constants as _VALIDATE -from ..annotate import constants as _ANNOTATE -from ..annotate import file_io as _file_io -from ..summary import constants as _SUMMARY -from .job import Job, ArrayJob, LogFile, TorqueArrayJob -from .scheduler import SlurmScheduler, TorqueScheduler, SgeScheduler, consecutive_ranges -from .local import LocalJob, LocalScheduler -from .constants import JOB_STATUS, STD_OPTIONS, OPTIONS, SCHEDULER - -PROGNAME = shutil.which('mavis') -SHEBANG = '#!/bin/bash' -SCHEDULERS_BY_NAME = { - sched.NAME: sched for sched in [SlurmScheduler, TorqueScheduler, LocalScheduler, SgeScheduler] -} - - -def stringify_args_to_command(args): - """ - takes a list of arguments and prepares them for writing to a bash script - """ - command = [] - for argname, value in args.items(): - if isinstance(value, _file_io.ReferenceFile): 
- value = value.name - if isinstance(value, str): - command.append('--{} "{}"'.format(argname, value)) - else: - try: - value = ' '.join([str(v) for v in value]) - except TypeError: - pass - command.append('--{} {}'.format(argname, value)) - return command - - -def parse_run_time(filename): - """ - parses the run time listed at the end of a file following mavis conventions - """ - with open(filename, 'r') as fh: - content = fh.read().strip() - for line in [line.strip() for line in content.split('\n')][::-1]: - match = re.match(r'^\s*run time \(s\): (\d+)\s*$', line) # older style complete stamp - if match: - return int(match.group(1)) - match = re.search(r'start:\s*(\d+)\s*end:\s*(\d+)', line) - if match: - return int(match.group(2)) - int(match.group(1)) - return -1 - - -def run_conversion(config, libconf, conversion_dir, assume_no_untemplated=True): - """ - Converts files if not already converted. Returns a list of filenames - """ - inputs = [] - # run the conversions - for input_file in libconf.inputs: - output_filename = os.path.join(conversion_dir, input_file + '.tab') - if input_file in config.convert: - if not os.path.exists(output_filename): - command = config.convert[input_file] - if ( - command[0] == 'convert_tool_output' - ): # convert_tool_output FILEPATH [FILEPATH...] 
TOOL stranded - LOG('converting input command:', command) - output_tabbed_file( - convert_tool_output( - command[1:-2], - command[-2], - command[-1], - log=LOG, - assume_no_untemplated=assume_no_untemplated, - ), - output_filename, - ) - else: - command = ' '.join(command) + ' -o {}'.format(output_filename) - LOG('converting input command:') - LOG('>>>', command, time_stamp=False) - subprocess.check_output(command, shell=True) - inputs.append(output_filename) - else: - inputs.append(input_file) - return inputs - - -def validate_args(config, libconf): - """ - Pull arguments from the main config and library specific config to pass to validate - - Args: - config (MavisConfig): the main program config - libconf (LibraryConfig): library specific configuration - """ - allowed_args = [ - 'masking', - 'reference_genome', - 'aligner_reference', - 'library', - 'bam_file', - 'protocol', - 'read_length', - 'stdev_fragment_size', - 'median_fragment_size', - 'strand_specific', - 'annotations', - ] + list(_VALIDATE.DEFAULTS.keys()) - - # overwrite args in order of increasing specificity - args = {} - args.update(_VALIDATE.DEFAULTS.items()) - args.update({k: v.name for k, v in config.reference.items()}) - args.update(config.validate.items()) - args.update(libconf.items()) - args = {k: v for k, v in args.items() if k in allowed_args} - return args - - -def annotate_args(config, libconf): - """ - Pull arguments from the main config and library specific config to pass to annotate - - Args: - config (MavisConfig): the main program config - libconf (LibraryConfig): library specific configuration - """ - allowed_args = [ - 'reference_genome', - 'template_metadata', - 'masking', - 'annotations', - 'min_orf_size', - 'max_orf_cap', - 'library', - 'protocol', - 'min_domain_mapping_match', - 'domain_name_regex_filter', - 'max_proximity', - ] + list(_ANNOTATE.DEFAULTS.keys()) - args = {} - args.update(_ANNOTATE.DEFAULTS.items()) - args.update({k: v.name for k, v in config.reference.items()}) 
- args.update(config.cluster.items()) - args.update(config.illustrate.items()) - args.update(config.annotate.items()) - args.update(libconf.items()) - args = {k: v for k, v in args.items() if k in allowed_args} - return args - - -def summary_args(config): - """ - Pull arguments from the main config and library specific config to pass to summary - - Args: - config (MavisConfig): the main program config - libconf (LibraryConfig): library specific configuration - """ - allowed_args = [ - 'flanking_call_distance', - 'split_call_distance', - 'contig_call_distance', - 'spanning_call_distance', - 'dgv_annotation', - 'annotations', - ] + list(_SUMMARY.DEFAULTS.keys()) - args = {} - args.update({k: v.name for k, v in config.reference.items()}) - args.update(config.pairing.items()) - args.update(config.summary.items()) - args = {k: v for k, v in args.items() if k in allowed_args} - return args - - -def cluster_args(config, libconf): - """ - Pull arguments from the main config and library specific config to pass to cluster - - Args: - config (MavisConfig): the main program config - libconf (LibraryConfig): library specific configuration - """ - allowed_args = [ - 'masking', - 'annotations', - 'library', - 'protocol', - 'disease_status', - 'strand_specific', - ] + list(_CLUSTER.DEFAULTS.keys()) - args = {} - args.update(_CLUSTER.DEFAULTS.items()) - args.update({k: v.name for k, v in config.reference.items()}) - args.update(config.cluster.items()) - args.update(config.illustrate.items()) - args.update(config.annotate.items()) - args.update(libconf.items()) - args = {k: v for k, v in args.items() if k in allowed_args} - return args - - -class Pipeline: - ERROR_STATES = { - JOB_STATUS.ERROR, - JOB_STATUS.FAILED, - JOB_STATUS.CANCELLED, - JOB_STATUS.UNKNOWN, - JOB_STATUS.NOT_SUBMITTED, - } - - def __init__( - self, - output_dir, - scheduler, - validations=None, - annotations=None, - pairing=None, - summary=None, - checker=None, - batch_id='batch-{}'.format(uuid()), - ): - """ - 
Args: - output_dir (str): path to main output directory for all mavis pipeline results - scheduler (Scheduler): the class for interacting with a job scheduler - validations (List[Job]): list of validation jobs - annotations (List[Job]): list of annotation jobs - pairing (Job): pairing job - summary (Job): summary job - batch_id (str): the batch id for this pipeline run. Used in avoinfing job name conflicts - """ - self.scheduler = scheduler - self.output_dir = output_dir - self.validations = [] if validations is None else validations - self.annotations = [] if annotations is None else annotations - self.pairing = pairing - self.summary = summary - self.checker = checker - self.batch_id = batch_id - self.args = {} # for local runs only, store config to be passed to MAVIS stage - - def write_submission_script(self, subcommand, job, args, aligner_path=None): - """ - Args: - subcommand (SUBCOMMAND): the pipeline step this script will run - job (Job): the job the script is for - args (dict): arguments for the subcommand - """ - LOG('writing:', job.script, time_stamp=True) - with open(job.script, 'w') as fh: - fh.write( - """{shebang} -{aligner_path} -cd {cwd} -START_TIME=$(date +%s)\n\n""".format( - shebang=SHEBANG, - aligner_path='export PATH={}:$PATH'.format(os.path.dirname(aligner_path)) - if aligner_path - else '', - cwd=os.getcwd(), - ) - ) - commands = [PROGNAME, subcommand] + stringify_args_to_command(args) - fh.write(' \\\n\t'.join(commands) + '\n\n') - fh.write( - """ -code=$? 
- -if [ "$code" -ne "0" ] -then - exit $code -fi - -END_TIME=$(date +%s) - -echo "start: $START_TIME end: $END_TIME" > {}/MAVIS-${}.COMPLETE - - """.format( - args['output'], - self.scheduler.ENV_JOB_IDENT - if not isinstance(job, ArrayJob) - else self.scheduler.ENV_ARRAY_IDENT, - ) - ) - - @classmethod - def format_args(cls, subcommand, args): - command = [subcommand] - for arg, val in args.items(): - command.append('--{}'.format(arg)) - if isinstance(val, str): - command.append(val) - else: - try: - command.extend(iter(val)) - except TypeError: - command.append(val) - return [str(v) for v in command] - - @classmethod - def build(cls, config): - """ - Args: - config (MavisConfig): the main configuration. Note this is the config after all reference inputs have been loaded - Returns: - Pipeline: the pipeline instance with job dependencies information etc. - """ - from ..main import main as _main - - conversion_dir = mkdirp(os.path.join(config.output, 'converted_inputs')) - config.output = os.path.abspath(config.output) - if config.schedule.scheduler not in SCHEDULERS_BY_NAME: - raise NotImplementedError( - 'unsupported scheduler', config.schedule.scheduler, list(SCHEDULERS_BY_NAME.keys()) - ) - - scheduler = SCHEDULERS_BY_NAME[config.schedule.scheduler]( - config.schedule.get('concurrency_limit', OPTIONS.concurrency_limit), - remote_head_ssh=config.schedule.get('remote_head_ssh', OPTIONS.remote_head_ssh), - ) - pipeline = Pipeline(output_dir=config.output, scheduler=scheduler) - - annotation_output_files = [] - for libconf in config.libraries.values(): - base = os.path.join( - config.output, - '{}_{}_{}'.format(libconf.library, libconf.disease_status, libconf.protocol), - ) - LOG('setting up the directory structure for', libconf.library, 'as', base) - libconf.inputs = run_conversion(config, libconf, conversion_dir) - - # run the cluster stage - cluster_output = mkdirp( - os.path.join(base, SUBCOMMAND.CLUSTER) - ) # creates the clustering output dir - args = 
cluster_args(config, libconf) - args.update({'batch_id': pipeline.batch_id, 'output': cluster_output}) - args['split_only'] = SUBCOMMAND.CLUSTER in config.get('skip_stage', []) - args['inputs'] = libconf.inputs - LOG('clustering', '(split only)' if args['split_only'] else '', time_stamp=True) - clustering_log = os.path.join( - args['output'], 'MC_{}_{}.log'.format(libconf.library, pipeline.batch_id) - ) - LOG('writing:', clustering_log, time_stamp=True) - args['log'] = clustering_log - clustered_files = _main(cls.format_args(SUBCOMMAND.CLUSTER, args)) - - # make a validation job for each cluster file - validate_jobs = [] - - if SUBCOMMAND.VALIDATE not in config.skip_stage: - mkdirp(os.path.join(base, SUBCOMMAND.VALIDATE)) - for task_ident in range(1, len(clustered_files) + 1): - mkdirp( - os.path.join( - base, SUBCOMMAND.VALIDATE, '{}-{}'.format(pipeline.batch_id, task_ident) - ) - ) - args = validate_args(config, libconf) - - script_name = os.path.join(base, SUBCOMMAND.VALIDATE, 'submit.sh') - job_options = {k: v for k, v in config.schedule.items() if k in STD_OPTIONS} - job_options['memory_limit'] = config.schedule.validation_memory - - if libconf.protocol == PROTOCOL.TRANS: - job_options['memory_limit'] = config.schedule.trans_validation_memory - - if scheduler.NAME == SCHEDULER.LOCAL: - job_options['reference_genome'] = args['reference_genome'] - if libconf.protocol == PROTOCOL.TRANS: - job_options['annotations'] = args['annotations'] - - for task_ident in range(1, len(clustered_files) + 1): - args['inputs'] = [ - os.path.join( - cluster_output, '{}-{}.tab'.format(pipeline.batch_id, task_ident) - ) - ] - args['output'] = os.path.join( - base, SUBCOMMAND.VALIDATE, '{}-{}'.format(pipeline.batch_id, task_ident) - ) - job_name = 'MV_{}_{}-{}'.format( - libconf.library, pipeline.batch_id, task_ident - ) - args['log'] = os.path.join(args['output'], 'job-{name}-{job_ident}.log') - validate_job = LocalJob( - stage=SUBCOMMAND.VALIDATE, - output_dir=args['output'], - 
stdout=args['log'], - name=job_name, - args=cls.format_args(SUBCOMMAND.VALIDATE, args), - func=_main, - **job_options - ) - pipeline.validations.append(validate_job) - validate_jobs.append(validate_job) - else: - args['inputs'] = os.path.join( - cluster_output, - '{}-${}.tab'.format(pipeline.batch_id, scheduler.ENV_TASK_IDENT), - ) - args['output'] = os.path.join( - base, - SUBCOMMAND.VALIDATE, - '{}-${}'.format(pipeline.batch_id, scheduler.ENV_TASK_IDENT), - ) - aligner_path = shutil.which(args['aligner'].split(' ')[0]) - job_class = ArrayJob if scheduler.NAME != SCHEDULER.TORQUE else TorqueArrayJob - validate_job = job_class( - stage=SUBCOMMAND.VALIDATE, - task_list=len(clustered_files), - output_dir=os.path.join( - base, SUBCOMMAND.VALIDATE, '{}-{{task_ident}}'.format(pipeline.batch_id) - ), - script=script_name, - name='MV_{}_{}'.format(libconf.library, pipeline.batch_id), - **job_options - ) - pipeline.write_submission_script( - SUBCOMMAND.VALIDATE, validate_job, args, aligner_path=aligner_path - ) - pipeline.validations.append(validate_job) - validate_jobs.append(validate_job) - - # make an annotation job for each validation/cluster job/file - mkdirp(os.path.join(base, SUBCOMMAND.ANNOTATE)) - for task_ident in range(1, len(clustered_files) + 1): - mkdirp( - os.path.join( - base, SUBCOMMAND.ANNOTATE, '{}-{}'.format(pipeline.batch_id, task_ident) - ) - ) - args = annotate_args(config, libconf) - - script_name = os.path.join(base, SUBCOMMAND.ANNOTATE, 'submit.sh') - job_options = {k: v for k, v in config.schedule.items() if k in STD_OPTIONS} - job_options['memory_limit'] = config.schedule.annotation_memory - - if isinstance(scheduler, LocalScheduler): - job_options['annotations'] = args['annotations'] - job_options['reference_genome'] = args['reference_genome'] - if args['template_metadata']: - job_options['template_metadata'] = args['template_metadata'] - for task_ident in range(1, len(clustered_files) + 1): - args['output'] = os.path.join( - base, 
SUBCOMMAND.ANNOTATE, '{}-{}'.format(pipeline.batch_id, task_ident) - ) - # annotate 'clustered' files if the pipeline does not include the validation step - if SUBCOMMAND.VALIDATE not in config.skip_stage: - args['inputs'] = [ - os.path.join( - base, - SUBCOMMAND.VALIDATE, - '{}-{}'.format(pipeline.batch_id, task_ident), - _VALIDATE.PASS_FILENAME, - ) - ] - else: - args['inputs'] = [ - os.path.join( - cluster_output, '{}-{}.tab'.format(pipeline.batch_id, task_ident) - ) - ] - job_name = 'MA_{}_{}-{}'.format(libconf.library, pipeline.batch_id, task_ident) - args['log'] = os.path.join(args['output'], 'job-{name}-{job_ident}.log') - annotate_job = LocalJob( - stage=SUBCOMMAND.ANNOTATE, - script=script_name, - name=job_name, - stdout=args['log'], - output_dir=args['output'], - args=cls.format_args(SUBCOMMAND.ANNOTATE, args), - func=_main, - **job_options - ) - pipeline.annotations.append(annotate_job) - annotation_output_files.append( - os.path.join(args['output'], _ANNOTATE.PASS_FILENAME) - ) - if validate_jobs: - annotate_job.dependencies.append(validate_jobs[task_ident - 1]) - else: - args['output'] = os.path.join( - base, - SUBCOMMAND.ANNOTATE, - '{}-${}'.format(pipeline.batch_id, scheduler.ENV_TASK_IDENT), - ) - # annotate 'clustered' files if the pipeline does not include the validation step - if SUBCOMMAND.VALIDATE not in config.skip_stage: - args['inputs'] = [ - os.path.join( - base, - SUBCOMMAND.VALIDATE, - '{}-${}'.format(pipeline.batch_id, scheduler.ENV_TASK_IDENT), - _VALIDATE.PASS_FILENAME, - ) - ] - else: - args['inputs'] = [ - os.path.join( - cluster_output, - '{}-${}.tab'.format(pipeline.batch_id, scheduler.ENV_TASK_IDENT), - ) - ] - - job_class = ArrayJob if scheduler.NAME != SCHEDULER.TORQUE else TorqueArrayJob - annotate_job = job_class( - stage=SUBCOMMAND.ANNOTATE, - task_list=len(clustered_files), - script=script_name, - name='MA_{}_{}'.format(libconf.library, pipeline.batch_id), - output_dir=os.path.join( - base, SUBCOMMAND.ANNOTATE, 
'{}-{{task_ident}}'.format(pipeline.batch_id) - ), - **job_options - ) - pipeline.write_submission_script(SUBCOMMAND.ANNOTATE, annotate_job, args) - pipeline.annotations.append(annotate_job) - if validate_jobs: - annotate_job.dependencies.extend(validate_jobs) - - # add the expected output file names for input to pairing - for taskid in range(1, len(clustered_files) + 1): - fname = os.path.join(args['output'], _ANNOTATE.PASS_FILENAME) - fname = re.sub(r'\${}'.format(scheduler.ENV_TASK_IDENT), str(taskid), fname) - annotation_output_files.append(fname) - - # set up the pairing job - args = {} - args.update(config.pairing.items()) - args['output'] = os.path.join(config.output, SUBCOMMAND.PAIR) - args['annotations'] = config.reference.annotations - mkdirp(args['output']) - args['inputs'] = annotation_output_files - job_name = 'MP_{}'.format(pipeline.batch_id) - - script_name = os.path.join(config.output, SUBCOMMAND.PAIR, 'submit.sh') - - if isinstance(scheduler, LocalScheduler): - args['log'] = os.path.join(args['output'], 'job-{name}-{job_ident}.log') - pipeline.pairing = LocalJob( - stage=SUBCOMMAND.PAIR, - script=script_name, - output_dir=args['output'], - stdout=args['log'], - name=job_name, - dependencies=pipeline.annotations, - args=cls.format_args(SUBCOMMAND.PAIR, args), - func=_main, - **{k: v for k, v in config.schedule.items() if k in STD_OPTIONS} - ) - else: - pipeline.pairing = Job( - SUBCOMMAND.PAIR, - script=script_name, - output_dir=args['output'], - name=job_name, - dependencies=pipeline.annotations, - **{k: v for k, v in config.schedule.items() if k in STD_OPTIONS} - ) - pipeline.write_submission_script(SUBCOMMAND.PAIR, pipeline.pairing, args) - # set up the summary job - args = summary_args(config) - args['output'] = os.path.join(config.output, SUBCOMMAND.SUMMARY) - mkdirp(args['output']) - args['inputs'] = [os.path.join(config.output, SUBCOMMAND.PAIR, 'mavis_paired*.tab')] - script_name = os.path.join(args['output'], 'submit.sh') - job_name = 
'MS_{}'.format(pipeline.batch_id) - if isinstance(scheduler, LocalScheduler): - args['log'] = os.path.join(args['output'], 'job-{name}-{job_ident}.log') - pipeline.summary = LocalJob( - stage=SUBCOMMAND.SUMMARY, - name=job_name, - output_dir=args['output'], - stdout=args['log'], - script=script_name, - dependencies=[pipeline.pairing], - args=cls.format_args(SUBCOMMAND.SUMMARY, args), - func=_main, - **{k: v for k, v in config.schedule.items() if k in STD_OPTIONS} - ) - else: - pipeline.summary = Job( - stage=SUBCOMMAND.SUMMARY, - name=job_name, - output_dir=args['output'], - script=script_name, - dependencies=[pipeline.pairing], - **{k: v for k, v in config.schedule.items() if k in STD_OPTIONS} - ) - pipeline.write_submission_script(SUBCOMMAND.SUMMARY, pipeline.summary, args) - return pipeline - - def _resubmit_job(self, job): - """ - Given a failed job, cancel it and all of its dependencies and then resubmit them - """ - # resubmit the job or all failed tasks for the job. Update any dependencies - failed_tasks = set() - try: - for task in job.task_list: - if task.status in self.ERROR_STATES: - failed_tasks.add(task.task_ident) - if len(failed_tasks) == len(job.task_list): - failed_tasks = [] - except AttributeError: # non-array jobs - pass - # SGE cannot submit a task list that is non-consecutive so we will cancel the entire array - if self.scheduler.NAME == SCHEDULER.SGE and len(consecutive_ranges(failed_tasks)) != 1: - failed_tasks = set() - - if failed_tasks: # resubmit failed tasks only and create a new job - new_job = job.copy_with_tasks(failed_tasks) - for task_ident in failed_tasks: - # cancel and remove the failed task - try: - self.scheduler.cancel(job, task_ident=task_ident) - job.remove_task(task_ident) - except subprocess.CalledProcessError: # ignore cancelling errors - pass - self.scheduler.submit(new_job) - else: - # 'clean' the current job so that it is no longer 'submitted' - self.scheduler.cancel(job) - job.reset() - try: - for task in 
job.task_list: - task.status = JOB_STATUS.NOT_SUBMITTED - task.status_comment = '' - except AttributeError: - pass - self.scheduler.submit(job) - new_job = job - - if new_job.stage == SUBCOMMAND.VALIDATE: - if new_job not in self.validations: - self.validations.append(new_job) - # cancel and resubmit annotate, pairing and summary jobs - new_annotations = [] - for ajob in self.annotations: - if ajob.dependencies == [job] and failed_tasks: # only dependent on this job - try: - new_ajob = ajob.copy_with_tasks(failed_tasks) - new_annotations.append(new_ajob) - for task in failed_tasks: - self.scheduler.cancel(ajob, task_ident=task) - ajob.remove_task(task) - new_ajob.dependencies = [new_job] - self.scheduler.submit(new_ajob) - except AttributeError: - self.scheduler.cancel(ajob) - ajob.reset() - if new_job not in ajob.dependencies: - ajob.dependencies.append(new_job) - self.scheduler.submit(ajob) - elif job in ajob.dependencies: - # dependent on multiple jobs - self.scheduler.cancel(ajob) - ajob.reset() - if new_job not in ajob.dependencies: - ajob.dependencies.append(new_job) - self.scheduler.submit(ajob) - # ignore annotation jobs not related to the failed validation job - self.annotations.extend(new_annotations) - elif new_job.stage == SUBCOMMAND.ANNOTATE: - if new_job not in self.annotations: - self.annotations.append(new_job) - - if new_job.stage in {SUBCOMMAND.VALIDATE, SUBCOMMAND.ANNOTATE}: - # cancel pairing - self.scheduler.cancel(self.pairing) - self.pairing.reset() - self.pairing.dependencies = self.annotations[:] - - # all resubmissions result in cancelling summary - self.scheduler.cancel(self.summary) - self.summary.reset() - - def _job_status(self, job, submit=False, resubmit=False, log=DEVNULL): - """ - report information regarding a particular job status - """ - run_time = -1 - if not job.job_ident and (submit or resubmit): - self.scheduler.submit(job) - elif job.job_ident and resubmit and job.status in self.ERROR_STATES: - self._resubmit_job(job) - if 
job.job_ident: - log('{} ({}) is {}'.format(job.name, job.job_ident, job.status)) - else: - log('{} is {}'.format(job.name, job.status)) - if job.status == JOB_STATUS.COMPLETED: - if isinstance(job, ArrayJob): - for task in job.task_list: - if not os.path.exists(task.complete_stamp()): - log('complete stamp is expected but does not exist', indent_level=1) - log(task.complete_stamp(), indent_level=2) - else: - run_time = max(run_time, parse_run_time(task.complete_stamp())) - elif not os.path.exists(job.complete_stamp()): - with log.indent() as log: - log('complete stamp is expected but does not exist') - log(job.complete_stamp()) - else: - run_time = max(run_time, parse_run_time(job.complete_stamp())) - if run_time >= 0: - if isinstance(job, ArrayJob): - log( - '{} {} COMPLETED'.format( - job.tasks, 'task is' if job.tasks == 1 else 'tasks are' - ), - indent_level=1, - ) - log('run time: {}'.format(run_time), indent_level=1) - else: - if isinstance(job, ArrayJob): - tasks_by_status = {} - for task in job.task_list: - tasks_by_status.setdefault(task.status, []).append(task) - for status, tasks in tasks_by_status.items(): - comments = set([t.status_comment for t in tasks if t.status_comment]) - context = 'tasks are' if len(tasks) != 1 else 'task is' - LOG('{} {} {}'.format(len(tasks), context, status), indent_level=2) - for comment in comments: - LOG('comment:', comment, indent_level=3) - elif job.status not in { - JOB_STATUS.PENDING, - JOB_STATUS.NOT_SUBMITTED, - JOB_STATUS.SUBMITTED, - }: - try: - content = LogFile.parse(job.logfile()) - log('{}: {}'.format(content.status, content.message), indent_level=1) - except FileNotFoundError: - log('missing log file:', job.logfile(), indent_level=1) - - return run_time - - def check_status(self, submit=False, resubmit=False, log=DEVNULL): - """ - Check all jobs for completetion. Report any failures, etc. 
- - Args: - submit (bool): submit any pending jobs - """ - # update the information for all jobs where possible - run_times = [[], [], [], []] - jobs_not_complete = 0 - jobs_with_errors = 0 - - for job in self.validations + self.annotations + [self.pairing, self.summary]: - self.scheduler.update_info(job) - log('validate', time_stamp=True) - for job in self.validations: - run_time = self._job_status(job, submit=submit, resubmit=resubmit, log=log.indent()) - if job.status == JOB_STATUS.COMPLETED: - if run_time >= 0: - run_times[0].append(run_time) - self.scheduler.wait() - - log('annotate', time_stamp=True) - if ( - not all([job.status == JOB_STATUS.COMPLETED for job in self.validations]) - and self.scheduler.NAME == 'LOCAL' - and (submit or resubmit) - ): - log('Stopping submission. Dependencies not complete', indent_level=1) - submit = False - resubmit = False - - for job in self.annotations: - self._job_status(job, submit=submit, resubmit=resubmit, log=log.indent()) - if job.status == JOB_STATUS.COMPLETED: - if run_time >= 0: - run_times[1].append(run_time) - self.scheduler.wait() - - log('pairing', time_stamp=True) - if ( - not all([job.status == JOB_STATUS.COMPLETED for job in self.annotations]) - and self.scheduler.NAME == 'LOCAL' - and (submit or resubmit) - ): - log('Stopping submission. Dependencies not complete', indent_level=1) - submit = False - resubmit = False - - run_time = self._job_status( - self.pairing, submit=submit, resubmit=resubmit, log=log.indent() - ) - if self.pairing.status == JOB_STATUS.COMPLETED: - if run_time >= 0: - run_times[2].append(run_time) - self.scheduler.wait() - - log('summary', time_stamp=True) - if ( - self.pairing.status != JOB_STATUS.COMPLETED - and self.scheduler.NAME == 'LOCAL' - and (submit or resubmit) - ): - log('Stopping submission. 
Dependencies not complete', indent_level=1) - submit = False - resubmit = False - - run_time = self._job_status( - self.summary, submit=submit, resubmit=resubmit, log=log.indent() - ) - if self.summary.status == JOB_STATUS.COMPLETED: - if run_time >= 0: - run_times[3].append(run_time) - self.scheduler.wait() - - for job in self.validations + self.annotations + [self.pairing, self.summary]: - if submit or resubmit and job.status != JOB_STATUS.COMPLETED: - self.scheduler.update_info(job) - if job.status in self.ERROR_STATES: - jobs_with_errors += 1 - elif job.status != JOB_STATUS.COMPLETED: - jobs_not_complete += 1 - - if jobs_not_complete + jobs_with_errors == 0: - if all([r for r in run_times]): - log('parallel run time:', sum([max(r) for r in run_times])) - return EXIT_OK - elif not jobs_with_errors: - return EXIT_INCOMPLETE - else: - return EXIT_ERROR - - @classmethod - def read_build_file(cls, filepath): - """ - read the configuration file which stored the build information concerning jobs and dependencies - - Args: - filepath (str): path to the input config file - """ - from ..main import main as _main - - if not os.path.exists(filepath): - raise FileNotFoundError('File does not exist: {}'.format(filepath)) - parser = ConfigParser(interpolation=ExtendedInterpolation()) - parser.read(filepath) - cast = {'None': None, 'False': False, 'True': True} - - pipeline = cls( - output_dir=parser['general']['output_dir'], - scheduler=SCHEDULERS_BY_NAME[parser['general']['scheduler']]( - concurrency_limit=parser['general']['concurrency_limit'] - if 'concurrency_limit' in parser['general'] - else OPTIONS.concurrency_limit, - remote_head_ssh=parser['general']['remote_head_ssh'] - if 'remote_head_ssh' in parser['general'] - else OPTIONS.remote_head_ssh, - ), - batch_id=parser['general']['batch_id'], - ) - - jobs = {} - for sec in parser.sections(): - if sec != 'general': - section = {} - for attr, value in parser[sec].items(): - if attr in ['dependencies', 'inputs', 'outputs', 
'args', 'task_list'] and value: - section[attr] = [s.strip() for s in re.split(r'\n', value)] - elif value == 'None': - section[attr] = None - elif value in cast: - value = cast[value] - else: - section[attr] = value - if pipeline.scheduler.NAME == SCHEDULER.LOCAL: - jobs[sec] = LocalJob(func=_main, **section) - elif 'task_list' in section: - if pipeline.scheduler.NAME == SCHEDULER.TORQUE: - jobs[sec] = TorqueArrayJob(**section) - else: - jobs[sec] = ArrayJob(**section) - else: - jobs[sec] = Job(**section) - - for job in jobs.values(): - for i, prior_job_name in enumerate(job.dependencies): - job.dependencies[i] = jobs[prior_job_name] - - for job in jobs.values(): - if job.stage == SUBCOMMAND.VALIDATE: - pipeline.validations.append(job) - elif job.stage == SUBCOMMAND.ANNOTATE: - pipeline.annotations.append(job) - elif job.stage == SUBCOMMAND.PAIR: - if pipeline.pairing: - raise ValueError('mavis pipeline expects a single pairing job') - pipeline.pairing = job - elif job.stage == SUBCOMMAND.SUMMARY: - if pipeline.summary: - raise ValueError('mavis pipeline expects a single summary job') - pipeline.summary = job - elif job.stage == SUBCOMMAND.CHECKER: - if pipeline.checker: - raise ValueError('mavis pipeline expects a single checker job') - pipeline.checker = job - else: - raise NotImplementedError( - 'unexpected job stage for MAVIS pipeline: {}'.format(job.stage), job - ) - - return pipeline - - def write_build_file(self, filename): - """ - write the build.cfg file for the current pipeline. This is the file used in re-loading the pipeline - to check the status and report failures, etc. later. 
- - Args: - filename (str): path to the output config file - """ - parser = ConfigParser(interpolation=ExtendedInterpolation()) - parser['general'] = { - 'batch_id': self.batch_id, - 'output_dir': self.output_dir, - 'scheduler': self.scheduler.NAME, - 'remote_head_ssh': self.scheduler.remote_head_ssh, - 'concurrency_limit': str(self.scheduler.concurrency_limit), - } - - for job in [self.summary, self.pairing] + self.validations + self.annotations: - parser[job.display_name] = {k: re.sub(r'\$', '$$', v) for k, v in job.flatten().items()} - - with open(filename, 'w') as configfile: - parser.write(configfile) diff --git a/mavis/schedule/scheduler.py b/mavis/schedule/scheduler.py deleted file mode 100644 index 58c8c2a5..00000000 --- a/mavis/schedule/scheduler.py +++ /dev/null @@ -1,942 +0,0 @@ -from datetime import timedelta -import subprocess -import re -import logging -import socket - -from ..util import LOG -from ..config import NullableType - -from .job import ArrayJob -from .constants import SCHEDULER, JOB_STATUS, cumulative_job_state, MAIL_TYPE - - -def time_format(total_seconds): - """ - Converts a total seconds to a str format "H:M:S" - """ - hours, remainder = divmod(total_seconds, 60 * 60) - minutes, seconds = divmod(remainder, 60) - return "{}:{:02d}:{:02d}".format(hours, minutes, seconds) - - -def consecutive_ranges(numbers): - """ - Given a list of integers, return a list of ranges - - Example: - >>> consecutive_ranges([1, 2, 3, 4, 5, 9, 10, 14, 18]) - [(1, 5), (9, 10), (14, 14), (18, 18)] - """ - ranges = [] - for number in sorted(set(numbers)): - if not ranges or ranges[-1][1] + 1 != number: - ranges.append((number, number)) - else: - ranges[-1] = ranges[-1][0], number - return ranges - - -class Scheduler: # pragma: no cover - """ - Class responsible for methods interacting with the scheduler - """ - - ENV_TASK_IDENT = '{TASK_IDENT}' - """str: the expected pattern of environment variables which store the task id""" - ENV_JOB_IDENT = '{JOB_IDENT}' - 
"""str: the expected pattern of environment variables which store the job id""" - HEADER_PREFIX = '#' - - def __init__(self, concurrency_limit=None, remote_head_ssh=''): - """ - Args: - concurrency_limit (int): the maximum allowed concurrent processes. Defaults to one less than the total number available - """ - self.concurrency_limit = NullableType(int)(concurrency_limit) - self.remote_head_ssh = remote_head_ssh - - def command(self, command, shell=False): - """ - Wrapper to deal with subprocess commands. If configured and not on the head node currently, will send the command through ssh - - Args: - command (list or str): the command can be a list or a string and is passed to the subprocess to be run - - Returns: - str: the content returns from stdout of the subprocess - """ - if self.remote_head_ssh and self.remote_head_ssh != socket.gethostname(): - # ssh to remote head and run the command there - if not isinstance(command, str): - command = ' '.join(command) - return ( - subprocess.check_output(['ssh', str(self.remote_head_ssh), command]) - .decode('utf8') - .strip() - ) - return subprocess.check_output(command, shell=shell).decode('utf8').strip() - - def wait(self): - pass - - def submit(self, job): - """ - submit a job to the scheduler - """ - raise NotImplementedError('abstract method') - - def update_info(self, job): - """ - update the information about the job from the scheduler - """ - raise NotImplementedError('abstract method') - - def cancel(self, job, task_ident=None): - raise NotImplementedError('abstract method') - - def format_dependencies(self, job): - """ - returns a string representing the dependency argument - """ - raise NotImplementedError('abstract method') - - -class SlurmScheduler(Scheduler): - """ - Class for formatting commands to match a SLURM scheduler system - SLURM docs can be found here https://slurm.schedmd.com - """ - - NAME = SCHEDULER.SLURM - """:attr:`mavis.schedule.constants.SCHEDULER`: the type of scheduler""" - - 
ENV_TASK_IDENT = 'SLURM_ARRAY_TASK_ID' - ENV_JOB_IDENT = 'SLURM_JOB_ID' - ENV_ARRAY_IDENT = 'SLURM_ARRAY_JOB_ID' - - def submit(self, job): - """ - runs a subprocess sbatch command - - Args: - job (Job): the job to be submitted - """ - command = ['sbatch'] - if job.job_ident: - raise ValueError('Job has already been submitted and has the job number', job.job_ident) - if job.queue: - command.append('--partition={}'.format(job.queue)) - if job.memory_limit: - command.extend(['--mem', str(job.memory_limit) + 'M']) - if job.time_limit: - command.extend(['-t', time_format(job.time_limit)]) - if job.import_env: - command.append('--export=ALL') - if job.dependencies: - command.append(self.format_dependencies(job)) - if job.name: - command.extend(['-J', job.name]) - if job.stdout: - command.extend( - [ - '-o', - job.stdout.format( - name='%x', - job_ident='%A' if isinstance(job, ArrayJob) else '%j', - task_ident='%a', - ), - ] - ) - if job.mail_type and job.mail_user: - command.append('--mail-type={}'.format(job.mail_type)) - command.append('--mail-user={}'.format(job.mail_user)) - # options specific to job arrays - if isinstance(job, ArrayJob): - concurrency_limit = ( - '' if self.concurrency_limit is None else '%{}'.format(self.concurrency_limit) - ) - task_ranges = [ - '{}{}'.format(s, '-{}'.format(t) if s != t else '') - for s, t in consecutive_ranges([task.task_ident for task in job.task_list]) - ] - command.append('--array={}{}'.format(','.join(task_ranges), concurrency_limit)) - - command.append(job.script) - LOG('submitting', job.name) - content = self.command(command) - - match = re.match(r'^submitted batch job (\d+)$', content, re.IGNORECASE) - if not match: - raise NotImplementedError( - 'Error in retrieving the submitted job number. 
Did not match the expected pattern', - content, - ) - job.job_ident = match.group(1) - job.status = JOB_STATUS.SUBMITTED - - try: - for task in job.task_list: - task.status = job.status - task.status_comment = job.status_comment - except AttributeError: - pass - - @classmethod - def parse_sacct(cls, content): - """ - parses content returned from the sacct command - - Args: - content (str): the content returned from the sacct command - """ - lines = content.strip().split('\n') - header = lines[0].split('|') - rows = [] - for line in lines[1:]: - row = {col: val for col, val in zip(header, line.split('|'))} - rows.append(row) - # now combine the .batch split jobs - results = {} - for row in rows: - jobid = re.sub(r'\.batch$', '', row['JobID']) - if row['JobName'] != 'batch': - results[jobid] = row - for row in rows: - jobid = re.sub(r'\.batch$', '', row['JobID']) - if row['JobName'] == 'batch' and jobid in results: - results[jobid].update( - {k: v for k, v in row.items() if k not in ['JobName', 'JobID']} - ) - rows = [] - for row in results.values(): - row['State'] = row['State'].split(' ')[0] - task_ident = None - if re.match(r'^\d+_\d+$', row['JobID']): - job_ident, task_ident = row['JobID'].rsplit('_', 1) - task_ident = int(task_ident) - elif re.match(r'^(\d+)_\[\d+(-\d+)?\]$', row['JobID']): - job_ident = row['JobID'].split('_', 1)[0] - else: - job_ident = row['JobID'] - rows.append( - { - 'job_ident': job_ident, - 'task_ident': task_ident, - 'name': row['JobName'], - 'status': row['State'], - 'status_comment': '', - } - ) - - return rows - - @classmethod - def parse_scontrol_show(cls, content): - """ - parse the content from the command: scontrol show job - - Args: - content (str): the content to be parsed - """ - rows = [] - for job_content in re.split(r'\n\s*\n', content): - job_content = job_content.strip() - if not job_content: # ignore empty - continue - row = {} - for pair in re.split(r'\s+', job_content): - if '=' not in pair: - continue - col, val = 
pair.split('=', 1) - row[col] = val - try: - task_ident = int(row.get('ArrayTaskId', '')) - except ValueError: - task_ident = None - rows.append( - { - 'job_ident': row['JobId'], - 'status': row['JobState'], - 'name': row['JobName'], - 'status_comment': row['Reason'] if row['Reason'].lower() != 'none' else '', - 'task_ident': task_ident, - } - ) - return rows - - def update_info(self, job): - """ - Pull job information about status etc from the scheduler. Updates the input job - - Args: - job (Job): the job to be updated - """ - if not job.job_ident: - return - command = ['sacct', '-j', job.job_ident, '--long', '--parsable2'] - content = self.command(command) - rows = self.parse_sacct(content) - updated = False - updated_tasks = set() - - for row in rows: - if row['job_ident'] == job.job_ident: - if row['task_ident'] is not None: - if job.has_task(row['task_ident']): - task = job.get_task(row['task_ident']) - task.status = row['status'] - task.status_comment = row['status_comment'] - updated_tasks.add(task.task_ident) - else: - job.status = row['status'] - job.status_comment = row['status_comment'] - updated = True - try: - if not updated: - job.status = cumulative_job_state([t.status for t in job.task_list]) - else: - for task in job.task_list: - if task.task_ident not in updated_tasks: - task.status = job.status - except AttributeError: - pass - - def cancel(self, job, task_ident=None): - """ - cancel a job - - Args: - job (Job): the job to be cancelled - task_ident (int): the task id to be cancelled (instead of the entire array) - """ - if not job.job_ident: - return - if task_ident is not None: - self.command(['scancel', '{}_{}'.format(job.job_ident, task_ident)]) - job.get_task(task_ident).status = JOB_STATUS.CANCELLED - LOG('cancelled task', job.name, job.job_ident, task_ident) - else: - self.command(['scancel', job.job_ident]) - job.status = JOB_STATUS.CANCELLED - LOG('cancelled job', job.name, job.job_ident) - - try: - for task in job.task_list: - 
task.status = JOB_STATUS.CANCELLED - except AttributeError: - pass - - def format_dependencies(self, job): - """ - returns a string representing the dependency argument - - Args: - job (Job): the job the argument is being built for - """ - try: - if len(job.dependencies) == 1 and job.tasks == job.dependencies[0].tasks: - # array job dependent on only another array job with the same number of tasks - dependency = job.dependencies[0] - if not dependency.job_ident: - raise ValueError( - 'The dependencies must be submitted before the dependent job', - job, - dependency, - ) - return '--dependency=aftercorr:{}'.format(dependency.job_ident) - except AttributeError: - pass - - dep_jobs = [] - for dependency in job.dependencies: - if not dependency.job_ident: - raise ValueError( - 'The dependencies must be submitted before the dependent job', job, dependency - ) - try: - for task in dependency.task_list: - dep_jobs.append('{}_{}'.format(dependency.job_ident, task.task_ident)) - except AttributeError: - dep_jobs.append(str(dependency.job_ident)) - - return '--dependency=afterok:{}'.format(':'.join(dep_jobs)) - - -class SgeScheduler(Scheduler): - """ - Class for managing interactions with the SGE scheduler - """ - - NAME = SCHEDULER.SGE - """:attr:`mavis.schedule.constants.SCHEDULER`: the type of scheduler""" - ENV_TASK_IDENT = 'SGE_TASK_ID' - ENV_JOB_IDENT = 'JOB_ID' - """str: expected pattern for environment variables which store the job id""" - ENV_ARRAY_IDENT = ENV_JOB_IDENT - ENV_JOB_NAME = 'JOB_NAME' - """str: expected pattern for environment variables which store the job name""" - HEADER_PREFIX = '#$' - - STATE_MAPPING = { - 'q': JOB_STATUS.PENDING, - 'h': JOB_STATUS.PENDING, - 'R': JOB_STATUS.RUNNING, - 'r': JOB_STATUS.RUNNING, - 'd': JOB_STATUS.CANCELLED, - 's': JOB_STATUS.ERROR, - 'w': JOB_STATUS.PENDING, - 'E': JOB_STATUS.ERROR, - 'T': JOB_STATUS.ERROR, - 't': JOB_STATUS.RUNNING, - } - """dict: mapping from SGE job states to their MAVIS JOB_STATUS equivalent""" - 
MAIL_TYPE_MAPPING = { - MAIL_TYPE.BEGIN: 'b', - MAIL_TYPE.NONE: 'n', - MAIL_TYPE.FAIL: 'as', - MAIL_TYPE.END: 'e', - MAIL_TYPE.ALL: 'abes', - } - """dict: mapping from MAVIS mail type options to SGE mail options""" - - @classmethod - def parse_qacct(cls, content): - """ - parses the information produced by qacct - - Args: - content (str): the content returned from the qacct command - - Raises - ValueError: when no job information is reported (this may happen due to a bad or too old job ID where information is no longer stored) - """ - if re.match(r'^\s*Total System Usage.*', content): - raise ValueError('Job information not found') - rows = [] - for section in re.split(r'=+\n', content)[1:]: # initial item will be empty - row = {} - for line in section.split('\n'): - if re.match(r'^[\s=]*$', line): - continue - col, val = re.split(r'\s+', line, 1) - val = val.strip() - if val == 'undefined': - val = None - row[col] = val - - if row['exit_status'] == '0' and row['failed'] == '0': - status = JOB_STATUS.COMPLETED - elif '(Killed)' in row['exit_status']: - status = JOB_STATUS.CANCELLED - else: - status = JOB_STATUS.FAILED - if ':' in row['failed']: - status_comment = row['failed'].split(':', 1)[1].strip() - else: - status_comment = '' - rows.append( - { - 'name': row['jobname'], - 'job_ident': row['jobnumber'], - 'task_ident': row['taskid'], - 'status': status, - 'status_comment': status_comment, - } - ) - return rows - - @classmethod - def parse_qstat(cls, content, job_id): - """ - parses the qstat content into rows/dicts representing individual jobs - - Args: - content (str): content returned from the qstat command - """ - header = [ - 'job-ID', - 'prior', - 'name', - 'user', - 'state', - 'submit/start at', - 'queue', - 'slots', - 'ja-task-ID', - ] - content = content.strip() - if not content: - return [] - lines = [line for line in content.split('\n') if line.strip()] - column_sizes = [] - for col in header: - match = re.search(col + r'\s*', lines[0]) - if not 
match: - raise ValueError( - 'Error in parsing the qstat content for the column from', col, lines[0] - ) - column_sizes.append(len(match.group(0))) - rows = [] - - for line in lines[1:]: - if re.match(r'^[\-]+$', line): - continue # ignore dashed separators - row = {} - pos = 0 - for col, size in zip(header, column_sizes): - row[col] = line[pos : pos + size].strip() - pos += size - task_ident = row['ja-task-ID'] - if not task_ident or set(task_ident) & set(',:-'): - task_ident = None - if row['job-ID'] == job_id: - rows.append( - { - 'task_ident': task_ident, - 'job_ident': row['job-ID'], - 'name': row['name'], - 'status': cls.convert_state(row['state']), - 'status_comment': '', - } - ) - return rows - - @classmethod - def convert_state(cls, state): - states = set() - for char in state: - states.add(cls.STATE_MAPPING[char]) - return cumulative_job_state(states) - - def submit(self, job): - """ - runs a subprocess sbatch command - - Args: - job (Job): the job to be submitted - """ - command = ['qsub', '-j', 'y'] # always join output - if job.job_ident: - raise ValueError('Job has already been submitted and has the job number', job.job_ident) - if job.queue: - command.extend(['-q', job.queue]) - if job.memory_limit: - command.extend( - ['-l', 'mem_free={0}M,mem_token={0}M,h_vmem={0}M'.format(job.memory_limit)] - ) - if job.time_limit: - command.extend(['-l', 'h_rt={}'.format(time_format(job.time_limit))]) - if job.import_env: - command.append('-V') - if job.dependencies: - command.append(self.format_dependencies(job)) - if job.name: - command.extend(['-N', job.name]) - if job.mail_type and job.mail_user: - command.extend(['-m', self.MAIL_TYPE_MAPPING[job.mail_type]]) - command.extend(['-M', job.mail_user]) - # options specific to job arrays - if isinstance(job, ArrayJob): - task_ranges = consecutive_ranges([t.task_ident for t in job.task_list]) - if len(task_ranges) != 1: - raise ValueError( - 'SGE does not support array jobs with non-consecutive task ranges', 
task_ranges - ) - command.extend(['-t', '{}-{}'.format(*task_ranges[0])]) - if job.stdout: - command.extend( - [ - '-o', - job.stdout.format( - name='\\${}'.format(self.ENV_JOB_NAME), - job_ident='\\${}'.format(self.ENV_JOB_IDENT), - task_ident='\\$TASK_ID', - ), - ] - ) - - command.append(job.script) - command = ' '.join(command) - LOG(command, level=logging.DEBUG) - LOG('submitting', job.name) - content = self.command(command, shell=True) - - # example: Your job-array 3760559.1-1:1 ("MV_mock-A36971_batch-E6aEZJnTQAau598tcsMjAE") has been submitted - # example: Your job 3766949 ("MP_batch-TvkFvM52v3ncuNQZb2M9TD") has been submitted - match = re.match( - r'^Your job(-array)? (\d+)(\.\d+-\d+:1)? .* has been submitted$', content, re.IGNORECASE - ) - if not match: - raise NotImplementedError( - 'Error in retrieving the submitted job number. Did not match the expected pattern', - content, - ) - job.job_ident = match.group(2) - job.status = JOB_STATUS.SUBMITTED - - try: - for task in job.task_list: - task.status = job.status - task.status_comment = job.status_comment - except AttributeError: - pass - - def update_info(self, job): - """ - runs a subprocess scontrol command to get job details and add them to the current job - - Args: - job (Job): the job information is being gathered for - - Raises - ValueError: if the job information could not be retrieved - """ - if not job.job_ident: - return - try: - content = self.command(['qstat', '-u', "*"]) - rows = self.parse_qstat(content, job.job_ident) - except subprocess.CalledProcessError: # job not queued - rows = [] - - updated = False - if not rows: - # job no longer scheduled - command = ['qacct', '-j', job.job_ident] - content = self.command(command) - rows = self.parse_qacct(content) - # job is still on the scheduler - for row in rows: - if row['job_ident'] != job.job_ident: - continue - try: - if row['task_ident'] and not job.has_task(row['task_ident']): - continue - except AttributeError: - pass - if 
row['task_ident']: - task_ident = int(row['task_ident']) - task = job.get_task(task_ident) - task.status = row['status'] - task.status_comment = row['status_comment'].strip() - else: - job.status = row['status'] - job.status_comment = row['status_comment'].strip() - updated = True - - try: - if not updated: - job.status = cumulative_job_state([task.status for task in job.task_list]) - except AttributeError: - pass # only applies to array jobs - - def cancel(self, job, task_ident=None): - """ - cancel a job or a specific task of an array job - - Args: - job (Job): the job to cancel - task_ident (int): if specified, will cancel the given task instead of the whole array or job - """ - if not job.job_ident: - return - try: - if task_ident is not None: - self.command(['qdel', job.job_ident, '-t', str(task_ident)]) - job.get_task(int(task_ident)).status = JOB_STATUS.CANCELLED - LOG('cancelled task', job.name, job.job_ident, task_ident) - else: - self.command(['qdel', job.job_ident]) - job.status = JOB_STATUS.CANCELLED - LOG('cancelled job', job.name, job.job_ident) - - try: - for task in job.task_list: - task.status = JOB_STATUS.CANCELLED - except AttributeError: - pass - except subprocess.CalledProcessError: - LOG('unable to cancel job', job.job_ident) - - def format_dependencies(self, job): - """ - returns a string representing the dependency argument - """ - # special case array dependency - try: - if len(job.dependencies) == 1 and job.tasks == job.dependencies[0].tasks: - dependency = job.dependencies[0] - if not dependency.job_ident: - raise ValueError( - 'The dependencies must be submitted before the dependent job', - job, - dependency, - ) - return '-hold_jid_ad {}'.format(dependency.job_ident) - except AttributeError: - pass - for dependency in job.dependencies: - if not dependency.job_ident: - raise ValueError( - 'The dependencies must be submitted before the dependent job', job, dependency - ) - - return '-hold_jid {}'.format(','.join([d.job_ident for d in 
job.dependencies])) - - -class TorqueScheduler(SgeScheduler): - """ - Class for managing interactions with the Torque scheduler - """ - - NAME = SCHEDULER.TORQUE - """:attr:`mavis.schedule.constants.SCHEDULER`: the type of scheduler""" - ENV_TASK_IDENT = 'PBS_ARRAYID' - ENV_JOB_IDENT = 'PBS_JOBID' - """str: expected pattern for environment variables which store the job id""" - ENV_ARRAY_IDENT = ENV_JOB_IDENT - ENV_JOB_NAME = 'PBS_JOBNAME' - """str: expected pattern for environment variables which store the job name""" - TAB_SIZE = 8 - MAIL_TYPE_MAPPING = { - MAIL_TYPE.BEGIN: 'b', - MAIL_TYPE.NONE: 'p', - MAIL_TYPE.FAIL: 'fa', - MAIL_TYPE.END: 'e', - MAIL_TYPE.ALL: 'abef', - } - """dict: mapping from MAVIS mail type options to Torque mail options""" - STATE_MAPPING = { - 'C': JOB_STATUS.COMPLETED, - 'E': JOB_STATUS.RUNNING, - 'H': JOB_STATUS.PENDING, - 'Q': JOB_STATUS.PENDING, - 'T': JOB_STATUS.RUNNING, - 'W': JOB_STATUS.PENDING, - 'S': JOB_STATUS.ERROR, - 'R': JOB_STATUS.RUNNING, - } - """dict: mapping from Torque job states to their MAVIS JOB_STATUS equivalent""" - - def format_dependencies(self, job): - """ - returns a string representing the dependency argument - """ - arr_dependencies = [] - job_dependencies = [] - - for dep in job.dependencies: - if not dep.job_ident: - raise ValueError('Dependencies must be submitted beforehand', job, dep) - - if isinstance(dep, ArrayJob): - task_ident = re.sub( - r'\[\]', '[][{}]'.format(dep.tasks) if dep.tasks > 1 else '[]', dep.job_ident - ) - arr_dependencies.append(task_ident) - else: - job_dependencies.append(dep.job_ident) - - result = [] - if arr_dependencies: - result.append('afterokarray:{}'.format(':'.join(arr_dependencies))) - if job_dependencies: - result.append('afterok:{}'.format(':'.join(job_dependencies))) - - return '-W depend={}'.format(','.join(result)) - - @classmethod - def parse_qstat(cls, content): - """ - parses the qstat content into rows/dicts representing individual jobs - - Args: - content (str): 
content returned from the qstat command - """ - content = re.sub(r'\t', ' ' * cls.TAB_SIZE, content) # PBS torque tab size is 8 - jobs = re.split(r'\s*\n\n\s*', content.strip()) - rows = [] - - for job in jobs: - if job.startswith('request_version') or not job: - continue - row = {} - lines = job.split('\n') - task_ident = None - row['Job Id'] = lines[0].split(':', 1)[1].strip() - match = re.match(r'^(\d+)\[(\d+)\](.*)$', row['Job Id']) - if match: - row['Job Id'] = '{}[]{}'.format(match.group(1), match.group(3)) - task_ident = int(match.group(2)) - tab_size = None - columns = [] - values = [] - for line in lines[1:]: - if not line.strip(): - continue - match = re.match(r'^(\s*)(\S.*)', line) - curr_tab_size = len(match.group(1)) - if tab_size is None: - tab_size = curr_tab_size - - if curr_tab_size > tab_size or '=' not in line: - if not values: - raise NotImplementedError( - 'Unexpected indentation prior to setting column', line - ) - values[-1] = values[-1] + line.strip() - elif curr_tab_size == tab_size: - col, val = line.split('=', 1) - columns.append(col.strip()) - values.append(val.strip()) - else: - raise NotImplementedError('Unexpected indentation', line) - for col, val in zip(columns, values): - row[col] = val - status = cls.STATE_MAPPING[row['job_state']] - if status == JOB_STATUS.COMPLETED: - if 'exit_status' in row: - if row['exit_status'] != '0': - status = JOB_STATUS.FAILED - else: - status = JOB_STATUS.CANCELLED - rows.append( - { - 'job_ident': row['Job Id'], - 'name': row['Job_Name'], - 'status': status, - 'task_ident': task_ident, - 'status_comment': '', - } - ) - return rows - - def submit(self, job): - """ - runs a subprocess qsub command - - Args: - job (Job): the job to be submitted - """ - command = ['qsub', '-j', 'oe'] # always join output as stdout - if job.job_ident: - raise ValueError('Job has already been submitted and has the job number', job.job_ident) - if job.queue: - command.extend(['-q', job.queue]) - if job.memory_limit: - 
command.extend(['-l', 'mem={0}mb'.format(job.memory_limit)]) - if job.time_limit: - command.extend(['-l', 'walltime={}'.format(time_format(job.time_limit))]) - if job.import_env: - command.append('-V') - if job.dependencies: - command.append(self.format_dependencies(job)) - if job.name: - command.extend(['-N', job.name]) - if job.stdout: - command.extend( - [ - '-o', - job.stdout.format( - name='${}'.format(self.ENV_JOB_NAME), - job_ident='${}'.format(self.ENV_JOB_IDENT), - task_ident='${}'.format(self.ENV_TASK_IDENT), - ), - ] - ) - if job.mail_type and job.mail_user: - command.extend(['-m', self.MAIL_TYPE_MAPPING[job.mail_type]]) - command.extend(['-M', job.mail_user]) - # options specific to job arrays - if isinstance(job, ArrayJob): - concurrency_limit = ( - '' if self.concurrency_limit is None else '%{}'.format(self.concurrency_limit) - ) - task_ranges = [ - '{}{}'.format(s, '-{}'.format(t) if s != t else '') - for s, t in consecutive_ranges([task.task_ident for task in job.task_list]) - ] - command.extend(['-t', '{}{}'.format(','.join(task_ranges), concurrency_limit)]) - - command.append(job.script) - LOG('submitting', job.name) - content = self.command(command) - - job.job_ident = content.strip() - job.status = JOB_STATUS.SUBMITTED - job.status_comment = '' - - # update task status - try: - for task in job.task_list: - task.status = job.status - task.status_comment = job.status_comment - except AttributeError: - pass - - def update_info(self, job): - """ - runs a subprocess scontrol command to get job details and add them to the current job - - Args: - job (Job): the job information is being gathered for - - Raises - ValueError: if the job information could not be retrieved - """ - if job.job_ident is None: - job.status = JOB_STATUS.NOT_SUBMITTED - return - command = ['qstat', '-f', '-t', job.job_ident] # always split into tasks - content = self.command(command) - rows = self.parse_qstat(content) - tasks_updated = False - - for row in rows: - if 
row['job_ident'] != job.job_ident: - continue - if isinstance(job, ArrayJob) and row['task_ident']: - task_ident = int(row['task_ident']) - try: - task = job.get_task(task_ident) - except KeyError: - pass - else: - task.status = row['status'] - task.status_comment = row['status_comment'] - tasks_updated = True - else: - job.status = row['status'] - job.status_comment = row['status_comment'] - - if tasks_updated: - job.status = cumulative_job_state([t.status for t in job.task_list]) - - def cancel(self, job, task_ident=None): - """ - cancel a job - - Args: - job (Job): the job to be cancelled - task_ident (int): if specified then a single task will be cancelled instead of the whole job or array - """ - if not job.job_ident: - return - try: - if task_ident is not None: - self.command(['qdel', job.job_ident, '-t', str(task_ident)]) - job.get_task(int(task_ident)).status = JOB_STATUS.CANCELLED - LOG('cancelled task', job.name, job.job_ident, task_ident) - else: - self.command(['qdel', job.job_ident]) - job.status = JOB_STATUS.CANCELLED - LOG('cancelled job', job.name, job.job_ident) - - try: - for task in job.task_list: - task.status = JOB_STATUS.CANCELLED - except AttributeError: - pass - except subprocess.CalledProcessError: - LOG('failed to cancel {}'.format(job.job_ident), level=logging.DEBUG) diff --git a/mavis/schemas/config.json b/mavis/schemas/config.json new file mode 100644 index 00000000..c50a8632 --- /dev/null +++ b/mavis/schemas/config.json @@ -0,0 +1,781 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "annotate.annotation_filters": { + "default": [ + "choose_more_annotated", + "choose_transcripts_by_priority" + ], + "description": "A comma separated list of filters to apply to putative annotations", + "items": { + "enum": [ + "choose_more_annotated", + "choose_transcripts_by_priority" + ], + "type": "string" + }, + "type": "array" + }, + "annotate.draw_fusions_only": { + "default": true, + 
"description": "Flag to indicate if events which do not produce a fusion transcript should produce illustrations", + "type": "boolean" + }, + "annotate.draw_non_synonymous_cdna_only": { + "default": true, + "description": "Flag to indicate if events which are synonymous at the cdna level should produce illustrations", + "type": "boolean" + }, + "annotate.max_orf_cap": { + "default": 3, + "description": "The maximum number of orfs to return (best putative orfs will be retained)", + "type": "integer" + }, + "annotate.min_domain_mapping_match": { + "default": 0.9, + "description": "A number between 0 and 1 representing the minimum percent match a domain must map to the fusion transcript to be displayed", + "maximum": 1, + "minimum": 0, + "type": "number" + }, + "annotate.min_orf_size": { + "default": 300, + "description": "The minimum length (in base pairs) to retain a putative open reading frame (orf)", + "type": "integer" + }, + "bam_stats.distribution_fraction": { + "default": 0.97, + "description": "the proportion of the distribution to use in computing stdev", + "maximum": 1, + "minimum": 0.01, + "type": "number" + }, + "bam_stats.sample_bin_size": { + "default": 1000, + "description": "how large to make the sample bin (in bp)", + "type": "integer" + }, + "bam_stats.sample_cap": { + "default": 1000, + "description": "maximum number of reads to collect for any given sample region", + "type": "integer" + }, + "bam_stats.sample_size": { + "default": 500, + "description": "the number of genes/bins to compute stats over", + "type": "integer" + }, + "cluster.cluster_initial_size_limit": { + "default": 25, + "description": "The maximum cumulative size of both breakpoints for breakpoint pairs to be used in the initial clustering phase (combining based on overlap)", + "type": "integer" + }, + "cluster.cluster_radius": { + "default": 100, + "description": "Maximum distance allowed between paired breakpoint pairs", + "type": "integer" + }, + "cluster.limit_to_chr": { + 
"default": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "X", + "Y" + ], + "description": "A list of chromosome names to use. breakpointpairs on other chromosomes will be filteredout. for example '1 2 3 4' would filter out events/breakpoint pairs on any chromosomes but 1, 2, 3, and 4", + "items": { + "type": "string" + }, + "type": [ + "array", + "null" + ] + }, + "cluster.max_files": { + "default": 200, + "description": "The maximum number of files to output from clustering/splitting", + "minimum": 1, + "type": "integer" + }, + "cluster.max_proximity": { + "default": 5000, + "description": "The maximum distance away from an annotation before the region in considered to be uninformative", + "type": "integer" + }, + "cluster.min_clusters_per_file": { + "default": 50, + "description": "The minimum number of breakpoint pairs to output to a file", + "minimum": 1, + "type": "integer" + }, + "cluster.split_only": { + "default": false, + "description": "just split the input files, do not merge input breakpoints into clusters", + "type": "boolean" + }, + "cluster.uninformative_filter": { + "default": false, + "description": "Flag that determines if breakpoint pairs which are not within max_proximity to any annotations are filtered out prior to clustering", + "type": "boolean" + }, + "convert": { + "additionalProperties": { + "properties": { + "assume_no_untemplated": { + "default": false, + "description": "Assume the lack of untemplated information means that there IS not untemplated sequence expected at the breakpoints", + "type": "boolean" + }, + "file_type": { + "description": "the tool the file is input from or 'mavis' for standard mavis-style tab files", + "enum": [ + "manta", + "delly", + "transabyss", + "pindel", + "chimerascan", + "mavis", + "defuse", + "breakdancer", + "vcf", + "breakseq", + "cnvnator", + "strelka", + "starfusion" + ], + 
"type": "string" + }, + "inputs": { + "items": { + "type": "string" + }, + "minItems": 1, + "type": "array" + }, + "strand_specific": { + "default": false, + "type": "boolean" + } + }, + "required": [ + "inputs", + "file_type" + ], + "type": "object" + }, + "type": "object" + }, + "illustrate.domain_color": { + "default": "#ccccb3", + "description": "Domain fill color", + "type": "string" + }, + "illustrate.domain_mismatch_color": { + "default": "#b2182b", + "description": "Domain fill color on 0%% match", + "type": "string" + }, + "illustrate.domain_name_regex_filter": { + "default": "^PF\\d+$", + "description": "The regular expression used to select domains to be displayed (filtered by name)", + "type": "string" + }, + "illustrate.domain_scaffold_color": { + "default": "#000000", + "description": "The color of the domain scaffold", + "type": "string" + }, + "illustrate.drawing_width_iter_increase": { + "default": 500, + "description": "The amount (in pixels) by which to increase the drawing width upon failure to fit", + "type": "integer" + }, + "illustrate.exon_min_focus_size": { + "default": 10, + "description": "Minimum size of an exon for it to be granted a label or min exon width", + "type": "integer" + }, + "illustrate.gene1_color": { + "default": "#657e91", + "description": "The color of genes near the first gene", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.gene1_color_selected": { + "default": "#518dc5", + "description": "The color of the first gene", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.gene2_color": { + "default": "#325556", + "description": "The color of genes near the second gene", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.gene2_color_selected": { + "default": "#4c9677", + "description": "The color of the second gene", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.label_color": { + "default": "#000000", + "description": "The label color", 
+ "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.mask_fill": { + "default": "#ffffff", + "description": "Color of mask (for deleted region etc.)", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.mask_opacity": { + "default": 0.7, + "description": "Opacity of the mask layer", + "maximum": 1, + "minimum": 0, + "type": "number" + }, + "illustrate.max_drawing_retries": { + "default": 5, + "description": "The maximum number of retries for attempting a drawing. each iteration the width is extended. if it is still insufficient after this number a gene-level only drawing will be output", + "type": "integer" + }, + "illustrate.novel_exon_color": { + "default": "#5D3F6A", + "description": "Novel exon fill color", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.scaffold_color": { + "default": "#000000", + "description": "The color used for the gene/transcripts scaffolds", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.splice_color": { + "default": "#000000", + "description": "Splicing lines color", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.width": { + "default": 1000, + "description": "The drawing width in pixels", + "type": "integer" + }, + "illustratebreakpoint_color": { + "default": "#000000", + "description": "Breakpoint outline color", + "type": "string" + }, + "libraries": { + "additionalProperties": { + "additionalProperties": false, + "properties": { + "assign": { + "items": { + "type": "string" + }, + "minItems": 1, + "type": "array" + }, + "total_batches": { + "type": "integer", + "min": 1, + "description": "The number of jobs to slit a library into for cluster/validate/annotate" + }, + "bam_file": { + "type": "string" + }, + "disease_status": { + "enum": [ + "diseased", + "normal" + ], + "type": "string" + }, + "median_fragment_size": { + "type": "integer" + }, + "protocol": { + "enum": [ + "genome", + "transcriptome" + ], + "type": "string" + }, 
+ "read_length": { + "type": "integer" + }, + "stdev_fragment_size": { + "type": "integer" + }, + "strand_determining_read": { + "default": 2, + "description": "1 or 2. the read in the pair which determines if (assuming a stranded protocol) the first or second read in the pair matches the strand sequenced", + "type": "integer" + }, + "strand_specific": { + "default": false, + "type": "boolean" + } + }, + "required": [ + "disease_status", + "protocol", + "assign" + ], + "type": "object" + }, + "minProperties": 1, + "type": "object" + }, + "log": { + "type": "string" + }, + "log_level": { + "default": "INFO", + "enum": [ + "INFO", + "DEBUG" + ], + "type": "string" + }, + "output_dir": { + "type": "string" + }, + "pairing.contig_call_distance": { + "default": 10, + "description": "The maximum distance allowed between breakpoint pairs (called by contig) in order for them to pair", + "type": "integer" + }, + "pairing.flanking_call_distance": { + "default": 50, + "description": "The maximum distance allowed between breakpoint pairs (called by flanking pairs) in order for them to pair", + "type": "integer" + }, + "pairing.input_call_distance": { + "default": 20, + "description": "The maximum distance allowed between breakpoint pairs (called by input tools, not validated) in order for them to pair", + "type": "integer" + }, + "pairing.spanning_call_distance": { + "default": 20, + "description": "The maximum distance allowed between breakpoint pairs (called by spanning reads) in order for them to pair", + "type": "integer" + }, + "pairing.split_call_distance": { + "default": 20, + "description": "The maximum distance allowed between breakpoint pairs (called by split reads) in order for them to pair", + "type": "integer" + }, + "reference.aligner_reference": { + "examples": [ + "tests/data/mock_reference_genome.2bit" + ], + "items": { + "type": "string" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + }, + "reference.annotations": { + "examples": [ + 
"tests/data/mock_annotations.json" + ], + "items": { + "type": "string" + }, + "minItems": 1, + "type": "array" + }, + "reference.dgv_annotation": { + "examples": [ + [ + "tests/data/mock_dgv_annotation.txt" + ] + ], + "items": { + "type": "string" + }, + "minItems": 1, + "type": "array" + }, + "reference.masking": { + "examples": [ + [ + "tests/data/mock_masking.tab" + ] + ], + "items": { + "type": "string" + }, + "minItems": 1, + "type": "array" + }, + "reference.reference_genome": { + "examples": [ + [ + "tests/data/mock_reference_genome.fa" + ] + ], + "items": { + "type": "string" + }, + "minItems": 1, + "type": "array" + }, + "reference.template_metadata": { + "examples": [ + [ + "tests/data/cytoBand.txt" + ] + ], + "items": { + "type": "string" + }, + "minItems": 1, + "type": "array" + }, + "skip_stage.validate": { + "default": false, + "description": "skip the validation stage of the MAVIS pipeline", + "type": "boolean" + }, + "summary.filter_cdna_synon": { + "default": true, + "description": "Filter all annotations synonymous at the cdna level", + "type": "boolean" + }, + "summary.filter_min_complexity": { + "default": 0.2, + "description": "Filter event calls based on call sequence complexity", + "maximum": 1, + "minimum": 0, + "type": "number" + }, + "summary.filter_min_flanking_reads": { + "default": 10, + "description": "Minimum number of flanking pairs for a call by flanking pairs", + "type": "integer" + }, + "summary.filter_min_linking_split_reads": { + "default": 1, + "description": "Minimum number of linking split reads for a call by split reads", + "type": "integer" + }, + "summary.filter_min_remapped_reads": { + "default": 5, + "description": "Minimum number of remapped reads for a call by contig", + "type": "integer" + }, + "summary.filter_min_spanning_reads": { + "default": 5, + "description": "Minimum number of spanning reads for a call by spanning reads", + "type": "integer" + }, + "summary.filter_min_split_reads": { + "default": 5, + 
"description": "Minimum number of split reads for a call by split reads", + "type": "integer" + }, + "summary.filter_protein_synon": { + "default": false, + "description": "Filter all annotations synonymous at the protein level", + "type": "boolean" + }, + "summary.filter_trans_homopolymers": { + "default": true, + "description": "Filter all single bp ins/del/dup events that are in a homopolymer region of at least 3 bps and are not paired to a genomic event", + "type": "boolean" + }, + "validate.aligner": { + "default": "blat", + "description": "The aligner to use to map the contigs/reads back to the reference e.g blat or bwa", + "enum": [ + "bwa mem", + "blat" + ], + "type": "string" + }, + "validate.assembly_kmer_size": { + "default": 0.74, + "description": "The percent of the read length to make kmers for assembly", + "maximum": 1, + "minimum": 0, + "type": "number" + }, + "validate.assembly_max_paths": { + "default": 8, + "description": "The maximum number of paths to resolve. this is used to limit when there is a messy assembly graph to resolve. the assembly will pre-calculate the number of paths (or putative assemblies) and stop if it is greater than the given setting", + "type": "integer" + }, + "validate.assembly_min_edge_trim_weight": { + "default": 3, + "description": "This is used to simplify the debruijn graph before path finding. 
edges with less than this frequency will be discarded if they are non-cutting, at a fork, or the end of a path", + "type": "integer" + }, + "validate.assembly_min_exact_match_to_remap": { + "default": 15, + "description": "The minimum length of exact matches to initiate remapping a read to a contig", + "type": "integer" + }, + "validate.assembly_min_remap_coverage": { + "default": 0.9, + "description": "Minimum fraction of the contig sequence which the remapped sequences must align over", + "maximum": 1, + "minimum": 0, + "type": "number" + }, + "validate.assembly_min_remapped_seq": { + "default": 3, + "description": "The minimum input sequences that must remap for an assembled contig to be used", + "type": "integer" + }, + "validate.assembly_min_uniq": { + "default": 0.1, + "description": "Minimum percent uniq required to keep separate assembled contigs. if contigs are more similar then the lower scoring, then shorter, contig is dropped", + "maximum": 1, + "minimum": 0, + "type": "number" + }, + "validate.assembly_strand_concordance": { + "default": 0.51, + "description": "When the number of remapped reads from each strand are compared, the ratio must be above this number to decide on the strand", + "maximum": 1, + "minimum": 0, + "type": "number" + }, + "validate.blat_limit_top_aln": { + "default": 10, + "description": "Number of results to return from blat (ranking based on score)", + "type": "integer" + }, + "validate.blat_min_identity": { + "default": 0.9, + "description": "The minimum percent identity match required for blat results when aligning contigs", + "maximum": 1, + "minimum": 0, + "type": "number" + }, + "validate.call_error": { + "default": 10, + "description": "Buffer zone for the evidence window", + "type": "integer" + }, + "validate.clean_aligner_files": { + "default": false, + "description": "Remove the aligner output files after the validation stage is complete. 
not required for subsequent steps but can be useful in debugging and deep investigation of events", + "type": "boolean" + }, + "validate.contig_aln_max_event_size": { + "default": 50, + "description": "Relates to determining breakpoints when pairing contig alignments. for any given read in a putative pair the soft clipping is extended to include any events of greater than this size. the softclipping is added to the side of the alignment as indicated by the breakpoint we are assigning pairs to", + "type": "integer" + }, + "validate.contig_aln_merge_inner_anchor": { + "default": 20, + "description": "The minimum number of consecutive exact match base pairs to not merge events within a contig alignment", + "type": "integer" + }, + "validate.contig_aln_merge_outer_anchor": { + "default": 15, + "description": "Minimum consecutively aligned exact matches to anchor an end for merging internal events", + "type": "integer" + }, + "validate.contig_aln_min_anchor_size": { + "default": 50, + "description": "The minimum number of aligned bases for a contig (m or =) in order to simplify. do not have to be consecutive", + "type": "integer" + }, + "validate.contig_aln_min_extend_overlap": { + "default": 10, + "description": "Minimum number of bases the query coverage interval must be extended by in order to pair alignments as a single split alignment", + "type": "integer" + }, + "validate.contig_aln_min_query_consumption": { + "default": 0.9, + "description": "Minimum fraction of the original query sequence that must be used by the read(s) of the alignment", + "maximum": 1, + "minimum": 0, + "type": "number" + }, + "validate.contig_aln_min_score": { + "default": 0.9, + "description": "Minimum score for a contig to be used as evidence in a call by contig", + "maximum": 1, + "minimum": 0, + "type": "number" + }, + "validate.fetch_min_bin_size": { + "default": 50, + "description": "The minimum size of any bin for reading from a bam file. 
increasing this number will result in smaller bins being merged or less bins being created (depending on the fetch method)", + "type": "integer" + }, + "validate.fetch_reads_bins": { + "default": 5, + "description": "Number of bins to split an evidence window into to ensure more even sampling of high coverage regions", + "type": "integer" + }, + "validate.fetch_reads_limit": { + "default": 3000, + "description": "Maximum number of reads, cap, to loop over for any given evidence window", + "type": "integer" + }, + "validate.filter_secondary_alignments": { + "default": true, + "description": "Filter secondary alignments when gathering read evidence", + "type": "boolean" + }, + "validate.fuzzy_mismatch_number": { + "default": 1, + "description": "The number of events/mismatches allowed to be considered a fuzzy match", + "type": "integer" + }, + "validate.max_sc_preceeding_anchor": { + "default": 6, + "description": "When remapping a softclipped read this determines the amount of softclipping allowed on the side opposite of where we expect it. for example for a softclipped read on a breakpoint with a left orientation this limits the amount of softclipping that is allowed on the right. if this is set to none then there is no limit on softclipping", + "type": "integer" + }, + "validate.min_anchor_exact": { + "default": 6, + "description": "Applies to re-aligning softclipped reads to the opposing breakpoint. the minimum number of consecutive exact matches to anchor a read to initiate targeted realignment", + "type": "integer" + }, + "validate.min_anchor_fuzzy": { + "default": 10, + "description": "Applies to re-aligning softclipped reads to the opposing breakpoint. 
the minimum length of a fuzzy match to anchor a read to initiate targeted realignment", + "type": "integer" + }, + "validate.min_anchor_match": { + "default": 0.9, + "description": "Minimum percent match for a read to be kept as evidence", + "maximum": 1, + "minimum": 0, + "type": "number" + }, + "validate.min_call_complexity": { + "default": 0.1, + "description": "The minimum complexity score for a call sequence. is an average for non-contig calls. filters low complexity contigs before alignment. see [contig_complexity](#contig_complexity)", + "maximum": 1, + "minimum": 0, + "type": "number" + }, + "validate.min_double_aligned_to_estimate_insertion_size": { + "default": 2, + "description": "The minimum number of reads which map soft-clipped to both breakpoints to assume the size of the untemplated sequence between the breakpoints is at most the read length - 2 * min_softclipping", + "type": "integer" + }, + "validate.min_flanking_pairs_resolution": { + "default": 10, + "description": "The minimum number of flanking reads required to call a breakpoint by flanking evidence", + "type": "integer" + }, + "validate.min_linking_split_reads": { + "default": 2, + "description": "The minimum number of split reads which aligned to both breakpoints", + "type": "integer" + }, + "validate.min_mapping_quality": { + "default": 5, + "description": "The minimum mapping quality of reads to be used as evidence", + "type": "integer" + }, + "validate.min_non_target_aligned_split_reads": { + "default": 1, + "description": "The minimum number of split reads aligned to a breakpoint by the input bam and no forced by local alignment to the target region to call a breakpoint by split read evidence", + "type": "integer" + }, + "validate.min_sample_size_to_apply_percentage": { + "default": 10, + "description": "Minimum number of aligned bases to compute a match percent. 
if there are less than this number of aligned bases (match or mismatch) the percent comparator is not used", + "type": "integer" + }, + "validate.min_softclipping": { + "default": 6, + "description": "Minimum number of soft-clipped bases required for a read to be used as soft-clipped evidence", + "type": "integer" + }, + "validate.min_spanning_reads_resolution": { + "default": 5, + "description": "Minimum number of spanning reads required to call an event by spanning evidence", + "type": "integer" + }, + "validate.min_splits_reads_resolution": { + "default": 3, + "description": "Minimum number of split reads required to call a breakpoint by split reads", + "type": "integer" + }, + "validate.outer_window_min_event_size": { + "default": 125, + "description": "The minimum size of an event in order for flanking read evidence to be collected", + "type": "integer" + }, + "validate.stdev_count_abnormal": { + "default": 3, + "description": "The number of standard deviations away from the normal considered expected and therefore not qualifying as flanking reads", + "type": "number" + }, + "validate.trans_fetch_reads_limit": { + "default": 12000, + "description": "Related to [fetch_reads_limit](#fetch_reads_limit). overrides fetch_reads_limit for transcriptome libraries when set. if this has a value of none then fetch_reads_limit will be used for transcriptome libraries instead", + "type": [ + "integer", + "null" + ] + }, + "validate.trans_min_mapping_quality": { + "default": 0, + "description": "Related to [min_mapping_quality](#min_mapping_quality). overrides the min_mapping_quality if the library is a transcriptome and this is set to any number not none. if this value is none, min_mapping_quality is used for transcriptomes aswell as genomes", + "type": [ + "integer", + "null" + ] + }, + "validate.write_evidence_files": { + "default": true, + "description": "Write the intermediate bam and bed files containing the raw evidence collected and contigs aligned. 
not required for subsequent steps but can be useful in debugging and deep investigation of events", + "type": "boolean" + } + }, + "anyOf": [ + { + "not": { + "properties": { + "skip_stage.validate": { + "const": true + } + }, + "required": [ + "reference.aligner_reference" + ] + } + } + ], + "type": "object" +} diff --git a/mavis/schemas/overlay.json b/mavis/schemas/overlay.json new file mode 100644 index 00000000..3fe89cf5 --- /dev/null +++ b/mavis/schemas/overlay.json @@ -0,0 +1,142 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "illustrate.breakpoint_color": { + "default": "#000000", + "description": "Breakpoint outline color", + "type": "string" + }, + "illustrate.domain_color": { + "default": "#ccccb3", + "description": "Domain fill color", + "type": "string" + }, + "illustrate.domain_mismatch_color": { + "default": "#b2182b", + "description": "Domain fill color on 0%% match", + "type": "string" + }, + "illustrate.domain_name_regex_filter": { + "default": "^PF\\d+$", + "description": "The regular expression used to select domains to be displayed (filtered by name)", + "type": "string" + }, + "illustrate.domain_scaffold_color": { + "default": "#000000", + "description": "The color of the domain scaffold", + "type": "string" + }, + "illustrate.drawing_width_iter_increase": { + "default": 500, + "description": "The amount (in pixels) by which to increase the drawing width upon failure to fit", + "type": "integer" + }, + "illustrate.exon_min_focus_size": { + "default": 10, + "description": "Minimum size of an exon for it to be granted a label or min exon width", + "type": "integer" + }, + "illustrate.gene1_color": { + "default": "#657e91", + "description": "The color of genes near the first gene", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.gene1_color_selected": { + "default": "#518dc5", + "description": "The color of the first gene", + "pattern": "^#[a-zA-Z0-9]{6}", + 
"type": "string" + }, + "illustrate.gene2_color": { + "default": "#325556", + "description": "The color of genes near the second gene", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.gene2_color_selected": { + "default": "#4c9677", + "description": "The color of the second gene", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.label_color": { + "default": "#000000", + "description": "The label color", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.mask_fill": { + "default": "#ffffff", + "description": "Color of mask (for deleted region etc.)", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.mask_opacity": { + "default": 0.7, + "description": "Opacity of the mask layer", + "maximum": 1, + "minimum": 0, + "type": "number" + }, + "illustrate.max_drawing_retries": { + "default": 5, + "description": "The maximum number of retries for attempting a drawing. each iteration the width is extended. if it is still insufficient after this number a gene-level only drawing will be output", + "type": "integer" + }, + "illustrate.novel_exon_color": { + "default": "#5D3F6A", + "description": "Novel exon fill color", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.scaffold_color": { + "default": "#000000", + "description": "The color used for the gene/transcripts scaffolds", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.splice_color": { + "default": "#000000", + "description": "Splicing lines color", + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" + }, + "illustrate.width": { + "default": 1000, + "description": "The drawing width in pixels", + "type": "integer" + }, + "log": { + "type": "string" + }, + "log_level": { + "default": "INFO", + "enum": [ + "INFO", + "DEBUG" + ], + "type": "string" + }, + "reference.annotations": { + "examples": [ + "tests/data/mock_annotations.json" + ], + "items": { + "type": "string" + }, + "minItems": 1, + 
"type": "array" + }, + "validate.min_mapping_quality": { + "default": 5, + "description": "The minimum mapping quality of reads to be used as evidence", + "type": "integer" + } + }, + "required": [ + "reference.annotations" + ], + "type": "object" +} diff --git a/mavis/summary/main.py b/mavis/summary/main.py index b0a51352..7e46f22d 100644 --- a/mavis/summary/main.py +++ b/mavis/summary/main.py @@ -1,11 +1,15 @@ -from functools import partial import os import re import time +from functools import partial +from typing import Dict, List import tab -from .constants import DEFAULTS, HOMOPOLYMER_MIN_LENGTH +from ..annotate.file_io import ReferenceFile +from ..constants import CALL_METHOD, COLUMNS, PROTOCOL, SVTYPE +from ..util import LOG, generate_complete_stamp, output_tabbed_file, read_inputs, soft_cast +from .constants import HOMOPOLYMER_MIN_LENGTH from .summary import ( annotate_dgv, filter_by_annotations, @@ -14,9 +18,6 @@ get_pairing_state, group_by_distance, ) -from ..constants import CALL_METHOD, COLUMNS, PROTOCOL, SVTYPE -from ..pairing.constants import DEFAULTS as PAIRING_DEFAULTS -from ..util import generate_complete_stamp, LOG, output_tabbed_file, read_inputs, soft_cast def soft_cast_null(value): @@ -26,36 +27,17 @@ def soft_cast_null(value): return value -def main( - inputs, - output, - annotations, - dgv_annotation=None, - filter_cdna_synon=DEFAULTS.filter_cdna_synon, - filter_protein_synon=DEFAULTS.filter_protein_synon, - filter_min_remapped_reads=DEFAULTS.filter_min_remapped_reads, - filter_min_spanning_reads=DEFAULTS.filter_min_spanning_reads, - filter_min_flanking_reads=DEFAULTS.filter_min_flanking_reads, - filter_min_split_reads=DEFAULTS.filter_min_split_reads, - filter_trans_homopolymers=DEFAULTS.filter_trans_homopolymers, - filter_min_linking_split_reads=DEFAULTS.filter_min_linking_split_reads, - filter_min_complexity=DEFAULTS.filter_min_complexity, - flanking_call_distance=PAIRING_DEFAULTS.flanking_call_distance, - 
split_call_distance=PAIRING_DEFAULTS.split_call_distance, - contig_call_distance=PAIRING_DEFAULTS.contig_call_distance, - spanning_call_distance=PAIRING_DEFAULTS.spanning_call_distance, - start_time=int(time.time()), - **kwargs -): - annotations.load() - if dgv_annotation: +def main(inputs: List[str], output: str, config: Dict, start_time=int(time.time())): + annotations = ReferenceFile.load_from_config(config, 'annotations', eager_load=True) + dgv_annotation = ReferenceFile.load_from_config(config, 'dgv_annotation') + if not dgv_annotation.is_empty(): dgv_annotation.load() # pairing threshold parameters to be defined in config file distances = { - CALL_METHOD.FLANK: flanking_call_distance, - CALL_METHOD.SPLIT: split_call_distance, - CALL_METHOD.CONTIG: contig_call_distance, - CALL_METHOD.SPAN: spanning_call_distance, + CALL_METHOD.FLANK: config['pairing.flanking_call_distance'], + CALL_METHOD.SPLIT: config['pairing.split_call_distance'], + CALL_METHOD.CONTIG: config['pairing.contig_call_distance'], + CALL_METHOD.SPAN: config['pairing.spanning_call_distance'], } bpps = [] @@ -147,17 +129,17 @@ def main( for bpp in bpps: # filter by synonymous and RNA homopolymers - if filter_protein_synon and bpp.protein_synon: + if config['summary.filter_protein_synon'] and bpp.protein_synon: bpp.data[COLUMNS.filter_comment] = 'synonymous protein' filtered_pairs.append(bpp) continue - elif filter_cdna_synon and bpp.cdna_synon: + elif config['summary.filter_cdna_synon'] and bpp.cdna_synon: bpp.data[COLUMNS.filter_comment] = 'synonymous cdna' filtered_pairs.append(bpp) continue elif all( [ - filter_trans_homopolymers, + config['summary.filter_trans_homopolymers'], bpp.protocol == PROTOCOL.TRANS, bpp.data.get(COLUMNS.repeat_count, None), bpp.event_type in [SVTYPE.DUP, SVTYPE.INS, SVTYPE.DEL], @@ -182,7 +164,7 @@ def main( continue # filter based on the sequence call complexity sc = str(bpp.data.get(COLUMNS.call_sequence_complexity, 'none')).lower() - if sc != 'none' and float(sc) < 
filter_min_complexity: + if sc != 'none' and float(sc) < config['summary.filter_min_complexity']: bpp.data[COLUMNS.filter_comment] = 'low complexity' filtered_pairs.append(bpp) continue @@ -192,11 +174,11 @@ def main( # filter based on minimum evidence levels bpps, filtered = filter_by_evidence( bpps, - filter_min_remapped_reads=filter_min_remapped_reads, - filter_min_spanning_reads=filter_min_spanning_reads, - filter_min_flanking_reads=filter_min_flanking_reads, - filter_min_split_reads=filter_min_split_reads, - filter_min_linking_split_reads=filter_min_linking_split_reads, + filter_min_remapped_reads=config['summary.filter_min_remapped_reads'], + filter_min_spanning_reads=config['summary.filter_min_spanning_reads'], + filter_min_flanking_reads=config['summary.filter_min_flanking_reads'], + filter_min_split_reads=config['summary.filter_min_split_reads'], + filter_min_linking_split_reads=config['summary.filter_min_linking_split_reads'], ) for pair in filtered: pair.data[COLUMNS.filter_comment] = 'low evidence' @@ -340,7 +322,7 @@ def main( rows = [] for lib in bpps_by_library: LOG('annotating dgv for', lib) - if dgv_annotation: + if not dgv_annotation.is_empty(): annotate_dgv( bpps_by_library[lib], dgv_annotation.content, distance=10 ) # TODO make distance a parameter @@ -401,3 +383,4 @@ def main( ): lib_rows.append(row) output_tabbed_file(lib_rows, filename, header=output_columns) + generate_complete_stamp(output, LOG) diff --git a/mavis/validate/base.py b/mavis/validate/base.py index e47b12ae..911e4984 100644 --- a/mavis/validate/base.py +++ b/mavis/validate/base.py @@ -1,6 +1,6 @@ import itertools import logging -from .constants import DEFAULTS + from ..assemble import assemble from ..bam import cigar as _cigar from ..bam import read as _read @@ -13,13 +13,14 @@ ORIENT, PROTOCOL, PYSAM_READ_FLAGS, - reverse_complement, STRAND, SVTYPE, + reverse_complement, ) from ..error import NotSpecifiedError from ..interval import Interval from ..util import DEVNULL +from 
.constants import DEFAULTS class Evidence(BreakpointPair): diff --git a/mavis/validate/constants.py b/mavis/validate/constants.py index 652cef7b..a1a84ed5 100644 --- a/mavis/validate/constants.py +++ b/mavis/validate/constants.py @@ -1,5 +1,5 @@ -from ..constants import float_fraction from ..align import SUPPORTED_ALIGNER +from ..constants import float_fraction from ..util import WeakMavisNamespace PASS_FILENAME = 'validation-passed.tab' diff --git a/mavis/validate/main.py b/mavis/validate/main.py index 547d1452..6dc14203 100644 --- a/mavis/validate/main.py +++ b/mavis/validate/main.py @@ -4,36 +4,38 @@ import re import time import warnings +from typing import Dict, List import pysam from shortuuid import uuid -from .call import call_events -from .constants import DEFAULTS, PASS_FILENAME -from .evidence import GenomeEvidence, TranscriptomeEvidence -from ..align import align_sequences, select_contig_alignments, SUPPORTED_ALIGNER +from ..align import SUPPORTED_ALIGNER, align_sequences, select_contig_alignments from ..annotate.base import BioInterval +from ..annotate.file_io import ReferenceFile from ..bam import cigar as _cigar from ..bam.cache import BamCache from ..breakpoint import BreakpointPair -from ..constants import CALL_METHOD, COLUMNS, MavisNamespace, PROTOCOL -from ..util import filter_on_overlap, LOG, mkdirp, output_tabbed_file, read_inputs, write_bed_file +from ..config import get_by_prefix +from ..constants import CALL_METHOD, COLUMNS, PROTOCOL +from ..util import ( + LOG, + filter_on_overlap, + generate_complete_stamp, + mkdirp, + output_tabbed_file, + read_inputs, + write_bed_file, +) +from .call import call_events +from .constants import PASS_FILENAME +from .evidence import GenomeEvidence, TranscriptomeEvidence def main( - inputs, - output, - bam_file, - strand_specific, - library, - protocol, - median_fragment_size, - stdev_fragment_size, - read_length, - reference_genome, - annotations, - masking, - aligner_reference, + inputs: List[str], + output: 
str, + library: str, + config: Dict, start_time=int(time.time()), **kwargs ): @@ -52,16 +54,15 @@ def main( aligner_reference (mavis.annotate.file_io.ReferenceFile): path to the aligner reference file (e.g 2bit file for blat) """ mkdirp(output) - # check the files exist early to avoid waiting for errors - if protocol == PROTOCOL.TRANS: - annotations.load() - reference_genome.load() - masking.load() - - validation_settings = {} - validation_settings.update(DEFAULTS.items()) - validation_settings.update({k: v for k, v in kwargs.items() if k in DEFAULTS}) - validation_settings = MavisNamespace(**validation_settings) + reference_genome = ReferenceFile.load_from_config(config, 'reference_genome', eager_load=True) + annotations = ReferenceFile.load_from_config( + config, + 'annotations', + eager_load=bool(config['libraries'][library]['protocol'] == PROTOCOL.TRANS), + ) + masking = ReferenceFile.load_from_config(config, 'masking') + if not masking.is_empty(): + masking.load() raw_evidence_bam = os.path.join(output, 'raw_evidence.bam') contig_bam = os.path.join(output, 'contigs.bam') @@ -71,21 +72,23 @@ def main( passed_bed_file = os.path.join(output, 'validation-passed.bed') failed_output_file = os.path.join(output, 'validation-failed.tab') contig_aligner_fa = os.path.join(output, 'contigs.fa') - if validation_settings.aligner == SUPPORTED_ALIGNER.BLAT: + if config['validate.aligner'] == SUPPORTED_ALIGNER.BLAT: contig_aligner_output = os.path.join(output, 'contigs.blat_out.pslx') contig_aligner_log = os.path.join(output, 'contigs.blat.log') - elif validation_settings.aligner == SUPPORTED_ALIGNER.BWA_MEM: + elif config['validate.aligner'] == SUPPORTED_ALIGNER.BWA_MEM: contig_aligner_output = os.path.join(output, 'contigs.bwa_mem.sam') contig_aligner_log = os.path.join(output, 'contigs.bwa_mem.log') else: - raise NotImplementedError('unsupported aligner', validation_settings.aligner) + raise NotImplementedError('unsupported aligner', config['validate.aligner']) 
igv_batch_file = os.path.join(output, 'igv.batch') - input_bam_cache = BamCache(bam_file, strand_specific) + input_bam_cache = BamCache( + config['libraries'][library]['bam_file'], config['libraries'][library]['strand_specific'] + ) bpps = read_inputs( inputs, add_default={COLUMNS.cluster_id: None, COLUMNS.stranded: False}, - add={COLUMNS.protocol: protocol, COLUMNS.library: library}, + add={COLUMNS.protocol: config['libraries'][library]['protocol'], COLUMNS.library: library}, expand_strand=False, expand_orient=True, cast={COLUMNS.cluster_id: lambda x: str(uuid()) if not x else x}, @@ -103,10 +106,10 @@ def main( stranded=bpp.stranded, untemplated_seq=bpp.untemplated_seq, data=bpp.data, - stdev_fragment_size=stdev_fragment_size, - read_length=read_length, - median_fragment_size=median_fragment_size, - **dict(validation_settings.items()) + stdev_fragment_size=config['libraries'][library]['stdev_fragment_size'], + read_length=config['libraries'][library]['read_length'], + median_fragment_size=config['libraries'][library]['median_fragment_size'], + **get_by_prefix(config, 'validate.') ) evidence_clusters.append(evidence) except ValueError as err: @@ -125,10 +128,11 @@ def main( stranded=bpp.stranded, untemplated_seq=bpp.untemplated_seq, data=bpp.data, - stdev_fragment_size=stdev_fragment_size, - read_length=read_length, - median_fragment_size=median_fragment_size, - **dict(validation_settings.items()) + stdev_fragment_size=config['libraries'][library]['stdev_fragment_size'], + read_length=config['libraries'][library]['read_length'], + median_fragment_size=config['libraries'][library]['median_fragment_size'], + strand_determining_read=config['libraries'][library]['strand_determining_read'], + **get_by_prefix(config, 'validate.') ) evidence_clusters.append(evidence) except ValueError as err: @@ -141,7 +145,12 @@ def main( extended_masks[chrom] = [] for mask in masks: extended_masks[chrom].append( - BioInterval(chrom, mask.start - read_length, mask.end + read_length, 
name=mask.name) + BioInterval( + chrom, + mask.start - config['libraries'][library]['read_length'], + mask.end + config['libraries'][library]['read_length'], + name=mask.name, + ) ) evidence_clusters, filtered_evidence_clusters = filter_on_overlap( @@ -215,12 +224,12 @@ def main( reference_genome=reference_genome.content, aligner_fa_input_file=contig_aligner_fa, aligner_output_file=contig_aligner_output, - clean_files=validation_settings.clean_aligner_files, - aligner=kwargs.get('aligner', validation_settings.aligner), - aligner_reference=aligner_reference.name[0], + clean_files=config['validate.clean_aligner_files'], + aligner=kwargs.get('aligner', config['validate.aligner']), + aligner_reference=config['reference.aligner_reference'][0], aligner_output_log=contig_aligner_log, - blat_min_identity=kwargs.get('blat_min_identity', validation_settings.blat_min_identity), - blat_limit_top_aln=kwargs.get('blat_limit_top_aln', validation_settings.blat_limit_top_aln), + blat_min_identity=kwargs.get('blat_min_identity', config['validate.blat_min_identity']), + blat_limit_top_aln=kwargs.get('blat_limit_top_aln', config['validate.blat_limit_top_aln']), log=LOG, ) for evidence in evidence_clusters: @@ -331,7 +340,7 @@ def main( itertools.chain.from_iterable([e.get_bed_repesentation() for e in event_calls]), ) - if validation_settings.write_evidence_files: + if config['validate.write_evidence_files']: with pysam.AlignmentFile(contig_bam, 'wb', template=input_bam_cache.fh) as fh: LOG('writing:', contig_bam, time_stamp=True) for evidence in evidence_clusters: @@ -376,4 +385,11 @@ def main( fh.write('load {} name="{}"\n'.format(contig_bam, 'aligned contigs')) fh.write('load {} name="{}"\n'.format(evidence_bed, 'evidence windows')) fh.write('load {} name="{}"\n'.format(raw_evidence_bam, 'raw evidence')) - fh.write('load {} name="{} {} input"\n'.format(bam_file, library, protocol)) + fh.write( + 'load {} name="{} {} input"\n'.format( + config['libraries'][library]['bam_file'], + 
library, + config['libraries'][library]['protocol'], + ) + ) + generate_complete_stamp(output, LOG, start_time=start_time) diff --git a/setup.py b/setup.py index 07683e14..dd23fb29 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import find_packages, setup -VERSION = '2.2.7' +VERSION = '2.2.8' def parse_md_readme(): @@ -15,7 +15,7 @@ def parse_md_readme(): rst_lines = parse_from_file('README.md').split('\n') long_description = [ - '.. image:: http://mavis.bcgsc.ca/images/acronym.svg\n\n|\n' + '.. image:: http://mavis.bcgsc.ca/docs/latest/_static/acronym.svg\n\n|\n' ] # backup since pip can't handle raw directives i = 0 while i < len(rst_lines): @@ -85,10 +85,12 @@ def check_nonpython_dependencies(): 'colour', 'networkx==1.11.0', 'numpy>=1.13.1', + 'pandas>=1.1, <2', 'pysam>=0.9, <=0.15.2', 'pyvcf==0.6.8', 'shortuuid>=0.5.0', 'svgwrite', + 'snakemake>=6.1.1, <7', ] DEPLOY_REQS = ['twine', 'm2r', 'wheel'] @@ -122,6 +124,8 @@ def check_nonpython_dependencies(): 'calculate_ref_alt_counts = tools.calculate_ref_alt_counts:main', ] }, + include_package_data=True, + data_files=[('mavis', ['mavis/schemas/config.json', 'mavis/schemas/overlay.json'])], project_urls={'mavis': 'http://mavis.bcgsc.ca'}, ) check_nonpython_dependencies() diff --git a/tests/end_to_end/__init__.py b/tests/end_to_end/__init__.py index 3422c76b..e69de29b 100644 --- a/tests/end_to_end/__init__.py +++ b/tests/end_to_end/__init__.py @@ -1,21 +0,0 @@ -import glob -import os - - -def glob_exists(*pos, strict=False, n=1): - globexpr = os.path.join(*pos) - file_list = glob.glob(globexpr) - if strict and len(file_list) == n: - return file_list[0] if len(file_list) == 1 else file_list - elif not strict and len(file_list) > 0: - return file_list - else: - print(globexpr) - print(file_list) - return False - - -def glob_not_exists(*pos): - globexpr = os.path.join(*pos) - file_list = glob.glob(globexpr) - return not file_list diff --git a/tests/end_to_end/test_config.py 
b/tests/end_to_end/test_config.py deleted file mode 100644 index 59c3f47d..00000000 --- a/tests/end_to_end/test_config.py +++ /dev/null @@ -1,202 +0,0 @@ -import argparse -import glob -import itertools -import os -import shutil -import statistics -import sys -import tempfile -import unittest -from unittest import mock - -from mavis.constants import SUBCOMMAND -from mavis.main import main -from mavis.tools import SUPPORTED_TOOL -from mavis.util import unique_exists - -from ..util import get_data - - -ARGERROR_EXIT_CODE = 2 - - -class TestConfig(unittest.TestCase): - def setUp(self): - if 'MAVIS_ANNOTATIONS' in os.environ: - del os.environ['MAVIS_ANNOTATIONS'] - self.temp_output = tempfile.mkdtemp() - # [--library {genome,transcriptome} {diseased,normal} [strand_specific] [/path/to/bam/file]] - self.genome = ['--library', 'mock_genome', 'genome', 'diseased'] - self.genome_bam = get_data('mock_reads_for_events.sorted.bam') - self.trans = ['--library', 'mock_trans', 'transcriptome', 'diseased'] - self.trans_bam = get_data('mock_trans_reads_for_events.sorted.bam') - self.annotations = get_data('mock_reference_annotations.json') - self.args = ['mavis', SUBCOMMAND.CONFIG] - self.input = get_data('mock_sv_events.tsv') - - def run_main(self, exit_status=0): - outputfile = os.path.join(self.temp_output, 'config.cfg') - self.args.extend(['-w', outputfile]) - with mock.patch.object(sys, 'argv', [str(a) for a in self.args]): - print('sys.argv', sys.argv) - try: - return_code = main() - except SystemExit as ex: - return_code = ex.code - self.assertEqual(exit_status, return_code) - - def test_no_libs_no_annotations(self): - self.run_main() - - def test_no_input_error(self): - self.args.extend(self.genome + ['False', self.genome_bam]) - self.run_main(ARGERROR_EXIT_CODE) - - def test_input_missing_library(self): - self.args.extend( - self.genome - + ['False', self.genome_bam, '--input', self.input, 'mock_genome', 'bad_genome'] - ) - self.run_main(ARGERROR_EXIT_CODE) - - def 
test_assign_missing_library(self): - self.args.extend( - self.genome - + [ - 'False', - self.genome_bam, - '--input', - self.input, - 'mock_genome', - '--assign', - 'bad_genome', - self.input, - ] - ) - self.run_main(ARGERROR_EXIT_CODE) - - def test_skip_no_annotations(self): - self.args.extend( - self.trans - + [ - 'False', - self.trans_bam, - '--input', - self.input, - 'mock_trans', - '--skip_stage', - SUBCOMMAND.VALIDATE, - ] - ) - self.run_main() - - def test_requires_annotations_trans(self): - self.args.extend( - self.trans + ['False', self.trans_bam, '--input', self.input, 'mock_trans'] - ) - self.run_main(ARGERROR_EXIT_CODE) - - def test_require_bam_noskip_error(self): - self.args.extend( - self.genome + ['--annotations', self.annotations, '--input', self.input, 'mock_genome'] - ) - self.run_main(ARGERROR_EXIT_CODE) - - def test_genome_only(self): - # should be ok without the annotations file - self.args.extend( - self.genome + ['False', self.genome_bam, '--input', self.input, 'mock_genome'] - ) - self.run_main() - - def test_genome_include_defaults(self): - # should be ok without the annotations file - self.args.extend( - self.genome - + ['False', self.genome_bam, '--input', self.input, 'mock_genome', '--add_defaults'] - ) - self.run_main() - - def test_trans_with_annotations(self): - self.args.extend( - itertools.chain( - self.genome, - [False, self.genome_bam], - self.trans, - [ - True, - self.trans_bam, - '--input', - self.input, - 'mock_genome', - 'mock_trans', - '--annotations', - self.annotations, - ], - ) - ) - with self.assertRaises(statistics.StatisticsError): # too few annotations to calc median - self.run_main() - - def test_convert_multiple(self): - self.args.extend(self.genome + ['False', self.genome_bam]) - self.args.extend( - [ - '--convert', - 'ta', - 'transabyss_events.tab', - 'transabyss_indels_output.tab', - 'transabyss', - ] - ) - self.args.extend(['--assign', 'mock_genome', 'ta']) - self.run_main() - - def 
test_convert_multiple_strand(self): - self.args.extend(self.genome + ['False', self.genome_bam]) - self.args.extend( - [ - '--convert', - 'ta', - 'transabyss_events.tab', - 'transabyss_indels_output.tab', - 'transabyss', - 'False', - ] - ) - self.args.extend(['--assign', 'mock_genome', 'ta']) - self.run_main() - - def test_convert_quoted(self): - self.args.extend(self.genome + ['False', self.genome_bam]) - self.args.extend(['--convert', 'ta', 'transabyss_{events,indels_output}.tab', 'transabyss']) - self.args.extend(['--assign', 'mock_genome', 'ta']) - self.run_main() - - def test_convert_quoted_strand(self): - self.args.extend(self.genome + ['False', self.genome_bam]) - self.args.extend( - ['--convert', 'ta', 'transabyss_{events,indels_output}.tab', 'transabyss', 'False'] - ) - self.args.extend(['--assign', 'mock_genome', 'ta']) - self.run_main() - - def test_convert_argument_error(self): - self.args.extend(self.genome + ['False', self.genome_bam]) - self.args.extend(['--convert', 'ta', 'transabyss', 'False']) - self.args.extend(['--assign', 'mock_genome', 'ta']) - self.run_main(ARGERROR_EXIT_CODE) - - def test_convert_argument_error2(self): - self.args.extend(self.genome + ['False', self.genome_bam]) - self.args.extend(['--convert', 'ta', 'transabyss']) - self.args.extend(['--assign', 'mock_genome', 'ta']) - self.run_main(ARGERROR_EXIT_CODE) - - def tearDown(self): - # remove the temp directory and outputs - shutil.rmtree(self.temp_output) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/end_to_end/test_convert.py b/tests/end_to_end/test_convert.py index d92a7eae..ff58b064 100644 --- a/tests/end_to_end/test_convert.py +++ b/tests/end_to_end/test_convert.py @@ -6,15 +6,13 @@ import unittest from unittest.mock import patch - from mavis.constants import ORIENT, SUBCOMMAND, SVTYPE from mavis.main import main from mavis.tools import SUPPORTED_TOOL -from mavis.util import unique_exists, read_bpp_from_input_file +from mavis.util import 
read_bpp_from_input_file, unique_exists from ..util import get_data - TEMP_OUTPUT = None @@ -41,7 +39,7 @@ def run_main(self, inputfile, file_type, strand_specific=False): strand_specific, ] with patch.object(sys, 'argv', args): - self.assertEqual(0, main()) + main() print('output', outputfile) self.assertTrue(unique_exists(outputfile)) result = {} diff --git a/tests/end_to_end/test_full_pipeline.py b/tests/end_to_end/test_full_pipeline.py deleted file mode 100644 index 29ddae18..00000000 --- a/tests/end_to_end/test_full_pipeline.py +++ /dev/null @@ -1,406 +0,0 @@ -import os -import shutil -import sys -import tempfile -import unittest -from unittest import mock - -from mavis.constants import SUBCOMMAND, EXIT_OK, EXIT_ERROR -from mavis.main import main -from mavis.util import unique_exists - -from . import glob_exists, glob_not_exists -from ..util import get_data - - -CONFIG = get_data('pipeline_config.cfg') -BWA_CONFIG = get_data('bwa_pipeline_config.cfg') -CLEAN_CONFIG = get_data('clean_pipeline_config.cfg') -MOCK_GENOME = 'mock-A36971' -MOCK_TRANS = 'mock-A47933' -ENV = {e: v for e, v in os.environ.items() if not e.startswith('MAVIS_')} -ENV.update({'MAVIS_SCHEDULER': 'LOCAL', 'MAVIS_CONCURRENCY_LIMIT': '2'}) - - -def print_file_tree(dirname): - for root, dirs, files in os.walk(dirname): - level = root.replace(dirname, '').count(os.sep) - indent = ' ' * 4 * (level) - print('{}{}/'.format(indent, os.path.basename(root))) - subindent = ' ' * 4 * (level + 1) - for f in files: - print('{}{}'.format(subindent, f)) - - -@unittest.skipIf( - not int(os.environ.get('RUN_FULL', 1)), - 'slower tests will not be run unless the environment variable RUN_FULL is given', -) -class TestPipeline(unittest.TestCase): - def setUp(self): - # create the temp output directory to store file outputs - self.temp_output = tempfile.mkdtemp() - print('output dir', self.temp_output) - - def check_annotate(self, lib): - # run annotation - self.assertTrue(glob_exists(self.temp_output, lib, 
SUBCOMMAND.ANNOTATE)) - # check the generated files - for filename in [ - 'annotations.tab', - 'annotations.fusion-cdna.fa', - 'drawings', - 'drawings/*svg', - 'drawings/*json', - 'MAVIS-*.COMPLETE', - ]: - filename = os.path.join(self.temp_output, lib, SUBCOMMAND.ANNOTATE, '*-1', filename) - self.assertTrue(glob_exists(filename), msg=filename) - - def check_validate(self, lib): - # run validation - self.assertTrue(glob_exists(self.temp_output, lib, SUBCOMMAND.VALIDATE)) - - for suffix in [ - 'contigs.bam', - 'contigs.fa', - 'contigs.sorted.bam', - 'contigs.sorted.bam.bai', - 'evidence.bed', - 'igv.batch', - 'raw_evidence.bam', - 'raw_evidence.sorted.bam', - 'raw_evidence.sorted.bam.bai', - 'validation-failed.tab', - 'validation-passed.tab', - 'MAVIS-*.COMPLETE', - ]: - self.assertTrue( - glob_exists(self.temp_output, lib, SUBCOMMAND.VALIDATE + '/*-1', suffix), msg=suffix - ) - - def check_aligner_output_files(self, lib, mem=False): - if mem: - self.assertTrue( - glob_exists( - self.temp_output, lib, SUBCOMMAND.VALIDATE + '/*-1', 'contigs.bwa_mem.sam' - ) - ) - self.assertTrue( - glob_exists( - self.temp_output, lib, SUBCOMMAND.VALIDATE + '/*-1', 'contigs.bwa_mem.log' - ) - ) - else: - self.assertTrue( - glob_exists( - self.temp_output, lib, SUBCOMMAND.VALIDATE + '/*-1', 'contigs.blat_out.pslx' - ) - ) - - def check_cluster(self, lib, skipped=False): - self.assertTrue(glob_exists(self.temp_output, lib, SUBCOMMAND.CLUSTER)) - logfile = os.path.join( - self.temp_output, lib, SUBCOMMAND.CLUSTER, 'MC_{}*batch-*.log'.format(lib) - ) - self.assertTrue(glob_exists(logfile), msg=logfile) - self.assertTrue(glob_exists(self.temp_output, lib, SUBCOMMAND.CLUSTER, 'batch-*-1.tab')) - self.assertTrue( - glob_exists(self.temp_output, lib, SUBCOMMAND.CLUSTER, 'filtered_pairs.tab') - ) - self.assertTrue(glob_exists(self.temp_output, lib, SUBCOMMAND.CLUSTER, 'clusters.bed')) - if skipped: - self.assertFalse( - glob_exists(self.temp_output, lib, SUBCOMMAND.CLUSTER, 
'cluster_assignment.tab') - ) - else: - self.assertTrue( - glob_exists(self.temp_output, lib, SUBCOMMAND.CLUSTER, 'cluster_assignment.tab') - ) - self.assertTrue(glob_exists(self.temp_output, lib, SUBCOMMAND.CLUSTER, 'MAVIS-*.COMPLETE')) - - def check_pairing(self): - self.assertTrue(glob_exists(self.temp_output, SUBCOMMAND.PAIR)) - self.assertTrue(glob_exists(self.temp_output, SUBCOMMAND.PAIR, 'mavis_paired*.tab')) - self.assertTrue(glob_exists(self.temp_output, SUBCOMMAND.PAIR, 'MAVIS-*.COMPLETE')) - - def check_summary(self, count=3): - self.assertTrue(glob_exists(self.temp_output, SUBCOMMAND.SUMMARY)) - self.assertTrue( - glob_exists(self.temp_output, SUBCOMMAND.SUMMARY, 'mavis_summary*.tab', n=count) - ) - self.assertTrue(glob_exists(self.temp_output, SUBCOMMAND.SUMMARY, 'MAVIS-*.COMPLETE')) - - @mock.patch('os.environ', ENV.copy()) - def test_pipeline_with_bwa(self): - main([SUBCOMMAND.SETUP, BWA_CONFIG, '-o', self.temp_output]) - self.assertTrue(glob_exists(self.temp_output, 'build.cfg')) - main([SUBCOMMAND.SCHEDULE, '-o', self.temp_output, '--submit']) - # check that the subdirectories were built - for lib in [MOCK_GENOME + '_*', MOCK_TRANS + '_*']: - self.check_cluster(lib) - self.check_validate(lib) - self.check_aligner_output_files(lib, mem=True) - self.check_annotate(lib) - - self.check_pairing() - self.check_summary() - - @mock.patch('os.environ', ENV.copy()) - def test_error_on_bad_config(self): - with self.assertRaises(SystemExit) as err: - main([SUBCOMMAND.SETUP, 'thing/that/doesnot/exist.cfg', '-o', self.temp_output]) - self.assertEqual(2, err.exception.code) - - @mock.patch('os.environ', ENV.copy()) - def test_error_on_bad_input_file(self): - with self.assertRaises(FileNotFoundError): - main([SUBCOMMAND.SETUP, get_data('bad_input_file.cfg'), '-o', self.temp_output]) - - @mock.patch('os.environ', ENV.copy()) - def test_missing_reference(self): - with self.assertRaises(OSError): - main([SUBCOMMAND.SETUP, get_data('missing_reference.cfg'), '-o', 
self.temp_output]) - - @mock.patch('os.environ', ENV.copy()) - def test_full_pipeline(self): - main([SUBCOMMAND.SETUP, CONFIG, '-o', self.temp_output]) - self.assertTrue(glob_exists(self.temp_output, 'build.cfg')) - - main([SUBCOMMAND.SCHEDULE, '-o', self.temp_output, '--submit']) - # check that the subdirectories were built - for lib in [MOCK_GENOME + '_*', MOCK_TRANS + '_*']: - self.check_cluster(lib) - self.check_validate(lib) - self.check_aligner_output_files(lib) - self.check_annotate(lib) - - self.check_pairing() - self.check_summary() - - retcode = main([SUBCOMMAND.SCHEDULE, '-o', self.temp_output]) - self.assertEqual(EXIT_OK, retcode) - - @mock.patch('os.environ', ENV.copy()) - def test_no_optional_files(self): - main([SUBCOMMAND.SETUP, get_data('no_opt_pipeline.cfg'), '-o', self.temp_output]) - self.assertTrue(glob_exists(self.temp_output, 'build.cfg')) - - main([SUBCOMMAND.SCHEDULE, '-o', self.temp_output, '--submit']) - # check that the subdirectories were built - for lib in [MOCK_GENOME + '_*', MOCK_TRANS + '_*']: - self.check_cluster(lib) - self.check_validate(lib) - self.check_aligner_output_files(lib) - self.check_annotate(lib) - self.check_pairing() - self.check_summary() - - @mock.patch('os.environ', ENV.copy()) - def test_reference_from_env(self): - os.environ.update( - { - 'MAVIS_TEMPLATE_METADATA': get_data('cytoBand.txt'), - 'MAVIS_ANNOTATIONS': get_data('mock_annotations.json'), - 'MAVIS_MASKING': get_data('mock_masking.tab'), - 'MAVIS_REFERENCE_GENOME': get_data('mock_reference_genome.fa'), - 'MAVIS_ALIGNER_REFERENCE': get_data('mock_reference_genome.2bit'), - 'MAVIS_DGV_ANNOTATION': get_data('mock_dgv_annotation.txt'), - } - ) - main([SUBCOMMAND.SETUP, get_data('reference_from_env.cfg'), '-o', self.temp_output]) - self.assertTrue(glob_exists(self.temp_output, 'build.cfg')) - main([SUBCOMMAND.SCHEDULE, '-o', self.temp_output, '--submit']) - # check that the subdirectories were built - for lib in [MOCK_GENOME + '_*']: - self.check_cluster(lib) 
- self.check_validate(lib) - self.check_aligner_output_files(lib) - self.check_annotate(lib) - self.check_pairing() - self.check_summary(count=2) - - @mock.patch('os.environ', ENV.copy()) - def test_clean_files(self): - main([SUBCOMMAND.SETUP, CLEAN_CONFIG, '-o', self.temp_output]) - - self.assertTrue(glob_exists(self.temp_output, 'build.cfg')) - main([SUBCOMMAND.SCHEDULE, '-o', self.temp_output, '--submit']) - - # check that the subdirectories were built - for lib in [MOCK_GENOME + '_*', MOCK_TRANS + '_*']: - self.check_cluster(lib) - self.assertTrue(glob_exists(self.temp_output, lib, SUBCOMMAND.VALIDATE)) - - for suffix in ['evidence.bed', 'validation-failed.tab', 'validation-passed.tab']: - self.assertTrue( - glob_exists(self.temp_output, lib, SUBCOMMAND.VALIDATE + '/*-1', suffix) - ) - for suffix in [ - 'contigs.bam', - 'contigs.blat_out.pslx', - 'contigs.fa', - 'contigs.sorted.bam', - 'contigs.sorted.bam.bai', - 'igv.batch', - 'raw_evidence.bam', - 'raw_evidence.sorted.bam', - 'raw_evidence.sorted.bam.bai', - ]: - self.assertFalse( - glob_exists(self.temp_output, lib, SUBCOMMAND.VALIDATE + '/*-1', suffix), - msg=suffix, - ) - self.assertTrue( - glob_exists(self.temp_output, lib, SUBCOMMAND.VALIDATE + '/*-1', 'MAVIS-*.COMPLETE') - ) - - self.check_annotate(lib) - self.check_pairing() - self.check_summary() - - @mock.patch('os.environ', ENV.copy()) - def test_skip_clustering(self): - main([SUBCOMMAND.SETUP, CONFIG, '-o', self.temp_output, '--skip_stage', SUBCOMMAND.CLUSTER]) - self.assertTrue(glob_exists(self.temp_output, 'build.cfg')) - main([SUBCOMMAND.SCHEDULE, '-o', self.temp_output, '--submit']) - - # check that the subdirectories were built - for lib in [MOCK_GENOME + '_*', MOCK_TRANS + '_*']: - self.check_cluster(lib, skipped=True) - self.check_validate(lib) - self.check_aligner_output_files(lib) - self.check_annotate(lib) - self.check_pairing() - self.check_summary() - - @mock.patch('os.environ', ENV.copy()) - def test_skip_validation(self): - main( - 
[SUBCOMMAND.SETUP, CONFIG, '-o', self.temp_output, '--skip_stage', SUBCOMMAND.VALIDATE] - ) - - self.assertTrue(glob_exists(self.temp_output, 'build.cfg')) - main([SUBCOMMAND.SCHEDULE, '-o', self.temp_output, '--submit']) - - # check that the subdirectories were built - for lib in [MOCK_GENOME + '_*', MOCK_TRANS + '_*']: - self.check_cluster(lib) - self.assertFalse(glob_exists(self.temp_output, lib, SUBCOMMAND.VALIDATE)) - self.check_annotate(lib) - self.check_pairing() - self.check_summary() - - @mock.patch('os.environ', ENV.copy()) - def test_skip_cluster_and_validate(self): - args = [ - SUBCOMMAND.SETUP, - CONFIG, - '-o', - self.temp_output, - '--skip_stage', - SUBCOMMAND.VALIDATE, - '--skip_stage', - SUBCOMMAND.CLUSTER, - ] - main(args) - self.assertTrue(glob_exists(self.temp_output, 'build.cfg')) - main([SUBCOMMAND.SCHEDULE, '-o', self.temp_output, '--submit']) - - # check that the subdirectories were built - for lib in [MOCK_GENOME + '_*', MOCK_TRANS + '_*']: - self.check_cluster(lib, skipped=True) - self.assertFalse(glob_exists(self.temp_output, lib, SUBCOMMAND.VALIDATE)) - self.check_annotate(lib) - self.check_pairing() - self.check_summary() - - def tearDown(self): - # remove the temp directory and outputs - print_file_tree(self.temp_output) - shutil.rmtree(self.temp_output) - - -class TestSetUp(unittest.TestCase): - def setUp(self): - # create the temp output directory to store file outputs - self.temp_output = tempfile.mkdtemp() - print('output dir', self.temp_output) - - @mock.patch('os.environ', ENV.copy()) - def test_slurm(self): - os.environ['MAVIS_SCHEDULER'] = 'SLURM' - args = [SUBCOMMAND.SETUP, CONFIG, '-o', self.temp_output] - main(args) - self.assertTrue(glob_exists(self.temp_output, 'build.cfg')) - self.assertTrue(glob_exists(self.temp_output, '*', SUBCOMMAND.VALIDATE, 'submit.sh', n=2)) - self.assertTrue(glob_exists(self.temp_output, '*', SUBCOMMAND.ANNOTATE, 'submit.sh', n=2)) - self.assertTrue(glob_exists(self.temp_output, SUBCOMMAND.PAIR, 
'submit.sh')) - self.assertTrue(glob_exists(self.temp_output, SUBCOMMAND.SUMMARY, 'submit.sh')) - - @mock.patch('os.environ', ENV.copy()) - def test_slurm_skip_validate(self): - os.environ['MAVIS_SCHEDULER'] = 'SLURM' - args = [ - SUBCOMMAND.SETUP, - CONFIG, - '-o', - self.temp_output, - '--skip_stage', - SUBCOMMAND.VALIDATE, - ] - main(args) - self.assertTrue(glob_exists(self.temp_output, 'build.cfg')) - self.assertTrue(glob_not_exists(self.temp_output, '*', SUBCOMMAND.VALIDATE, 'submit.sh')) - self.assertTrue(glob_exists(self.temp_output, '*', SUBCOMMAND.ANNOTATE, 'submit.sh', n=2)) - self.assertTrue(glob_exists(self.temp_output, SUBCOMMAND.PAIR, 'submit.sh')) - self.assertTrue(glob_exists(self.temp_output, SUBCOMMAND.SUMMARY, 'submit.sh')) - - @mock.patch('os.environ', ENV.copy()) - def test_slurm_skip_cluster(self): - os.environ['MAVIS_SCHEDULER'] = 'SLURM' - args = [ - SUBCOMMAND.SETUP, - CONFIG, - '-o', - self.temp_output, - '--skip_stage', - SUBCOMMAND.CLUSTER, - ] - main(args) - self.assertTrue(glob_exists(self.temp_output, 'build.cfg')) - self.assertTrue(glob_exists(self.temp_output, '*', SUBCOMMAND.VALIDATE, 'submit.sh', n=2)) - self.assertTrue(glob_exists(self.temp_output, '*', SUBCOMMAND.ANNOTATE, 'submit.sh', n=2)) - self.assertTrue(glob_exists(self.temp_output, SUBCOMMAND.PAIR, 'submit.sh')) - self.assertTrue(glob_exists(self.temp_output, SUBCOMMAND.SUMMARY, 'submit.sh')) - - @mock.patch('os.environ', ENV.copy()) - def test_sge(self): - os.environ['MAVIS_SCHEDULER'] = 'SGE' - args = [SUBCOMMAND.SETUP, CONFIG, '-o', self.temp_output] - main(args) - self.assertTrue(glob_exists(self.temp_output, 'build.cfg')) - self.assertTrue(glob_exists(self.temp_output, '*', SUBCOMMAND.VALIDATE, 'submit.sh', n=2)) - self.assertTrue(glob_exists(self.temp_output, '*', SUBCOMMAND.ANNOTATE, 'submit.sh', n=2)) - self.assertTrue(glob_exists(self.temp_output, SUBCOMMAND.PAIR, 'submit.sh')) - self.assertTrue(glob_exists(self.temp_output, SUBCOMMAND.SUMMARY, 'submit.sh')) - - 
@mock.patch('os.environ', ENV.copy()) - def test_torque(self): - os.environ['MAVIS_SCHEDULER'] = 'TORQUE' - args = [SUBCOMMAND.SETUP, CONFIG, '-o', self.temp_output] - main(args) - self.assertTrue(glob_exists(self.temp_output, 'build.cfg')) - self.assertTrue(glob_exists(self.temp_output, '*', SUBCOMMAND.VALIDATE, 'submit.sh', n=2)) - self.assertTrue(glob_exists(self.temp_output, '*', SUBCOMMAND.ANNOTATE, 'submit.sh', n=2)) - self.assertTrue(glob_exists(self.temp_output, SUBCOMMAND.PAIR, 'submit.sh')) - self.assertTrue(glob_exists(self.temp_output, SUBCOMMAND.SUMMARY, 'submit.sh')) - - def tearDown(self): - # remove the temp directory and outputs - print_file_tree(self.temp_output) - shutil.rmtree(self.temp_output) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/end_to_end/test_help.py b/tests/end_to_end/test_help.py index df10b85e..76823d7f 100644 --- a/tests/end_to_end/test_help.py +++ b/tests/end_to_end/test_help.py @@ -28,14 +28,6 @@ def test_pipeline(self): else: self.assertEqual(0, returncode) - def test_config(self): - with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.CONFIG, '-h']): - try: - returncode = main() - except SystemExit as err: - self.assertEqual(0, err.code) - else: - self.assertEqual(0, returncode) def test_cluster(self): with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.CLUSTER, '-h']): diff --git a/tests/end_to_end/test_overlay.py b/tests/end_to_end/test_overlay.py index 2759f0c1..db664c55 100644 --- a/tests/end_to_end/test_overlay.py +++ b/tests/end_to_end/test_overlay.py @@ -1,30 +1,39 @@ +import json import os import shutil import subprocess import sys import tempfile -import unittest - from unittest.mock import patch +import pytest from mavis.constants import SUBCOMMAND from mavis.main import main -from . 
import glob_exists -from ..util import get_data - +from ..util import get_data, glob_exists ANNOTATIONS = get_data('annotations_subsample.json') BAM = get_data('mock_reads_for_events.sorted.bam') -class TestOverlayOptions(unittest.TestCase): - def setUp(self): - # create the temp output directory to store file outputs - self.temp_output = tempfile.mkdtemp() - print('output dir', self.temp_output) +@pytest.fixture +def output_dir(): + temp_output = tempfile.mkdtemp() + yield temp_output + shutil.rmtree(temp_output) + - def test_basic(self): +@pytest.fixture(scope='module') +def config_json(): + _, p = tempfile.mkstemp() + print(p) + with open(p, 'w') as fh: + fh.write(json.dumps({'reference.annotations': [ANNOTATIONS]})) + yield p + + +class TestOverlayOptions: + def test_basic(self, config_json, output_dir): with patch.object( sys, 'argv', @@ -32,22 +41,21 @@ def test_basic(self): 'mavis', SUBCOMMAND.OVERLAY, 'GAGE4', - '--annotations', - ANNOTATIONS, + '--config', + config_json, '--output', - self.temp_output, + output_dir, ], ): try: - print(sys.argv) returncode = main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - self.assertEqual(0, returncode) - self.assertTrue(glob_exists(os.path.join(self.temp_output, '*GAGE4*.svg'))) + assert returncode is None + assert glob_exists(os.path.join(output_dir, '*GAGE4*.svg')) - def test_marker(self): + def test_marker(self, config_json, output_dir): with patch.object( sys, 'argv', @@ -55,10 +63,10 @@ def test_marker(self): 'mavis', SUBCOMMAND.OVERLAY, 'GAGE4', - '--annotations', - ANNOTATIONS, + '--config', + config_json, '--output', - self.temp_output, + output_dir, '--marker', 'm', '49364900', @@ -67,12 +75,12 @@ def test_marker(self): try: returncode = main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - self.assertEqual(0, returncode) - self.assertTrue(glob_exists(os.path.join(self.temp_output, '*GAGE4*.svg'))) + assert returncode is None 
+ assert glob_exists(os.path.join(output_dir, '*GAGE4*.svg')) - def test_marker_range(self): + def test_marker_range(self, config_json, output_dir): with patch.object( sys, 'argv', @@ -80,10 +88,10 @@ def test_marker_range(self): 'mavis', SUBCOMMAND.OVERLAY, 'GAGE4', - '--annotations', - ANNOTATIONS, + '--config', + config_json, '--output', - self.temp_output, + output_dir, '--marker', 'm', '49364900', @@ -93,12 +101,12 @@ def test_marker_range(self): try: returncode = main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - self.assertEqual(0, returncode) - self.assertTrue(glob_exists(os.path.join(self.temp_output, '*GAGE4*.svg'))) + assert returncode is None + assert glob_exists(os.path.join(output_dir, '*GAGE4*.svg')) - def test_marker_not_enough_args(self): + def test_marker_not_enough_args(self, config_json, output_dir): with patch.object( sys, 'argv', @@ -106,10 +114,10 @@ def test_marker_not_enough_args(self): 'mavis', SUBCOMMAND.OVERLAY, 'GAGE4', - '--annotations', - ANNOTATIONS, + '--config', + config_json, '--output', - self.temp_output, + output_dir, '--marker', 'm', ], @@ -117,10 +125,10 @@ def test_marker_not_enough_args(self): try: returncode = main() except SystemExit as err: - self.assertNotEqual(0, err.code) + assert err.code != 0 else: - self.assertNotEqual(0, returncode) - self.assertTrue(glob_exists(os.path.join(self.temp_output, '*GAGE4*.svg'))) + assert returncode is None + assert glob_exists(os.path.join(output_dir, '*GAGE4*.svg')) with patch.object( sys, @@ -128,23 +136,23 @@ def test_marker_not_enough_args(self): [ 'mavis', SUBCOMMAND.OVERLAY, - '--annotations', - ANNOTATIONS, + '--config', + config_json, 'GAGE4', '--output', - self.temp_output, + output_dir, '--marker', ], ): try: returncode = main() except SystemExit as err: - self.assertNotEqual(0, err.code) + assert err.code != 0 else: - self.assertNotEqual(0, returncode) - self.assertTrue(glob_exists(os.path.join(self.temp_output, '*GAGE4*.svg'))) 
+ assert returncode is None + assert glob_exists(os.path.join(output_dir, '*GAGE4*.svg')) - def test_marker_not_int(self): + def test_marker_not_int(self, config_json, output_dir): with patch.object( sys, 'argv', @@ -152,10 +160,10 @@ def test_marker_not_int(self): 'mavis', SUBCOMMAND.OVERLAY, 'GAGE4', - '--annotations', - ANNOTATIONS, + '--config', + config_json, '--output', - self.temp_output, + output_dir, '--marker', 'm', 'k', @@ -164,12 +172,12 @@ def test_marker_not_int(self): try: returncode = main() except SystemExit as err: - self.assertNotEqual(0, err.code) + assert err.code != 0 else: - self.assertNotEqual(0, returncode) - self.assertTrue(glob_exists(os.path.join(self.temp_output, '*GAGE4*.svg'))) + assert returncode is None + assert glob_exists(os.path.join(output_dir, '*GAGE4*.svg')) - def test_read_depth_plot(self): + def test_read_depth_plot(self, config_json, output_dir): with patch.object( sys, 'argv', @@ -177,10 +185,10 @@ def test_read_depth_plot(self): 'mavis', SUBCOMMAND.OVERLAY, 'GAGE4', - '--annotations', - ANNOTATIONS, + '--config', + config_json, '--output', - self.temp_output, + output_dir, '--read_depth_plot', 'axis', BAM, @@ -189,12 +197,12 @@ def test_read_depth_plot(self): try: returncode = main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - self.assertEqual(0, returncode) - self.assertTrue(glob_exists(os.path.join(self.temp_output, '*GAGE4*.svg'))) + assert returncode is None + assert glob_exists(os.path.join(output_dir, '*GAGE4*.svg')) - def test_read_depth_plot_binned(self): + def test_read_depth_plot_binned(self, config_json, output_dir): with patch.object( sys, 'argv', @@ -202,10 +210,10 @@ def test_read_depth_plot_binned(self): 'mavis', SUBCOMMAND.OVERLAY, 'GAGE4', - '--annotations', - ANNOTATIONS, + '--config', + config_json, '--output', - self.temp_output, + output_dir, '--read_depth_plot', 'axis', BAM, @@ -215,12 +223,12 @@ def test_read_depth_plot_binned(self): try: returncode = 
main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - self.assertEqual(0, returncode) - self.assertTrue(glob_exists(os.path.join(self.temp_output, '*GAGE4*.svg'))) + assert returncode is None + assert glob_exists(os.path.join(output_dir, '*GAGE4*.svg')) - def test_read_depth_plot_not_binned_but_stranded(self): + def test_read_depth_plot_not_binned_but_stranded(self, config_json, output_dir): # no ymax with patch.object( sys, @@ -229,10 +237,10 @@ def test_read_depth_plot_not_binned_but_stranded(self): 'mavis', SUBCOMMAND.OVERLAY, 'GAGE4', - '--annotations', - ANNOTATIONS, + '--config', + config_json, '--output', - self.temp_output, + output_dir, '--read_depth_plot', 'axis', BAM, @@ -244,11 +252,7 @@ def test_read_depth_plot_not_binned_but_stranded(self): try: returncode = main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - self.assertEqual(0, returncode) - self.assertTrue(glob_exists(os.path.join(self.temp_output, '*GAGE4*.svg'))) - - def tearDown(self): - # remove the temp directory and outputs - shutil.rmtree(self.temp_output) + assert returncode is None + assert glob_exists(os.path.join(output_dir, '*GAGE4*.svg')) diff --git a/tests/end_to_end/test_pairing.py b/tests/end_to_end/test_pairing.py deleted file mode 100644 index 769c1c9e..00000000 --- a/tests/end_to_end/test_pairing.py +++ /dev/null @@ -1,51 +0,0 @@ -import os -import shutil -import sys -import tempfile -import unittest -from unittest.mock import patch - -from mavis.constants import SUBCOMMAND -from mavis.main import main -from mavis.util import read_bpp_from_input_file - -from ..util import get_data - -TEMP_OUTPUT = None - - -def setUpModule(): - global TEMP_OUTPUT - # create the temp output directory to store file outputs - TEMP_OUTPUT = tempfile.mkdtemp() - - -class TestPairing(unittest.TestCase): - def test_pairing(self): - args = [ - 'mavis', - SUBCOMMAND.PAIR, - '-n', - get_data('pairing_annotations.tab'), - 
'-o', - TEMP_OUTPUT, - '--annotations', - get_data('pairing_reference_annotations_file.tab'), - ] - with patch.object(sys, 'argv', args): - self.assertEqual(0, main()) - # make sure the output file exists - output = os.path.join(TEMP_OUTPUT, 'mavis_paired_A36971_A36973.tab') - self.assertTrue(os.path.exists(output)) - # check that the expected pairings are present - bpps = read_bpp_from_input_file(output, expand_strand=False, expand_orient=False) - self.assertEqual(6, len(bpps)) - - -def tearDownModule(): - # remove the temp directory and outputs - shutil.rmtree(TEMP_OUTPUT) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/end_to_end/test_ref_alt_count.py b/tests/end_to_end/test_ref_alt_count.py index e5ec5a3d..4c30fb81 100644 --- a/tests/end_to_end/test_ref_alt_count.py +++ b/tests/end_to_end/test_ref_alt_count.py @@ -1,15 +1,14 @@ -import tempfile -import unittest import os import shutil +import tempfile +import unittest from mavis.annotate.file_io import load_reference_genome from mavis.breakpoint import Breakpoint, BreakpointPair from mavis.constants import ORIENT, SVTYPE from tools.calculate_ref_alt_counts import RefAltCalculator -from ..util import get_data -from . 
import glob_exists +from ..util import get_data, glob_exists def setUpModule(): diff --git a/tests/full-tutorial.config.json b/tests/full-tutorial.config.json new file mode 100644 index 00000000..bf125870 --- /dev/null +++ b/tests/full-tutorial.config.json @@ -0,0 +1,98 @@ +{ + "annotate.draw_fusions_only": true, + "cluster.min_clusters_per_file": 100, + "cluster.uninformative_filter": true, + "convert": { + "breakdancer": { + "assume_no_untemplated": true, + "file_type": "breakdancer", + "inputs": [ + "tutorial_data/breakdancer-1.4.5/*txt" + ] + }, + "breakseq": { + "assume_no_untemplated": true, + "file_type": "breakseq", + "inputs": [ + "tutorial_data/breakseq-2.2/breakseq.vcf.gz" + ] + }, + "chimerascan": { + "assume_no_untemplated": true, + "file_type": "chimerascan", + "inputs": [ + "tutorial_data/chimerascan-0.4.5/chimeras.bedpe" + ] + }, + "defuse": { + "assume_no_untemplated": true, + "file_type": "defuse", + "inputs": [ + "tutorial_data/defuse-0.6.2/results.classify.tsv" + ] + }, + "manta": { + "assume_no_untemplated": true, + "file_type": "manta", + "inputs": [ + "tutorial_data/manta-1.0.0/diploidSV.vcf.gz", + "tutorial_data/manta-1.0.0/somaticSV.vcf" + ] + } + }, + "libraries": { + "L1522785992-normal": { + "assign": [ + "breakdancer", + "breakseq", + "manta" + ], + "bam_file": "tutorial_data/L1522785992_normal.sorted.bam", + "disease_status": "normal", + "protocol": "genome" + }, + "L1522785992-trans": { + "assign": [ + "chimerascan", + "defuse" + ], + "bam_file": "tutorial_data/L1522785992_trans.sorted.bam", + "disease_status": "diseased", + "protocol": "transcriptome", + "strand_specific": true + }, + "L1522785992-tumour": { + "assign": [ + "breakdancer", + "breakseq", + "manta" + ], + "bam_file": "tutorial_data/L1522785992_tumour.sorted.bam", + "disease_status": "diseased", + "protocol": "genome" + } + }, + "output_dir": "output_dir_full", + "reference.aligner_reference": [ + "reference_inputs/hg19.2bit" + ], + "reference.annotations": [ + 
"reference_inputs/ensembl69_hg19_annotations.json" + ], + "reference.dgv_annotation": [ + "reference_inputs/dgv_hg19_variants.tab" + ], + "reference.masking": [ + "reference_inputs/hg19_masking.tab" + ], + "reference.reference_genome": [ + "reference_inputs/hg19.fa" + ], + "reference.template_metadata": [ + "reference_inputs/cytoBand.txt" + ], + "summary.filter_min_flanking_reads": 10, + "summary.filter_min_linking_split_reads": 1, + "summary.filter_min_remapped_reads": 5, + "summary.filter_min_spanning_reads": 5 +} diff --git a/tests/integration/schedule/__init__.py b/tests/integration/schedule/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/integration/schedule/test_pipeline.py b/tests/integration/schedule/test_pipeline.py deleted file mode 100644 index 4ae43b33..00000000 --- a/tests/integration/schedule/test_pipeline.py +++ /dev/null @@ -1,157 +0,0 @@ -import unittest -from unittest import mock -import configparser -import tempfile -import shutil -import os - -from mavis.schedule import pipeline as _pipeline -from mavis.schedule import scheduler -from mavis.main import main - -from ...util import get_data - - -class TestTime(unittest.TestCase): - def test_time(self): - self.assertEqual('0:20:00', scheduler.time_format(1200)) - self.assertEqual('1:00:00', scheduler.time_format(3600)) - self.assertEqual('25:25:25', scheduler.time_format(91525)) - - -class TestReadBuildFile(unittest.TestCase): - - # TODO: test_skip_validate - # TODO: test_no_skip - # TODO: test_missing_summary - # TODO: test_missing_pairing - # TODO: test_missing_annotations - # TODO: test_error_on_config_not_exists - # TODO: test_loading_unsubmitted - # TODO: test_loading_submitted - # TODO: test_loading_completed - # TODO: test_missing_validations - # TODO: test_missing_dependency_job - - def setUp(self): - self.exists_patcher = mock.patch('os.path.exists') - self.exists_patcher.start().return_value = True - - def read_mock_config(self, content): - with 
mock.patch('configparser.ConfigParser.read', configparser.ConfigParser.read_string): - return _pipeline.Pipeline.read_build_file(content) - - def test_torque(self): - pipeline = _pipeline.Pipeline.read_build_file(get_data('torque_build.cfg')) - self.assertEqual(3, len(pipeline.validations)) - self.assertEqual(3, len(pipeline.annotations)) - self.assertIn(pipeline.annotations[0].dependencies[0], pipeline.validations) - self.assertIn(pipeline.pairing, pipeline.summary.dependencies) - - def test_basic(self): - content = """ -[general] -output_dir = temp -scheduler = SLURM -batch_id = 1 - -[job1] -stage = validate -task_list = 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 -name = job1 -output_dir = temp2 - - -[job2] -stage = annotate -name = job2 -dependencies = job1 -output_dir = temp3 - -[job3] -stage = pairing -name = job3 -dependencies = job2 -output_dir = temp4 - -[job4] -stage = summary -name = job4 -dependencies = job3 -output_dir = temp5 - """ - result = self.read_mock_config(content) - self.assertEqual('job3', result.pairing.name) - self.assertEqual('job1', result.validations[0].name) - self.assertEqual('job2', result.annotations[0].name) - self.assertEqual(result.validations[0], result.annotations[0].dependencies[0]) - self.assertEqual(result.annotations[0], result.pairing.dependencies[0]) - self.assertEqual(result.pairing, result.summary.dependencies[0]) - - def test_parsed_types(self): - build = _pipeline.Pipeline.read_build_file(get_data('build.cfg')) - self.assertIs(build.validations[0].import_env, True) - self.assertIs(build.scheduler.concurrency_limit, None) - - def tearDown(self): - self.exists_patcher.stop() - - -class TestBuildPipeline(unittest.TestCase): - def setUp(self): - self.temp_output = tempfile.mkdtemp() - # clear any environment variables - self.env_patch = mock.patch( - 'os.environ', {k: v for k, v in os.environ.items() if not k.startswith('MAVIS_')} - ) - self.env_patch.start() - - def test_basic_slurm(self): - os.environ['MAVIS_SCHEDULER'] = 
'SLURM' - config = get_data('pipeline_config.cfg') - - with mock.patch('sys.argv', ['mavis', 'setup', '--output', self.temp_output, config]): - self.assertEqual(0, main()) - build_file = os.path.join(self.temp_output, 'build.cfg') - with open(build_file, 'r') as fh: - print(fh.read()) - build = _pipeline.Pipeline.read_build_file(build_file) - print(build) - self.assertGreaterEqual(len(build.validations), 1) - self.assertGreaterEqual(len(build.annotations), 1) - self.assertEqual(2, len(build.pairing.dependencies)) - self.assertIsNotNone(build.pairing) - self.assertIsNotNone(build.summary) - - def test_basic_sge(self): - os.environ['MAVIS_SCHEDULER'] = 'SGE' - config = get_data('pipeline_config.cfg') - - with mock.patch('sys.argv', ['mavis', 'setup', '--output', self.temp_output, config]): - self.assertEqual(0, main()) - build_file = os.path.join(self.temp_output, 'build.cfg') - with open(build_file, 'r') as fh: - print(fh.read()) - build = _pipeline.Pipeline.read_build_file(build_file) - print(build) - self.assertGreaterEqual(len(build.validations), 1) - self.assertGreaterEqual(len(build.annotations), 1) - self.assertIsNotNone(build.pairing) - self.assertIsNotNone(build.summary) - - # TODO: test_basic_submit - # TODO: test pipeline failure - # TODO: test conversion failure - - def tearDown(self): - shutil.rmtree(self.temp_output) - self.env_patch.stop() diff --git a/tests/integration/schedule/test_sge.py b/tests/integration/schedule/test_sge.py deleted file mode 100644 index 172c3931..00000000 --- a/tests/integration/schedule/test_sge.py +++ /dev/null @@ -1,748 +0,0 @@ -import subprocess -import unittest -from unittest import mock - -from mavis.schedule import job as _job -from mavis.schedule import scheduler as _scheduler -from mavis.schedule import constants as _constants -from mavis.constants import SUBCOMMAND - -QACCT_ARR3_OK = """ -============================================================== -qname merge.q -hostname n601.numbers.bcgsc.ca -group users -owner 
creisle -project NONE -department defaultdepartment -jobname arrtest -jobnumber 3757289 -taskid 1 -account sge -priority 0 -qsub_time Thu May 24 10:54:05 2018 -start_time Thu May 24 10:54:12 2018 -end_time Thu May 24 10:55:12 2018 -granted_pe NONE -slots 1 -failed 0 -exit_status 0 -ru_wallclock 60s -ru_utime 0.057s -ru_stime 0.087s -ru_maxrss 5.160KB -ru_ixrss 0.000B -ru_ismrss 0.000B -ru_idrss 0.000B -ru_isrss 0.000B -ru_minflt 20948 -ru_majflt 0 -ru_nswap 0 -ru_inblock 0 -ru_oublock 8 -ru_msgsnd 0 -ru_msgrcv 0 -ru_nsignals 0 -ru_nvcsw 224 -ru_nivcsw 59 -cpu 0.144s -mem 0.000GBs -io 0.001GB -iow 0.000s -maxvmem 1.934MB -arid undefined -ar_sub_time undefined -category -U transabyss_users -============================================================== -qname merge.q -hostname n602.numbers.bcgsc.ca -group users -owner creisle -project NONE -department defaultdepartment -jobname arrtest -jobnumber 3757289 -taskid 3 -account sge -priority 0 -qsub_time Thu May 24 10:54:05 2018 -start_time Thu May 24 10:54:12 2018 -end_time Thu May 24 10:55:12 2018 -granted_pe NONE -slots 1 -failed 0 -exit_status 0 -ru_wallclock 60s -ru_utime 0.063s -ru_stime 0.079s -ru_maxrss 5.156KB -ru_ixrss 0.000B -ru_ismrss 0.000B -ru_idrss 0.000B -ru_isrss 0.000B -ru_minflt 20954 -ru_majflt 0 -ru_nswap 0 -ru_inblock 0 -ru_oublock 8 -ru_msgsnd 0 -ru_msgrcv 0 -ru_nsignals 0 -ru_nvcsw 220 -ru_nivcsw 65 -cpu 0.142s -mem 0.000GBs -io 0.001GB -iow 0.000s -maxvmem 1.934MB -arid undefined -ar_sub_time undefined -category -U transabyss_users -============================================================== -qname merge.q -hostname n604.numbers.bcgsc.ca -group users -owner creisle -project NONE -department defaultdepartment -jobname arrtest -jobnumber 3757289 -taskid 2 -account sge -priority 0 -qsub_time Thu May 24 10:54:05 2018 -start_time Thu May 24 10:54:17 2018 -end_time Thu May 24 10:55:17 2018 -granted_pe NONE -slots 1 -failed 0 -exit_status 0 -ru_wallclock 60s -ru_utime 0.055s -ru_stime 0.086s 
-ru_maxrss 5.156KB -ru_ixrss 0.000B -ru_ismrss 0.000B -ru_idrss 0.000B -ru_isrss 0.000B -ru_minflt 20954 -ru_majflt 0 -ru_nswap 0 -ru_inblock 0 -ru_oublock 8 -ru_msgsnd 0 -ru_msgrcv 0 -ru_nsignals 0 -ru_nvcsw 218 -ru_nivcsw 66 -cpu 0.141s -mem 0.000GBs -io 0.001GB -iow 0.000s -maxvmem 1.930MB -arid undefined -ar_sub_time undefined -category -U transabyss_users -""" - - -class TestUpdate(unittest.TestCase): - # TODO: status of array job - # TODO: status of single job - # TODO: status of job waiting on dependency - - @mock.patch('subprocess.check_output') - def test_job_array_waiting(self, patch_check): - content = """ -job-ID prior name user state submit/start at queue slots ja-task-ID ------------------------------------------------------------------------------------------------------------------ -3751935 0.00000 subtest.sh creisle qw 05/23/2018 13:44:04 1 1-10:1 - """.encode( - 'utf8' - ) - patch_check.return_value = content - job = _job.ArrayJob(output_dir='temp', job_ident='3751935', task_list=10, stage='validate') - _scheduler.SgeScheduler().update_info(job) - self.assertEqual(_constants.JOB_STATUS.PENDING, job.status) - - @mock.patch('subprocess.check_output') - def test_job_array(self, patch_check): - content = """ -job-ID prior name user state submit/start at queue slots ja-task-ID ------------------------------------------------------------------------------------------------------------------ -3751935 0.50500 subtest.sh creisle r 05/23/2018 13:44:12 merge.q@n601.numbers.bcgsc.ca 1 1 -3751935 0.50500 subtest.sh creisle r 05/23/2018 13:44:12 merge.q@n602.numbers.bcgsc.ca 1 2 -3751935 0.50500 subtest.sh creisle r 05/23/2018 13:44:12 merge.q@n604.numbers.bcgsc.ca 1 3 -3751935 0.50500 subtest.sh creisle r 05/23/2018 13:44:12 merge.q@n603.numbers.bcgsc.ca 1 4 -3751935 0.50500 subtest.sh creisle r 05/23/2018 13:44:12 merge.q@n601.numbers.bcgsc.ca 1 5 -3751935 0.50500 subtest.sh creisle r 05/23/2018 13:44:12 merge.q@n602.numbers.bcgsc.ca 1 6 -3751935 0.50500 
subtest.sh creisle r 05/23/2018 13:44:12 merge.q@n604.numbers.bcgsc.ca 1 7 -3751935 0.50500 subtest.sh creisle r 05/23/2018 13:44:12 merge.q@n603.numbers.bcgsc.ca 1 8 -3751935 0.50500 subtest.sh creisle r 05/23/2018 13:44:12 merge.q@n601.numbers.bcgsc.ca 1 9 -3751935 0.50500 subtest.sh creisle qw 05/23/2018 13:44:12 merge.q@n602.numbers.bcgsc.ca 1 10 - """.encode( - 'utf8' - ) - patch_check.return_value = content - job = _job.ArrayJob(output_dir='temp', job_ident='3751935', task_list=10, stage='validate') - _scheduler.SgeScheduler().update_info(job) - - for task in job.task_list[:9]: - self.assertEqual(_constants.JOB_STATUS.RUNNING, task.status) - self.assertEqual(_constants.JOB_STATUS.PENDING, job.task_list[-1].status) - self.assertEqual(_constants.JOB_STATUS.PENDING, job.status) - - @mock.patch('subprocess.check_output') - def test_single_job(self, patch_check): - content = """ -job-ID prior name user state submit/start at queue slots ja-task-ID ------------------------------------------------------------------------------------------------------------------ - 217940 1.50000 subtest.sh creisle qw 05/22/2018 23:39:55 1 - """.encode( - 'utf8' - ) - patch_check.return_value = content - job = _job.Job(output_dir='temp', job_ident='217940', stage='validate') - _scheduler.SgeScheduler().update_info(job) - self.assertEqual(_constants.JOB_STATUS.PENDING, job.status) - - @mock.patch('subprocess.check_output') - def test_completed_array(self, patch_check): - patch_check.side_effect = [''.encode('utf8'), QACCT_ARR3_OK.encode('utf8')] - job = _job.ArrayJob( - output_dir='temp', job_ident='3757289', stage='validate', name='arrtest', task_list=3 - ) - _scheduler.SgeScheduler().update_info(job) - self.assertEqual(_constants.JOB_STATUS.COMPLETED, job.status) - for task in job.task_list: - self.assertEqual(_constants.JOB_STATUS.COMPLETED, task.status) - - -class TestParseQacct(unittest.TestCase): - def test_job_array(self): - content = QACCT_ARR3_OK - rows = 
_scheduler.SgeScheduler().parse_qacct(content) - expected = { - 'job_ident': '3757289', - 'name': 'arrtest', - 'status': _constants.JOB_STATUS.COMPLETED, - 'status_comment': '', - } - for task_id, row in zip([1, 3, 2], rows): - exp = {'task_ident': str(task_id)} - exp.update(expected) - self.assertEqual(exp, row) - - def test_passed(self): - content = """ -============================================================== -qname transabyss.q -hostname tac3n15.hpc.bcgsc.ca -group users -owner bioapps -project NONE -department defaultdepartment -jobname A89009negative -jobnumber 3744253 -taskid 40 -account sge -priority 0 -qsub_time Tue May 22 09:26:31 2018 -start_time Tue May 22 10:32:42 2018 -end_time Tue May 22 13:28:32 2018 -granted_pe openmpi -slots 8 -failed 0 -exit_status 0 -ru_wallclock 10550s -ru_utime 42298.581s -ru_stime 34509.422s -ru_maxrss 2.608MB -ru_ixrss 0.000B -ru_ismrss 0.000B -ru_idrss 0.000B -ru_isrss 0.000B -ru_minflt 5382919 -ru_majflt 978 -ru_nswap 0 -ru_inblock 14027520 -ru_oublock 9259368 -ru_msgsnd 0 -ru_msgrcv 0 -ru_nsignals 0 -ru_nvcsw 20635137 -ru_nivcsw 14100587 -cpu 76808.002s -mem 119.207KGBs -io 579.782GB -iow 0.000s -maxvmem 14.885GB -arid undefined -ar_sub_time undefined -category -U transabyss_users -q transabyss.q -l h_vmem=3.85G,mem_free=3.85G,mem_token=3.85G -pe openmpi 8 - """ - rows = _scheduler.SgeScheduler().parse_qacct(content) - self.assertEqual(1, len(rows)) - expected = { - 'job_ident': '3744253', - 'task_ident': '40', - 'name': 'A89009negative', - 'status': _constants.JOB_STATUS.COMPLETED, - 'status_comment': '', - } - self.assertEqual(expected, rows[0]) - - def test_non_zero_exit(self): - content = """ -============================================================== -qname merge.q -hostname n603.numbers.bcgsc.ca -group users -owner creisle -project NONE -department defaultdepartment -jobname error -jobnumber 3755560 -taskid undefined -account sge -priority 0 -qsub_time Thu May 24 09:42:58 2018 -start_time Thu May 24 
09:43:12 2018 -end_time Thu May 24 09:44:12 2018 -granted_pe NONE -slots 1 -failed 0 -exit_status 1 -ru_wallclock 60s -ru_utime 0.054s -ru_stime 0.088s -ru_maxrss 5.148KB -ru_ixrss 0.000B -ru_ismrss 0.000B -ru_idrss 0.000B -ru_isrss 0.000B -ru_minflt 21134 -ru_majflt 0 -ru_nswap 0 -ru_inblock 8 -ru_oublock 16 -ru_msgsnd 0 -ru_msgrcv 0 -ru_nsignals 0 -ru_nvcsw 228 -ru_nivcsw 62 -cpu 0.142s -mem 0.000GBs -io 0.001GB -iow 0.000s -maxvmem 1.926MB -arid undefined -ar_sub_time undefined -category -U transabyss_users - """ - - rows = _scheduler.SgeScheduler().parse_qacct(content) - self.assertEqual(1, len(rows)) - expected = { - 'job_ident': '3755560', - 'task_ident': None, - 'name': 'error', - 'status': _constants.JOB_STATUS.FAILED, - 'status_comment': '', - } - self.assertEqual(expected, rows[0]) - - def test_failed(self): - content = """ -============================================================== -qname merge.q -hostname n603.numbers.bcgsc.ca -group users -owner creisle -project NONE -department defaultdepartment -jobname MV_mock-A36971_batch-E6aEZJnTQAau598tcsMjAE -jobnumber 3760712 -taskid 1 -account sge -priority 0 -qsub_time Thu May 24 13:35:02 2018 -start_time -/- -end_time -/- -granted_pe NONE -slots 1 -failed 26 : opening input/output file -exit_status 0 -ru_wallclock 0s -ru_utime 0.000s -ru_stime 0.000s -ru_maxrss 0.000B -ru_ixrss 0.000B -ru_ismrss 0.000B -ru_idrss 0.000B -ru_isrss 0.000B -ru_minflt 0 -ru_majflt 0 -ru_nswap 0 -ru_inblock 0 -ru_oublock 0 -ru_msgsnd 0 -ru_msgrcv 0 -ru_nsignals 0 -ru_nvcsw 0 -ru_nivcsw 0 -cpu 0.000s -mem 0.000GBs -io 0.000GB -iow 0.000s -maxvmem 0.000B -arid undefined -ar_sub_time undefined -category -U transabyss_users -l h_rt=57600,h_vmem=16000M,mem_free=16000M,mem_token=16000M - """ - rows = _scheduler.SgeScheduler().parse_qacct(content) - self.assertEqual(1, len(rows)) - expected = { - 'job_ident': '3760712', - 'task_ident': '1', - 'name': 'MV_mock-A36971_batch-E6aEZJnTQAau598tcsMjAE', - 'status': 
_constants.JOB_STATUS.FAILED, - 'status_comment': 'opening input/output file', - } - self.assertEqual(expected, rows[0]) - - def test_cancelled(self): - content = """ -============================================================== -qname merge.q -hostname n603.numbers.bcgsc.ca -group users -owner creisle -project NONE -department defaultdepartment -jobname arrtest -jobnumber 3757249 -taskid undefined -account sge -priority 0 -qsub_time Thu May 24 10:50:27 2018 -start_time Thu May 24 10:50:45 2018 -end_time Thu May 24 10:51:09 2018 -granted_pe NONE -slots 1 -failed 100 : assumedly after job -exit_status 137 (Killed) -ru_wallclock 24s -ru_utime 0.052s -ru_stime 0.088s -ru_maxrss 5.160KB -ru_ixrss 0.000B -ru_ismrss 0.000B -ru_idrss 0.000B -ru_isrss 0.000B -ru_minflt 20737 -ru_majflt 0 -ru_nswap 0 -ru_inblock 0 -ru_oublock 8 -ru_msgsnd 0 -ru_msgrcv 0 -ru_nsignals 0 -ru_nvcsw 215 -ru_nivcsw 63 -cpu 0.140s -mem 0.000GBs -io 0.001GB -iow 0.000s -maxvmem 1.934MB -arid undefined -ar_sub_time undefined -category -U transabyss_users - """ - rows = _scheduler.SgeScheduler().parse_qacct(content) - self.assertEqual(1, len(rows)) - expected = { - 'job_ident': '3757249', - 'task_ident': None, - 'name': 'arrtest', - 'status': _constants.JOB_STATUS.CANCELLED, - 'status_comment': 'assumedly after job', - } - self.assertEqual(expected, rows[0]) - - def test_job_not_found(self): - content = """ -Total System Usage - WALLCLOCK UTIME STIME CPU MEMORY IO IOW -================================================================================================================ - 3786481073 6713770428.951 4374477378.582 11585461604.347 187237653407.317 156350319.140 0.000 - """ - with self.assertRaises(ValueError): - _scheduler.SgeScheduler().parse_qacct(content) - - -class TestParseQstat(unittest.TestCase): - def test_single_job(self): - content = """ -job-ID prior name user state submit/start at queue slots ja-task-ID 
------------------------------------------------------------------------------------------------------------------ - 217940 1.50000 subtest.sh creisle qw 05/22/2018 23:39:55 1 - """ - rows = _scheduler.SgeScheduler().parse_qstat(content, '217940') - self.assertEqual(1, len(rows)) - expected = { - 'job_ident': '217940', - 'task_ident': None, - 'status': _constants.JOB_STATUS.PENDING, - 'name': 'subtest.sh', - 'status_comment': '', - } - self.assertEqual(expected, rows[0]) - - def test_no_jobs_found(self): - rows = _scheduler.SgeScheduler().parse_qstat("", '217940') - self.assertEqual([], rows) - - -class TestCancel(unittest.TestCase): - @mock.patch('mavis.schedule.scheduler.SgeScheduler.command') - def test_single_job(self, patcher): - sched = _scheduler.SgeScheduler() - job = _job.Job(SUBCOMMAND.VALIDATE, '', job_ident='1234') - sched.cancel(job) - self.assertEqual(_constants.JOB_STATUS.CANCELLED, job.status) - patcher.assert_called_with(['qdel', '1234']) - - @mock.patch('mavis.schedule.scheduler.SgeScheduler.command') - def test_array_job(self, patcher): - sched = _scheduler.SgeScheduler() - job = _job.ArrayJob(SUBCOMMAND.VALIDATE, 10, output_dir='', job_ident='1234') - sched.cancel(job) - self.assertEqual(_constants.JOB_STATUS.CANCELLED, job.status) - for task in job.task_list: - self.assertEqual(_constants.JOB_STATUS.CANCELLED, task.status) - patcher.assert_called_with(['qdel', '1234']) - - @mock.patch('mavis.schedule.scheduler.SgeScheduler.command') - def test_array_job_task(self, patcher): - sched = _scheduler.SgeScheduler() - job = _job.ArrayJob(SUBCOMMAND.VALIDATE, 10, output_dir='', job_ident='1234') - sched.cancel(job, task_ident=4) - self.assertEqual(_constants.JOB_STATUS.NOT_SUBMITTED, job.status) - for i, task in enumerate(job.task_list): - if i == 3: - self.assertEqual(_constants.JOB_STATUS.CANCELLED, task.status) - else: - self.assertEqual(_constants.JOB_STATUS.NOT_SUBMITTED, task.status) - patcher.assert_called_with(['qdel', '1234', '-t', '4']) - - 
@mock.patch('mavis.schedule.scheduler.SgeScheduler.command') - def test_bad_command(self, patcher): - patcher.side_effect = [subprocess.CalledProcessError(1, 'command')] - sched = _scheduler.SgeScheduler() - job = _job.Job(SUBCOMMAND.VALIDATE, '', job_ident='1234') - sched.cancel(job) - self.assertEqual(_constants.JOB_STATUS.NOT_SUBMITTED, job.status) - - -class TestSubmit(unittest.TestCase): - @mock.patch('mavis.schedule.scheduler.SgeScheduler.command') - def test_job(self, patcher): - patcher.side_effect = ['Your job 3891651 ("MV1") has been submitted'] - job = _job.Job( - SUBCOMMAND.VALIDATE, - queue='all', - output_dir='output_dir', - script='script.sh', - name='MV1', - memory_limit=1, - ) - sched = _scheduler.SgeScheduler() - sched.submit(job) - self.assertEqual('3891651', job.job_ident) - self.assertEqual(_constants.JOB_STATUS.SUBMITTED, job.status) - patcher.assert_called_with( - 'qsub -j y -q all -l mem_free=1M,mem_token=1M,h_vmem=1M -l h_rt=16:00:00 -V ' - '-N MV1 -o output_dir/job-\\$JOB_NAME-\\$JOB_ID.log script.sh', - shell=True, - ) - - @mock.patch('mavis.schedule.scheduler.SgeScheduler.command') - def test_job_with_array_dep(self, patcher): - patcher.side_effect = ['Your job 3891651 ("MV1") has been submitted'] - job = _job.Job( - SUBCOMMAND.VALIDATE, - queue='all', - output_dir='output_dir', - script='script.sh', - name='MV1', - memory_limit=1, - mail_user='me@example.com', - mail_type=_constants.MAIL_TYPE.ALL, - ) - dep = _job.ArrayJob( - job_ident='1234', task_list=10, output_dir='', stage=SUBCOMMAND.VALIDATE - ) - job.dependencies.append(dep) - sched = _scheduler.SgeScheduler() - sched.submit(job) - self.assertEqual('3891651', job.job_ident) - self.assertEqual(_constants.JOB_STATUS.SUBMITTED, job.status) - patcher.assert_called_with( - 'qsub -j y -q all -l mem_free=1M,mem_token=1M,h_vmem=1M -l h_rt=16:00:00 -V ' - '-hold_jid 1234 -N MV1 -m abes -M me@example.com ' - '-o output_dir/job-\\$JOB_NAME-\\$JOB_ID.log script.sh', - shell=True, - ) - - 
@mock.patch('mavis.schedule.scheduler.SgeScheduler.command') - def test_job_with_job_dep(self, patcher): - patcher.side_effect = ['Your job 3891651 ("MV1") has been submitted'] - job = _job.Job( - SUBCOMMAND.VALIDATE, - queue='all', - output_dir='output_dir', - script='script.sh', - name='MV1', - memory_limit=1, - mail_user='me@example.com', - mail_type=_constants.MAIL_TYPE.ALL, - ) - dep = _job.Job(job_ident='1234', output_dir='', stage=SUBCOMMAND.VALIDATE) - job.dependencies.append(dep) - sched = _scheduler.SgeScheduler() - sched.submit(job) - self.assertEqual('3891651', job.job_ident) - self.assertEqual(_constants.JOB_STATUS.SUBMITTED, job.status) - patcher.assert_called_with( - 'qsub -j y -q all -l mem_free=1M,mem_token=1M,h_vmem=1M -l h_rt=16:00:00 -V ' - '-hold_jid 1234 -N MV1 -m abes -M me@example.com ' - '-o output_dir/job-\\$JOB_NAME-\\$JOB_ID.log script.sh', - shell=True, - ) - - @mock.patch('mavis.schedule.scheduler.SgeScheduler.command') - def test_array_job(self, patcher): - patcher.side_effect = ['Your job-array 3891657.2-4:1 ("MV1") has been submitted'] - job = _job.ArrayJob( - stage=SUBCOMMAND.VALIDATE, - output_dir='output_dir', - script='script.sh', - name='MV1', - task_list=[2, 3, 4], - memory_limit=1, - ) - sched = _scheduler.SgeScheduler(concurrency_limit=2) - sched.submit(job) - self.assertEqual('3891657', job.job_ident) - self.assertEqual(_constants.JOB_STATUS.SUBMITTED, job.status) - - patcher.assert_called_with( - 'qsub -j y -l mem_free=1M,mem_token=1M,h_vmem=1M -l h_rt=16:00:00 -V ' - '-N MV1 -t 2-4 -o output_dir/job-\\$JOB_NAME-\\$JOB_ID-\\$TASK_ID.log script.sh', - shell=True, - ) - - @mock.patch('mavis.schedule.scheduler.SgeScheduler.command') - def test_array_job_with_job_dep(self, patcher): - patcher.side_effect = ['Your job-array 3891657.2-4:1 ("MV1") has been submitted'] - job = _job.ArrayJob( - stage=SUBCOMMAND.VALIDATE, - output_dir='output_dir', - script='script.sh', - name='MV1', - task_list=[2, 3, 4], - memory_limit=1, - ) - 
sched = _scheduler.SgeScheduler(concurrency_limit=2) - - dep = _job.Job(job_ident='1234', output_dir='', stage=SUBCOMMAND.VALIDATE) - job.dependencies.append(dep) - - sched.submit(job) - self.assertEqual('3891657', job.job_ident) - self.assertEqual(_constants.JOB_STATUS.SUBMITTED, job.status) - - patcher.assert_called_with( - 'qsub -j y -l mem_free=1M,mem_token=1M,h_vmem=1M -l h_rt=16:00:00 -V ' - '-hold_jid 1234 ' - '-N MV1 -t 2-4 -o output_dir/job-\\$JOB_NAME-\\$JOB_ID-\\$TASK_ID.log script.sh', - shell=True, - ) - - @mock.patch('mavis.schedule.scheduler.SgeScheduler.command') - def test_array_job_with_array_dep(self, patcher): - patcher.side_effect = ['Your job-array 3891657.2-4:1 ("MV1") has been submitted'] - job = _job.ArrayJob( - stage=SUBCOMMAND.VALIDATE, - output_dir='output_dir', - script='script.sh', - name='MV1', - task_list=[2, 3, 4], - memory_limit=1, - ) - sched = _scheduler.SgeScheduler(concurrency_limit=2) - - dep = _job.ArrayJob( - job_ident='1234', task_list=[2, 3, 4], output_dir='', stage=SUBCOMMAND.VALIDATE - ) - job.dependencies.append(dep) - - sched.submit(job) - self.assertEqual('3891657', job.job_ident) - self.assertEqual(_constants.JOB_STATUS.SUBMITTED, job.status) - - patcher.assert_called_with( - 'qsub -j y -l mem_free=1M,mem_token=1M,h_vmem=1M -l h_rt=16:00:00 -V ' - '-hold_jid_ad 1234 ' - '-N MV1 -t 2-4 -o output_dir/job-\\$JOB_NAME-\\$JOB_ID-\\$TASK_ID.log script.sh', - shell=True, - ) - - @mock.patch('mavis.schedule.scheduler.SgeScheduler.command') - def test_array_job_with_diff_array(self, patcher): - patcher.side_effect = ['Your job-array 3891657.2-4:1 ("MV1") has been submitted'] - job = _job.ArrayJob( - stage=SUBCOMMAND.VALIDATE, - output_dir='output_dir', - script='script.sh', - name='MV1', - task_list=[2, 3, 4], - memory_limit=1, - ) - sched = _scheduler.SgeScheduler(concurrency_limit=2) - - dep = _job.ArrayJob( - job_ident='1234', task_list=[2, 3, 4, 5], output_dir='', stage=SUBCOMMAND.VALIDATE - ) - 
job.dependencies.append(dep) - - sched.submit(job) - self.assertEqual('3891657', job.job_ident) - self.assertEqual(_constants.JOB_STATUS.SUBMITTED, job.status) - - patcher.assert_called_with( - 'qsub -j y -l mem_free=1M,mem_token=1M,h_vmem=1M -l h_rt=16:00:00 -V ' - '-hold_jid 1234 ' - '-N MV1 -t 2-4 -o output_dir/job-\\$JOB_NAME-\\$JOB_ID-\\$TASK_ID.log script.sh', - shell=True, - ) - - def test_array_job_non_consec_error(self): - job = _job.ArrayJob( - stage=SUBCOMMAND.VALIDATE, - output_dir='output_dir', - script='script.sh', - name='MV1', - task_list=[2, 3, 4, 7], - memory_limit=1, - ) - sched = _scheduler.SgeScheduler(concurrency_limit=2) - with self.assertRaises(ValueError): - sched.submit(job) - - def test_already_submitted_error(self): - job = _job.Job(stage=SUBCOMMAND.VALIDATE, output_dir='output_dir', job_ident='1') - sched = _scheduler.SgeScheduler(concurrency_limit=2) - with self.assertRaises(ValueError): - sched.submit(job) diff --git a/tests/integration/schedule/test_slurm.py b/tests/integration/schedule/test_slurm.py deleted file mode 100644 index a8566531..00000000 --- a/tests/integration/schedule/test_slurm.py +++ /dev/null @@ -1,617 +0,0 @@ -import subprocess -import unittest -from unittest import mock - -from mavis.schedule import job as _job -from mavis.schedule import constants as _constants -from mavis.schedule import scheduler as _scheduler -from mavis.constants import SUBCOMMAND - - -class TestSubmit(unittest.TestCase): - - # TODO: test initial submission - # TODO: test submit after failure - # TODO: test reporting errors - - @mock.patch('subprocess.check_output') - def test_single_job(self, patch_check): - patch_check.return_value = "Submitted batch job 1665695".encode('utf8') - job = _job.Job(output_dir='temp', name='job1', stage='validate', script='submit.sh') - print(job) - _scheduler.SlurmScheduler().submit(job) - self.assertEqual(_constants.JOB_STATUS.SUBMITTED, job.status) - self.assertEqual('1665695', job.job_ident) - 
patch_check.assert_called_with( - [ - 'sbatch', - '--mem', - '16000M', - '-t', - '16:00:00', - '--export=ALL', - '-J', - 'job1', - '-o', - 'temp/job-%x-%j.log', - 'submit.sh', - ], - shell=False, - ) - - @mock.patch('subprocess.check_output') - def test_partition(self, patch_check): - patch_check.return_value = "Submitted batch job 1665695".encode('utf8') - job = _job.Job( - output_dir='temp', name='job1', stage='validate', script='submit.sh', queue='all' - ) - print(job) - _scheduler.SlurmScheduler().submit(job) - self.assertEqual(_constants.JOB_STATUS.SUBMITTED, job.status) - self.assertEqual('1665695', job.job_ident) - patch_check.assert_called_with( - [ - 'sbatch', - '--partition=all', - '--mem', - '16000M', - '-t', - '16:00:00', - '--export=ALL', - '-J', - 'job1', - '-o', - 'temp/job-%x-%j.log', - 'submit.sh', - ], - shell=False, - ) - - @mock.patch('subprocess.check_output') - def test_mail_options(self, patch_check): - patch_check.return_value = "Submitted batch job 1665695".encode('utf8') - job = _job.Job( - output_dir='temp', - name='job1', - stage='validate', - script='submit.sh', - mail_user='me@example.com', - mail_type=_constants.MAIL_TYPE.ALL, - ) - print(job) - _scheduler.SlurmScheduler().submit(job) - self.assertEqual(_constants.JOB_STATUS.SUBMITTED, job.status) - self.assertEqual('1665695', job.job_ident) - patch_check.assert_called_with( - [ - 'sbatch', - '--mem', - '16000M', - '-t', - '16:00:00', - '--export=ALL', - '-J', - 'job1', - '-o', - 'temp/job-%x-%j.log', - '--mail-type=ALL', - '--mail-user=me@example.com', - 'submit.sh', - ], - shell=False, - ) - - @mock.patch('subprocess.check_output') - def test_dependent_job(self, patch_check): - patch_check.side_effect = ["Submitted batch job 1665695".encode('utf8')] - job = _job.Job( - output_dir='temp', - name='job1', - stage='validate', - script='submit.sh', - dependencies=[ - _job.Job( - output_dir='temp', - name='job2', - stage='cluster', - script='submit2.sh', - job_ident='12345678', - ) - ], - 
) - print(job) - _scheduler.SlurmScheduler().submit(job) - self.assertEqual(_constants.JOB_STATUS.SUBMITTED, job.status) - self.assertEqual('1665695', job.job_ident) - patch_check.assert_called_with( - [ - 'sbatch', - '--mem', - '16000M', - '-t', - '16:00:00', - '--export=ALL', - '--dependency=afterok:12345678', - '-J', - 'job1', - '-o', - 'temp/job-%x-%j.log', - 'submit.sh', - ], - shell=False, - ) - - @mock.patch('subprocess.check_output') - def test_dependency_error(self, patch_check): - patch_check.side_effect = [ - "Submitted batch job 12345678".encode('utf8'), - "Submitted batch job 1665695".encode('utf8'), - ] - job = _job.Job( - output_dir='temp', - name='job1', - stage='validate', - script='submit.sh', - dependencies=[ - _job.Job(output_dir='temp', name='job2', stage='cluster', script='submit2.sh') - ], - ) - print(job) - with self.assertRaises(ValueError): - _scheduler.SlurmScheduler().submit(job) - - @mock.patch('subprocess.check_output') - def test_job_array(self, patch_check): - patch_check.return_value = "Submitted batch job 1665695".encode('utf8') - job = _job.ArrayJob( - output_dir='temp', name='job1', stage='validate', script='submit.sh', task_list=10 - ) - print(job) - _scheduler.SlurmScheduler().submit(job) - self.assertEqual(_constants.JOB_STATUS.SUBMITTED, job.status) - self.assertEqual('1665695', job.job_ident) - patch_check.assert_called_with( - [ - 'sbatch', - '--mem', - '16000M', - '-t', - '16:00:00', - '--export=ALL', - '-J', - 'job1', - '-o', - 'temp/job-%x-%A-%a.log', - '--array=1-10', - 'submit.sh', - ], - shell=False, - ) - - @mock.patch('subprocess.check_output') - def test_job_array_concurrency_limit(self, patch_check): - patch_check.side_effect = ["Submitted batch job 1665695".encode('utf8')] - print(patch_check) - job = _job.ArrayJob( - output_dir='temp', - name='job1', - stage='validate', - script='submit.sh', - task_list=[1, 2, 3, 4, 5, 14, 16], - ) - _scheduler.SlurmScheduler(concurrency_limit=2).submit(job) - 
self.assertEqual(_constants.JOB_STATUS.SUBMITTED, job.status) - self.assertEqual('1665695', job.job_ident) - exp = [ - 'sbatch', - '--mem', - '16000M', - '-t', - '16:00:00', - '--export=ALL', - '-J', - 'job1', - '-o', - 'temp/job-%x-%A-%a.log', - '--array=1-5,14,16%2', - 'submit.sh', - ] - patch_check.assert_called_with(exp, shell=False) - - -class TestUpdate(unittest.TestCase): - # TODO: status of array job - # TODO: status of single job - # TODO: status of job waiting on dependency - - @mock.patch('subprocess.check_output') - def test_job_array(self, patch_check): - content = """ -JobID|JobIDRaw|JobName|Partition|MaxVMSize|MaxVMSizeNode|MaxVMSizeTask|AveVMSize|MaxRSS|MaxRSSNode|MaxRSSTask|AveRSS|MaxPages|MaxPagesNode|MaxPagesTask|AvePages|MinCPU|MinCPUNode|MinCPUTask|AveCPU|NTasks|AllocCPUS|Elapsed|State|ExitCode|AveCPUFreq|ReqCPUFreqMin|ReqCPUFreqMax|ReqCPUFreqGov|ReqMem|ConsumedEnergy|MaxDiskRead|MaxDiskReadNode|MaxDiskReadTask|AveDiskRead|MaxDiskWrite|MaxDiskWriteNode|MaxDiskWriteTask|AveDiskWrite|AllocGRES|ReqGRES|ReqTRES|AllocTRES| -1671879_1|1671879|MV_mock-A36971_batch-tX8SW6tEiEfZ8ZLHDPDa83|short||||||||||||||||||1|00:00:00|FAILED|1:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1| -1671879_1.batch|1671879.batch|batch||||||||||||||||||1|1|00:00:00|FAILED|1:0||0|0|0|16000Mn|||||||||||||cpu=1,mem=16000M,node=1| -1671880_1|1671880|MV_mock-A47933_batch-tX8SW6tEiEfZ8ZLHDPDa83|short||||||||||||||||||1|00:00:00|FAILED|1:0||Unknown|Unknown|Unknown|18000Mn||||||||||||cpu=1,mem=18000M,node=1|cpu=1,mem=18000M,node=1| -1671880_1.batch|1671880.batch|batch||||||||||||||||||1|1|00:00:00|FAILED|1:0||0|0|0|18000Mn|||||||||||||cpu=1,mem=18000M,node=1| -1671893_1|1671893|MV_mock-A36971_batch-tX8SW6tEiEfZ8ZLHDPDa83|short||||||||||||||||||1|00:00:01|FAILED|1:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1| 
-1671893_1.batch|1671893.batch|batch||||||||||||||||||1|1|00:00:01|FAILED|1:0||0|0|0|16000Mn|||||||||||||cpu=1,mem=16000M,node=1| -1671894_1|1671894|MV_mock-A47933_batch-tX8SW6tEiEfZ8ZLHDPDa83|short||||||||||||||||||1|00:00:00|FAILED|1:0||Unknown|Unknown|Unknown|18000Mn||||||||||||cpu=1,mem=18000M,node=1|cpu=1,mem=18000M,node=1| -1671894_1.batch|1671894.batch|batch||||||||||||||||||1|1|00:00:00|FAILED|1:0||0|0|0|18000Mn|||||||||||||cpu=1,mem=18000M,node=1| -1671915_1|1671915|MV_mock-A36971_batch-ezPmnHmYjZjsj8gfCynbsX|short||||||||||||||||||1|00:00:20|CANCELLED by 1365|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1| -1671915_1.batch|1671915.batch|batch||125588K|n305|0|125588K|900K|n305|0|900K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:20|CANCELLED|0:15|2.19M|0|0|0|16000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=16000M,node=1| -1671916_1|1671916|MV_mock-A47933_batch-ezPmnHmYjZjsj8gfCynbsX|short||||||||||||||||||1|00:00:20|CANCELLED by 1365|0:0||Unknown|Unknown|Unknown|18000Mn||||||||||||cpu=1,mem=18000M,node=1|cpu=1,mem=18000M,node=1| -1671916_1.batch|1671916.batch|batch||125588K|n305|0|125588K|900K|n305|0|900K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:20|CANCELLED|0:15|2.19M|0|0|0|18000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=18000M,node=1| -1671970_1|1671970|MV_mock-A36971_batch-ezPmnHmYjZjsj8gfCynbsX|short||||||||||||||||||1|00:00:21|COMPLETED|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1| -1671970_1.batch|1671970.batch|batch||125588K|n305|0|125588K|908K|n305|0|908K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:21|COMPLETED|0:0|2.19M|0|0|0|16000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=16000M,node=1| -1671971_1|1671971|MV_mock-A47933_batch-ezPmnHmYjZjsj8gfCynbsX|short||||||||||||||||||1|00:00:20|COMPLETED|0:0||Unknown|Unknown|Unknown|18000Mn||||||||||||cpu=1,mem=18000M,node=1|cpu=1,mem=18000M,node=1| 
-1671971_1.batch|1671971.batch|batch||125588K|n305|0|125588K|904K|n305|0|904K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:20|COMPLETED|0:0|2.19M|0|0|0|18000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=18000M,node=1| -1671974_1|1671974|MV_mock-A36971_batch-ezPmnHmYjZjsj8gfCynbsX|short||||||||||||||||||1|00:00:11|COMPLETED|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1| -1671974_1.batch|1671974.batch|batch||125588K|n305|0|125588K|900K|n305|0|900K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:11|COMPLETED|0:0|2.19M|0|0|0|16000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=16000M,node=1| -1671975_1|1671975|MV_mock-A47933_batch-ezPmnHmYjZjsj8gfCynbsX|short||||||||||||||||||1|00:00:10|COMPLETED|0:0||Unknown|Unknown|Unknown|18000Mn||||||||||||cpu=1,mem=18000M,node=1|cpu=1,mem=18000M,node=1| -1671975_1.batch|1671975.batch|batch||125588K|n305|0|125588K|908K|n305|0|908K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:10|COMPLETED|0:0|2.19M|0|0|0|18000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=18000M,node=1| -1671981_1|1671981|MV_mock-A36971_batch-ezPmnHmYjZjsj8gfCynbsX|short||||||||||||||||||1|00:00:12|COMPLETED|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1| -1671981_1.batch|1671981.batch|batch||125588K|n305|0|125588K|904K|n305|0|904K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:12|COMPLETED|0:0|2.19M|0|0|0|16000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=16000M,node=1| -1671982_1|1671982|MV_mock-A47933_batch-ezPmnHmYjZjsj8gfCynbsX|short||||||||||||||||||1|00:00:11|COMPLETED|0:0||Unknown|Unknown|Unknown|18000Mn||||||||||||cpu=1,mem=18000M,node=1|cpu=1,mem=18000M,node=1| -1671982_1.batch|1671982.batch|batch||125588K|n305|0|125588K|900K|n305|0|900K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:11|COMPLETED|0:0|2.19M|0|0|0|18000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=18000M,node=1| 
-1671983_1|1671983|MA_mock-A36971_batch-ezPmnHmYjZjsj8gfCynbsX|short||||||||||||||||||1|00:00:05|COMPLETED|0:0||Unknown|Unknown|Unknown|12000Mn||||||||||||cpu=1,mem=12000M,node=1|cpu=1,mem=12000M,node=1| -1671983_1.batch|1671983.batch|batch||125588K|n305|0|125588K|896K|n305|0|896K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:05|COMPLETED|0:0|2.19M|0|0|0|12000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=12000M,node=1| -1671984_1|1671984|MA_mock-A47933_batch-ezPmnHmYjZjsj8gfCynbsX|short||||||||||||||||||1|00:00:04|COMPLETED|0:0||Unknown|Unknown|Unknown|12000Mn||||||||||||cpu=1,mem=12000M,node=1|cpu=1,mem=12000M,node=1| -1671984_1.batch|1671984.batch|batch||125588K|n305|0|125588K|896K|n305|0|896K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:04|COMPLETED|0:0|2.19M|0|0|0|12000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=12000M,node=1| -1671985|1671985|MP_batch-ezPmnHmYjZjsj8gfCynbsX|short||||||||||||||||||1|00:00:02|FAILED|2:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1| -1671985.batch|1671985.batch|batch||125588K|n305|0|125588K|900K|n305|0|900K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:02|FAILED|2:0|2.19M|0|0|0|16000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=16000M,node=1| -1671986|1671986|MS_batch-ezPmnHmYjZjsj8gfCynbsX|short||||||||||||||||||1|00:00:00|CANCELLED by 1365|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|| -1672141|1672141|subtest.sh|all||||||||||||||||||1|00:01:01|COMPLETED|0:0||Unknown|Unknown|Unknown|7000Mc||||||||||||cpu=1,mem=7000M,node=1|cpu=1,mem=7000M,node=1| -1672141.batch|1672141.batch|batch||207004K|n106|0|207004K|1760K|n106|0|1760K|1K|n106|0|1K|00:00:00|n106|0|00:00:00|1|1|00:01:01|COMPLETED|0:0|2.19M|0|0|0|7000Mc|0|0.06M|n106|0|0.06M|0.00M|n106|0|0.00M||||cpu=1,mem=7000M,node=1| -1672166|1672166|subtest.sh|all||||||||||||||||||1|00:01:03|COMPLETED|0:0||Unknown|Unknown|Unknown|7000Mc||||||||||||cpu=1,mem=7000M,node=1|cpu=1,mem=7000M,node=1| 
-1672166.batch|1672166.batch|batch||207004K|n106|0|207004K|1760K|n106|0|1760K|0|n106|0|0|00:00:00|n106|0|00:00:00|1|1|00:01:03|COMPLETED|0:0|2.19M|0|0|0|7000Mc|0|0.06M|n106|0|0.06M|0.00M|n106|0|0.00M||||cpu=1,mem=7000M,node=1| -1672169_10|1672169|subtest.sh|all||||||||||||||||||1|00:01:02|COMPLETED|0:0||Unknown|Unknown|Unknown|7000Mc||||||||||||cpu=1,mem=7000M,node=1|cpu=1,mem=7000M,node=1| -1672169_10.batch|1672169.batch|batch||207004K|n130|0|207004K|1764K|n130|0|1764K|0|n130|0|0|00:00:00|n130|0|00:00:00|1|1|00:01:02|COMPLETED|0:0|2.19M|0|0|0|7000Mc|0|0.06M|n130|0|0.06M|0.00M|n130|0|0.00M||||cpu=1,mem=7000M,node=1| -1672169_1|1672171|subtest.sh|all||||||||||||||||||1|00:01:00|COMPLETED|0:0||Unknown|Unknown|Unknown|7000Mc||||||||||||cpu=1,mem=7000M,node=1|cpu=1,mem=7000M,node=1| -1672169_1.batch|1672171.batch|batch||207004K|n106|0|207004K|1764K|n106|0|1764K|0|n106|0|0|00:00:00|n106|0|00:00:00|1|1|00:01:00|COMPLETED|0:0|2.19M|0|0|0|7000Mc|0|0.06M|n106|0|0.06M|0.00M|n106|0|0.00M||||cpu=1,mem=7000M,node=1| -1672169_2|1672172|subtest.sh|all||||||||||||||||||1|00:01:00|COMPLETED|0:0||Unknown|Unknown|Unknown|7000Mc||||||||||||cpu=1,mem=7000M,node=1|cpu=1,mem=7000M,node=1| -1672169_2.batch|1672172.batch|batch||207004K|n106|0|207004K|1764K|n106|0|1764K|0|n106|0|0|00:00:00|n106|0|00:00:00|1|1|00:01:00|COMPLETED|0:0|2.19M|0|0|0|7000Mc|0|0.06M|n106|0|0.06M|0.00M|n106|0|0.00M||||cpu=1,mem=7000M,node=1| -1672169_3|1672173|subtest.sh|all||||||||||||||||||1|00:01:01|COMPLETED|0:0||Unknown|Unknown|Unknown|7000Mc||||||||||||cpu=1,mem=7000M,node=1|cpu=1,mem=7000M,node=1| -1672169_3.batch|1672173.batch|batch||207004K|n106|0|207004K|1764K|n106|0|1764K|0|n106|0|0|00:00:00|n106|0|00:00:00|1|1|00:01:01|COMPLETED|0:0|2.19M|0|0|0|7000Mc|0|0.06M|n106|0|0.06M|0.00M|n106|0|0.00M||||cpu=1,mem=7000M,node=1| -1672169_4|1672174|subtest.sh|all||||||||||||||||||1|00:01:01|COMPLETED|0:0||Unknown|Unknown|Unknown|7000Mc||||||||||||cpu=1,mem=7000M,node=1|cpu=1,mem=7000M,node=1| 
-1672169_4.batch|1672174.batch|batch||207004K|n106|0|207004K|1760K|n106|0|1760K|0|n106|0|0|00:00:00|n106|0|00:00:00|1|1|00:01:01|COMPLETED|0:0|2.19M|0|0|0|7000Mc|0|0.06M|n106|0|0.06M|0.00M|n106|0|0.00M||||cpu=1,mem=7000M,node=1| -1672169_5|1672175|subtest.sh|all||||||||||||||||||1|00:01:02|COMPLETED|0:0||Unknown|Unknown|Unknown|7000Mc||||||||||||cpu=1,mem=7000M,node=1|cpu=1,mem=7000M,node=1| -1672169_5.batch|1672175.batch|batch||207004K|n130|0|207004K|1764K|n130|0|1764K|0|n130|0|0|00:00:00|n130|0|00:00:00|1|1|00:01:02|COMPLETED|0:0|2.19M|0|0|0|7000Mc|0|0.06M|n130|0|0.06M|0.00M|n130|0|0.00M||||cpu=1,mem=7000M,node=1| -1672169_6|1672176|subtest.sh|all||||||||||||||||||1|00:01:02|COMPLETED|0:0||Unknown|Unknown|Unknown|7000Mc||||||||||||cpu=1,mem=7000M,node=1|cpu=1,mem=7000M,node=1| -1672169_6.batch|1672176.batch|batch||207004K|n130|0|207004K|1764K|n130|0|1764K|1K|n130|0|1K|00:00:00|n130|0|00:00:00|1|1|00:01:02|COMPLETED|0:0|2.19M|0|0|0|7000Mc|0|0.06M|n130|0|0.06M|0.00M|n130|0|0.00M||||cpu=1,mem=7000M,node=1| -1672169_7|1672177|subtest.sh|all||||||||||||||||||1|00:01:01|COMPLETED|0:0||Unknown|Unknown|Unknown|7000Mc||||||||||||cpu=1,mem=7000M,node=1|cpu=1,mem=7000M,node=1| -1672169_7.batch|1672177.batch|batch||207004K|n130|0|207004K|1756K|n130|0|1756K|1K|n130|0|1K|00:00:00|n130|0|00:00:00|1|1|00:01:01|COMPLETED|0:0|2.19M|0|0|0|7000Mc|0|0.06M|n130|0|0.06M|0.00M|n130|0|0.00M||||cpu=1,mem=7000M,node=1| -1672169_8|1672178|subtest.sh|all||||||||||||||||||1|00:01:02|COMPLETED|0:0||Unknown|Unknown|Unknown|7000Mc||||||||||||cpu=1,mem=7000M,node=1|cpu=1,mem=7000M,node=1| -1672169_8.batch|1672178.batch|batch||207004K|n130|0|207004K|1764K|n130|0|1764K|0|n130|0|0|00:00:00|n130|0|00:00:00|1|1|00:01:02|COMPLETED|0:0|2.19M|0|0|0|7000Mc|0|0.06M|n130|0|0.06M|0.00M|n130|0|0.00M||||cpu=1,mem=7000M,node=1| -1672169_9|1672179|subtest.sh|all||||||||||||||||||1|00:01:02|COMPLETED|0:0||Unknown|Unknown|Unknown|7000Mc||||||||||||cpu=1,mem=7000M,node=1|cpu=1,mem=7000M,node=1| 
-1672169_9.batch|1672179.batch|batch||207004K|n130|0|207004K|1760K|n130|0|1760K|1K|n130|0|1K|00:00:00|n130|0|00:00:00|1|1|00:01:02|COMPLETED|0:0|2.19M|0|0|0|7000Mc|0|0.06M|n130|0|0.06M|0.00M|n130|0|0.00M||||cpu=1,mem=7000M,node=1| -1672268_2|1672268|MV_mock-A36971_batch-iJUMYRdLFDsuu9eVzGmmKm|short||||||||||||||||||1|00:00:36|COMPLETED|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1| -1672268_2.batch|1672268.batch|batch||2025280K|n305|0|2025280K|58776K|n305|0|58776K|0|n305|0|0|00:00:03|n305|0|00:00:03|1|1|00:00:36|COMPLETED|0:0|2.19M|0|0|0|16000Mn|0|13.01M|n305|0|13.01M|0.05M|n305|0|0.05M||||cpu=1,mem=16000M,node=1| -1672269_3|1672269|MV_mock-A47933_batch-iJUMYRdLFDsuu9eVzGmmKm|short||||||||||||||||||1|00:00:33|COMPLETED|0:0||Unknown|Unknown|Unknown|18000Mn||||||||||||cpu=1,mem=18000M,node=1|cpu=1,mem=18000M,node=1| -1672269_3.batch|1672269.batch|batch||2019424K|n305|0|2019424K|55844K|n305|0|55844K|0|n305|0|0|00:00:03|n305|0|00:00:03|1|1|00:00:33|COMPLETED|0:0|2.19M|0|0|0|18000Mn|0|11.60M|n305|0|11.60M|0.05M|n305|0|0.05M||||cpu=1,mem=18000M,node=1| -1672270_2|1672270|MA_mock-A36971_batch-iJUMYRdLFDsuu9eVzGmmKm|short||||||||||||||||||1|00:00:02|COMPLETED|0:0||Unknown|Unknown|Unknown|12000Mn||||||||||||cpu=1,mem=12000M,node=1|cpu=1,mem=12000M,node=1| -1672270_2.batch|1672270.batch|batch||125588K|n305|0|125588K|904K|n305|0|904K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:02|COMPLETED|0:0|2.19M|0|0|0|12000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=12000M,node=1| -1672271_3|1672271|MA_mock-A47933_batch-iJUMYRdLFDsuu9eVzGmmKm|short||||||||||||||||||1|00:00:02|COMPLETED|0:0||Unknown|Unknown|Unknown|12000Mn||||||||||||cpu=1,mem=12000M,node=1|cpu=1,mem=12000M,node=1| -1672271_3.batch|1672271.batch|batch||125588K|n305|0|125588K|904K|n305|0|904K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:02|COMPLETED|0:0|2.19M|0|0|0|12000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=12000M,node=1| 
-1672272|1672272|MP_batch-iJUMYRdLFDsuu9eVzGmmKm|short||||||||||||||||||1|00:00:02|FAILED|2:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1| -1672272.batch|1672272.batch|batch||125588K|n305|0|125588K|900K|n305|0|900K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:02|FAILED|2:0|2.19M|0|0|0|16000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=16000M,node=1| -1672273|1672273|MS_batch-iJUMYRdLFDsuu9eVzGmmKm|short||||||||||||||||||1|00:00:00|CANCELLED by 1365|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|| -1672270_1|1672274|MA_mock-A36971_batch-iJUMYRdLFDsuu9eVzGmmKm|short||||||||||||||||||1|00:00:04|COMPLETED|0:0||Unknown|Unknown|Unknown|12000Mn||||||||||||cpu=1,mem=12000M,node=1|cpu=1,mem=12000M,node=1| -1672270_1.batch|1672274.batch|batch||125588K|n305|0|125588K|896K|n305|0|896K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:04|COMPLETED|0:0|2.19M|0|0|0|12000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=12000M,node=1| -1672271_1|1672275|MA_mock-A47933_batch-iJUMYRdLFDsuu9eVzGmmKm|short||||||||||||||||||1|00:00:02|COMPLETED|0:0||Unknown|Unknown|Unknown|12000Mn||||||||||||cpu=1,mem=12000M,node=1|cpu=1,mem=12000M,node=1| -1672271_1.batch|1672275.batch|batch||125588K|n305|0|125588K|900K|n305|0|900K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:02|COMPLETED|0:0|2.19M|0|0|0|12000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=12000M,node=1| -1672271_2|1672276|MA_mock-A47933_batch-iJUMYRdLFDsuu9eVzGmmKm|short||||||||||||||||||1|00:00:02|COMPLETED|0:0||Unknown|Unknown|Unknown|12000Mn||||||||||||cpu=1,mem=12000M,node=1|cpu=1,mem=12000M,node=1| -1672271_2.batch|1672276.batch|batch||125588K|n305|0|125588K|896K|n305|0|896K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:02|COMPLETED|0:0|2.19M|0|0|0|12000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=12000M,node=1| 
-1672268_1|1672277|MV_mock-A36971_batch-iJUMYRdLFDsuu9eVzGmmKm|short||||||||||||||||||1|00:00:32|COMPLETED|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1| -1672268_1.batch|1672277.batch|batch||2018912K|n305|0|2018912K|51952K|n305|0|51952K|0|n305|0|0|00:00:03|n305|0|00:00:03|1|1|00:00:32|COMPLETED|0:0|2.19M|0|0|0|16000Mn|0|10.79M|n305|0|10.79M|0.05M|n305|0|0.05M||||cpu=1,mem=16000M,node=1| -1672269_1|1672278|MV_mock-A47933_batch-iJUMYRdLFDsuu9eVzGmmKm|short||||||||||||||||||1|00:00:32|COMPLETED|0:0||Unknown|Unknown|Unknown|18000Mn||||||||||||cpu=1,mem=18000M,node=1|cpu=1,mem=18000M,node=1| -1672269_1.batch|1672278.batch|batch||2018660K|n305|0|2018660K|57016K|n305|0|57016K|0|n305|0|0|00:00:03|n305|0|00:00:03|1|1|00:00:32|COMPLETED|0:0|2.19M|0|0|0|18000Mn|0|12.79M|n305|0|12.79M|0.05M|n305|0|0.05M||||cpu=1,mem=18000M,node=1| -1672269_2|1672279|M1673291_mock-A47933_batch-iJUMYRdLFDsuu9eVzGmmKm|short||||||||||||||||||1|00:00:33|COMPLETED|0:0||Unknown|Unknown|Unknown|18000Mn||||||||||||cpu=1,mem=18000M,node=1|cpu=1,mem=18000M,node=1| -1672269_2.batch|1671673291279.batch|batch||2019424K|n305|0|2019424K|54212K|n305|0|54212K|0|n305|0|0|00:00:03|n305|0|00:00:03|1|1|00:00:33|COMPLETED|0:0|2.19M|0|0|0|18000Mn|0|11.51M|n305|0|11.51M|0.05M|n305|0|0.05M||||cpu=1,mem=18000M,node=1| -1672454_2|1672454|M1673291_mock-A36971_batch-uKEUyUuWbi2mgd75KjP4k5|short||||||||||||||||||1|00:00:29|COMPLETED|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1| -1672454_2.batch|1671673291454.batch|batch||125588K|n305|0|125588K|908K|n305|0|908K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:29|COMPLETED|0:0|2.19M|0|0|0|16000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=16000M,node=1| -1672455_3|1672455|MV_mock-A47933_batch-uKEUyUuWbi2mgd75KjP4k5|short||||||||||||||||||1|00:00:26|COMPLETED|0:0||Unknown|Unknown|Unknown|18000Mn||||||||||||cpu=1,mem=18000M,node=1|cpu=1,mem=18000M,node=1| 
-1672455_3.batch|1672455.batch|batch||125588K|n305|0|125588K|904K|n305|0|904K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:26|COMPLETED|0:0|2.19M|0|0|0|18000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=18000M,node=1| -1672456_2|1672456|MA_mock-A36971_batch-uKEUyUuWbi2mgd75KjP4k5|short||||||||||||||||||1|00:00:02|COMPLETED|0:0||Unknown|Unknown|Unknown|12000Mn||||||||||||cpu=1,mem=12000M,node=1|cpu=1,mem=12000M,node=1| -1672456_2.batch|1672456.batch|batch||125588K|n305|0|125588K|904K|n305|0|904K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:02|COMPLETED|0:0|2.19M|0|0|0|12000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=12000M,node=1| -1672457_3|1672457|MA_mock-A47933_batch-uKEUyUuWbi2mgd75KjP4k5|short||||||||||||||||||1|00:00:03|COMPLETED|0:0||Unknown|Unknown|Unknown|12000Mn||||||||||||cpu=1,mem=12000M,node=1|cpu=1,mem=12000M,node=1| -1672457_3.batch|1672457.batch|batch||125588K|n305|0|125588K|900K|n305|0|900K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:03|COMPLETED|0:0|2.19M|0|0|0|12000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=12000M,node=1| -1672458|1672458|MP_batch-uKEUyUuWbi2mgd75KjP4k5|short||||||||||||||||||1|00:00:02|COMPLETED|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1| -1672458.batch|1672458.batch|batch||125588K|n305|0|125588K|896K|n305|0|896K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:02|COMPLETED|0:0|2.19M|0|0|0|16000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=16000M,node=1| -1672456_1|1672459|MA_mock-A36971_batch-uKEUyUuWbi2mgd75KjP4k5|short||||||||||||||||||1|00:00:04|COMPLETED|0:0||Unknown|Unknown|Unknown|12000Mn||||||||||||cpu=1,mem=12000M,node=1|cpu=1,mem=12000M,node=1| -1672456_1.batch|1672459.batch|batch||125588K|n305|0|125588K|900K|n305|0|900K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:04|COMPLETED|0:0|2.19M|0|0|0|12000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=12000M,node=1| 
-1672457_1|1672460|MA_mock-A47933_batch-uKEUyUuWbi2mgd75KjP4k5|short||||||||||||||||||1|00:00:03|COMPLETED|0:0||Unknown|Unknown|Unknown|12000Mn||||||||||||cpu=1,mem=12000M,node=1|cpu=1,mem=12000M,node=1| -1672457_1.batch|1672460.batch|batch||125588K|n305|0|125588K|900K|n305|0|900K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:03|COMPLETED|0:0|2.19M|0|0|0|12000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=12000M,node=1| -1672457_2|1672461|MA_mock-A47933_batch-uKEUyUuWbi2mgd75KjP4k5|short||||||||||||||||||1|00:00:02|COMPLETED|0:0||Unknown|Unknown|Unknown|12000Mn||||||||||||cpu=1,mem=12000M,node=1|cpu=1,mem=12000M,node=1| -1672457_2.batch|1672461.batch|batch||125588K|n305|0|125588K|896K|n305|0|896K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:02|COMPLETED|0:0|2.19M|0|0|0|12000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=12000M,node=1| -1672454_1|1672462|MV_mock-A36971_batch-uKEUyUuWbi2mgd75KjP4k5|short||||||||||||||||||1|00:00:25|COMPLETED|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1| -1672454_1.batch|1672462.batch|batch||125588K|n305|0|125588K|904K|n305|0|904K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:25|COMPLETED|0:0|2.19M|0|0|0|16000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=16000M,node=1| -1672455_1|1672463|MV_mock-A47933_batch-uKEUyUuWbi2mgd75KjP4k5|short||||||||||||||||||1|00:00:25|COMPLETED|0:0||Unknown|Unknown|Unknown|18000Mn||||||||||||cpu=1,mem=18000M,node=1|cpu=1,mem=18000M,node=1| -1672455_1.batch|1672463.batch|batch||125588K|n305|0|125588K|904K|n305|0|904K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:25|COMPLETED|0:0|2.19M|0|0|0|18000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=18000M,node=1| -1672455_2|1672464|MV_mock-A47933_batch-uKEUyUuWbi2mgd75KjP4k5|short||||||||||||||||||1|00:00:27|COMPLETED|0:0||Unknown|Unknown|Unknown|18000Mn||||||||||||cpu=1,mem=18000M,node=1|cpu=1,mem=18000M,node=1| 
-1672455_2.batch|1672464.batch|batch||125588K|n305|0|125588K|900K|n305|0|900K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:27|COMPLETED|0:0|2.19M|0|0|0|18000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=18000M,node=1| -1672465|1672465|MS_batch-uKEUyUuWbi2mgd75KjP4k5|short||||||||||||||||||1|00:00:02|COMPLETED|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1| -1672465.batch|1672465.batch|batch||125588K|n305|0|125588K|896K|n305|0|896K|0|n305|0|0|00:00:00|n305|0|00:00:00|1|1|00:00:02|COMPLETED|0:0|2.19M|0|0|0|16000Mn|0|0|n305|65534|0|0|n305|65534|0||||cpu=1,mem=16000M,node=1| - """.encode( - 'utf8' - ) - patch_check.return_value = content - job = _job.ArrayJob(output_dir='temp', job_ident='1672457', task_list=3, stage='validate') - _scheduler.SlurmScheduler().update_info(job) - self.assertEqual(_constants.JOB_STATUS.COMPLETED, job.status) - self.assertEqual(3, len(job.task_list)) - - -class TestParseScontrolShow(unittest.TestCase): - def test_pending_job(self): - content = """ -JobId=1673292 JobName=MP_batch-8PyNX8EN4cBdD9vQd9FrRG - UserId=creisle(1365) GroupId=users(100) MCS_label=N/A - Priority=31 Nice=0 Account=all QOS=normal - JobState=PENDING Reason=DependencyNeverSatisfied Dependency=afterok:1673291_* - Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0 - RunTime=00:00:00 TimeLimit=16:00:00 TimeMin=N/A - SubmitTime=2018-05-24T11:32:44 EligibleTime=Unknown - StartTime=Unknown EndTime=Unknown Deadline=N/A - PreemptTime=None SuspendTime=None SecsPreSuspend=0 - Partition=short AllocNode:Sid=n104:47409 - ReqNodeList=(null) ExcNodeList=(null) - NodeList=(null) - NumNodes=1 NumCPUs=1 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:* - TRES=cpu=1,mem=16000,node=1 - Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=* - MinCPUsNode=1 MinMemoryNode=16000M MinTmpDiskNode=0 - Features=(null) DelayBoot=00:00:00 - Gres=(null) Reservation=(null) - OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null) - 
Command=/projects/trans_scratch/validations/workspace/creisle/temp/test_submission/output_slurm/pairing/submit.sh - WorkDir=/projects/trans_scratch/validations/workspace/creisle/temp/test_submission - StdErr=/projects/trans_scratch/validations/workspace/creisle/temp/test_submission/output_slurm/pairing/job-%x-1673292.log - StdIn=/dev/null - StdOut=/projects/trans_scratch/validations/workspace/creisle/temp/test_submission/output_slurm/pairing/job-%x-1673292.log - Power= - - """ - rows = _scheduler.SlurmScheduler().parse_scontrol_show(content) - self.assertEqual(1, len(rows)) - self.assertEqual( - { - 'job_ident': '1673292', - 'task_ident': None, - 'status': 'PENDING', - 'status_comment': 'DependencyNeverSatisfied', - 'name': 'MP_batch-8PyNX8EN4cBdD9vQd9FrRG', - }, - rows[0], - ) - - def test_job_array(self): - content = """ -JobId=1673301 ArrayJobId=1673301 ArrayTaskId=3 JobName=subtest.sh - UserId=creisle(1365) GroupId=users(100) MCS_label=N/A - Priority=31 Nice=0 Account=all QOS=normal - JobState=RUNNING Reason=None Dependency=(null) - Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0 - RunTime=00:00:11 TimeLimit=90-00:00:00 TimeMin=N/A - SubmitTime=2018-05-24T11:38:28 EligibleTime=2018-05-24T11:38:28 - StartTime=2018-05-24T11:38:29 EndTime=2018-08-22T11:38:29 Deadline=N/A - PreemptTime=None SuspendTime=None SecsPreSuspend=0 - Partition=all AllocNode:Sid=n104:47409 - ReqNodeList=(null) ExcNodeList=(null) - NodeList=n245 - BatchHost=n245 - NumNodes=1 NumCPUs=1 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:* - TRES=cpu=1,mem=7000M,node=1 - Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=* - MinCPUsNode=1 MinMemoryCPU=7000M MinTmpDiskNode=0 - Features=(null) DelayBoot=00:00:00 - Gres=(null) Reservation=(null) - OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null) - Command=/projects/trans_scratch/validations/workspace/creisle/temp/subtest.sh - WorkDir=/projects/trans_scratch/validations/workspace/creisle/temp - 
StdErr=/projects/trans_scratch/validations/workspace/creisle/temp/slurm-1673301_3.out - StdIn=/dev/null - StdOut=/projects/trans_scratch/validations/workspace/creisle/temp/slurm-1673301_3.out - Power= - -JobId=1673303 ArrayJobId=1673301 ArrayTaskId=2 JobName=subtest.sh - UserId=creisle(1365) GroupId=users(100) MCS_label=N/A - Priority=31 Nice=0 Account=all QOS=normal - JobState=RUNNING Reason=None Dependency=(null) - Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0 - RunTime=00:00:11 TimeLimit=90-00:00:00 TimeMin=N/A - SubmitTime=2018-05-24T11:38:28 EligibleTime=2018-05-24T11:38:28 - StartTime=2018-05-24T11:38:29 EndTime=2018-08-22T11:38:29 Deadline=N/A - PreemptTime=None SuspendTime=None SecsPreSuspend=0 - Partition=all AllocNode:Sid=n104:47409 - ReqNodeList=(null) ExcNodeList=(null) - NodeList=n235 - BatchHost=n235 - NumNodes=1 NumCPUs=1 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:* - TRES=cpu=1,mem=7000M,node=1 - Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=* - MinCPUsNode=1 MinMemoryCPU=7000M MinTmpDiskNode=0 - Features=(null) DelayBoot=00:00:00 - Gres=(null) Reservation=(null) - OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null) - Command=/projects/trans_scratch/validations/workspace/creisle/temp/subtest.sh - WorkDir=/projects/trans_scratch/validations/workspace/creisle/temp - StdErr=/projects/trans_scratch/validations/workspace/creisle/temp/slurm-1673301_2.out - StdIn=/dev/null - StdOut=/projects/trans_scratch/validations/workspace/creisle/temp/slurm-1673301_2.out - Power= - -JobId=1673302 ArrayJobId=1673301 ArrayTaskId=1 JobName=subtest.sh - UserId=creisle(1365) GroupId=users(100) MCS_label=N/A - Priority=31 Nice=0 Account=all QOS=normal - JobState=RUNNING Reason=None Dependency=(null) - Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0 - RunTime=00:00:11 TimeLimit=90-00:00:00 TimeMin=N/A - SubmitTime=2018-05-24T11:38:28 EligibleTime=2018-05-24T11:38:28 - StartTime=2018-05-24T11:38:29 EndTime=2018-08-22T11:38:29 Deadline=N/A - 
PreemptTime=None SuspendTime=None SecsPreSuspend=0 - Partition=all AllocNode:Sid=n104:47409 - ReqNodeList=(null) ExcNodeList=(null) - NodeList=n137 - BatchHost=n137 - NumNodes=1 NumCPUs=1 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:* - TRES=cpu=1,mem=7000M,node=1 - Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=* - MinCPUsNode=1 MinMemoryCPU=7000M MinTmpDiskNode=0 - Features=(null) DelayBoot=00:00:00 - Gres=(null) Reservation=(null) - OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null) - Command=/projects/trans_scratch/validations/workspace/creisle/temp/subtest.sh - WorkDir=/projects/trans_scratch/validations/workspace/creisle/temp - StdErr=/projects/trans_scratch/validations/workspace/creisle/temp/slurm-1673301_1.out - StdIn=/dev/null - StdOut=/projects/trans_scratch/validations/workspace/creisle/temp/slurm-1673301_1.out - Power= - - """ - rows = _scheduler.SlurmScheduler().parse_scontrol_show(content) - self.assertEqual(3, len(rows)) - - def test_cancelled_task(self): - content = """ - -JobId=1697512 ArrayJobId=1697503 ArrayTaskId=1 JobName=MV_mock-A47933_batch-uwSwW68EW43XNdvq85NxJ7 - UserId=creisle(1365) GroupId=users(100) MCS_label=N/A - Priority=42 Nice=0 Account=all QOS=normal - JobState=CANCELLED Reason=None Dependency=(null) - Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:15 - RunTime=00:00:02 TimeLimit=16:00:00 TimeMin=N/A - SubmitTime=2018-05-31T20:01:46 EligibleTime=2018-05-31T20:01:49 - StartTime=2018-05-31T20:02:05 EndTime=2018-05-31T20:02:07 Deadline=N/A - PreemptTime=None SuspendTime=None SecsPreSuspend=0 - Partition=all AllocNode:Sid=n104:173998 - ReqNodeList=(null) ExcNodeList=(null) - NodeList=n245 - BatchHost=n245 - NumNodes=1 NumCPUs=1 NumTasks=0 CPUs/Task=1 ReqB:S:C:T=0:0:*:* - TRES=cpu=1,mem=18000M,node=1 - Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=* - MinCPUsNode=1 MinMemoryNode=18000M MinTmpDiskNode=0 - Features=(null) DelayBoot=00:00:00 - Gres=(null) Reservation=(null) - OverSubscribe=OK Contiguous=0 Licenses=(null) 
Network=(null) - Command=/projects/trans_scratch/validations/workspace/creisle/temp/test_submission/slurm/mock-A47933_diseased_transcriptome/validate/submit.sh - WorkDir=/home/creisle - StdErr=/projects/trans_scratch/validations/workspace/creisle/temp/test_submission/slurm/mock-A47933_diseased_transcriptome/validate/batch-uwSwW68EW43XNdvq85NxJ7-1/job-%x-1697503-1.log - StdIn=/dev/null - StdOut=/projects/trans_scratch/validations/workspace/creisle/temp/test_submission/slurm/mock-A47933_diseased_transcriptome/validate/batch-uwSwW68EW43XNdvq85NxJ7-1/job-%x-1697503-1.log - Power= - - """ - rows = _scheduler.SlurmScheduler().parse_scontrol_show(content) - self.assertEqual(1, len(rows)) - row = rows[0] - self.assertEqual(_constants.JOB_STATUS.CANCELLED, row['status']) - - -class TestParseSacctTable(unittest.TestCase): - def test_basic_table(self): - content = """ -JobID|JobIDRaw|JobName|Partition|MaxVMSize|MaxVMSizeNode|MaxVMSizeTask|AveVMSize|MaxRSS|MaxRSSNode|MaxRSSTask|AveRSS|MaxPages|MaxPagesNode|MaxPagesTask|AvePages|MinCPU|MinCPUNode|MinCPUTask|AveCPU|NTasks|AllocCPUS|Elapsed|State|ExitCode|AveCPUFreq|ReqCPUFreqMin|ReqCPUFreqMax|ReqCPUFreqGov|ReqMem|ConsumedEnergy|MaxDiskRead|MaxDiskReadNode|MaxDiskReadTask|AveDiskRead|MaxDiskWrite|MaxDiskWriteNode|MaxDiskWriteTask|AveDiskWrite|AllocGRES|ReqGRES|ReqTRES|AllocTRES -1672273|1672273|MS_batch-iJUMYRdLFDsuu9eVzGmmKm|short||||||||||||||||||1|00:00:00|CANCELLED by 1365|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1| - """ - rows = _scheduler.SlurmScheduler().parse_sacct(content) - self.assertEqual(1, len(rows)) - row = rows[0] - self.assertEqual(_constants.JOB_STATUS.CANCELLED, row['status']) - - # TODO: test empty header - - def test_cancelled_task(self): - content = """ -JobID|JobName|User|ReqMem|Elapsed|State|MaxRSS|AveRSS|Partition -1697503_3|MV_mock-A47933_batch-uwSwW68EW43XNdvq85NxJ7|creisle|18000Mn|00:00:10|COMPLETED|||all -1697503_3.batch|batch||18000Mn|00:00:10|COMPLETED|904K|904K| 
-1697503_1|MV_mock-A47933_batch-uwSwW68EW43XNdvq85NxJ7|creisle|18000Mn|00:00:02|CANCELLED by 1365|||all -1697503_1.batch|batch||18000Mn|00:00:02|CANCELLED|896K|896K| -1697503_2|MV_mock-A47933_batch-uwSwW68EW43XNdvq85NxJ7|creisle|18000Mn|00:00:10|COMPLETED|||all -1697503_2.batch|batch||18000Mn|00:00:10|COMPLETED|904K|904K| - """ - rows = _scheduler.SlurmScheduler().parse_sacct(content) - self.assertEqual(3, len(rows)) - self.assertEqual(_constants.JOB_STATUS.CANCELLED, rows[1]['status']) - self.assertEqual(_constants.JOB_STATUS.COMPLETED, rows[0]['status']) - - def test_pending_array(self): - content = """ -JobID|JobName|User|ReqMem|Elapsed|State|MaxRSS|AveRSS|Partition -1701003_[37-200]|MA_L1522785992-normal_batch-aUmErftiY7eEWvENfSeJwc|creisle|12000Mn|00:00:00|PENDING|||all -1701003_1|MA_L1522785992-normal_batch-aUmErftiY7eEWvENfSeJwc|creisle|12000Mn|00:05:00|RUNNING|||all - """ - rows = _scheduler.SlurmScheduler().parse_sacct(content) - self.assertEqual(2, len(rows)) - self.assertEqual(_constants.JOB_STATUS.PENDING, rows[0]['status']) - self.assertEqual(_constants.JOB_STATUS.RUNNING, rows[1]['status']) - self.assertIs(None, rows[0]['task_ident']) - self.assertEqual(1, rows[1]['task_ident']) - - def test_resubmission_array(self): - content = """ -JobID|JobIDRaw|JobName|Partition|MaxVMSize|MaxVMSizeNode|MaxVMSizeTask|AveVMSize|MaxRSS|MaxRSSNode|MaxRSSTask|AveRSS|MaxPages|MaxPagesNode|MaxPagesTask|AvePages|MinCPU|MinCPUNode|MinCPUTask|AveCPU|NTasks|AllocCPUS|Elapsed|State|ExitCode|AveCPUFreq|ReqCPUFreqMin|ReqCPUFreqMax|ReqCPUFreqGov|ReqMem|ConsumedEnergy|MaxDiskRead|MaxDiskReadNode|MaxDiskReadTask|AveDiskRead|MaxDiskWrite|MaxDiskWriteNode|MaxDiskWriteTask|AveDiskWrite|AllocGRES|ReqGRES|ReqTRES|AllocTRES -1873472_162|1873671|MV_P02300_batch-egprnnYFaJtPtnECYfGiKf|all||||||||||||||||||1|10:18:26|COMPLETED|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1 
-1873472_162.batch|1873671.batch|batch||13703984K|n106|0|8708976K|11725204K|n106|0|6743424K|53K|n106|0|53K|10:06:31|n106|0|10:06:31|1|1|10:18:26|COMPLETED|0:0|2.19M|0|0|0|16000Mn|0|11767.09M|n106|0|11767.09M|29.74M|n106|0|29.74M||||cpu=1,mem=16000M,node=1 -1873472_163|1873672|MV_P02300_batch-egprnnYFaJtPtnECYfGiKf|all||||||||||||||||||1|08:09:50|COMPLETED|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1 -1873472_163.batch|1873672.batch|batch||13690948K|n106|0|8686468K|11712556K|n106|0|6721328K|45K|n106|0|45K|07:57:40|n106|0|07:57:40|1|1|08:09:50|COMPLETED|0:0|2.19M|0|0|0|16000Mn|0|13345.62M|n106|0|13345.62M|26.77M|n106|0|26.77M||||cpu=1,mem=16000M,node=1 -1873472_164|1873673|MV_P02300_batch-egprnnYFaJtPtnECYfGiKf|all||||||||||||||||||1|12:26:33|COMPLETED|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1 -1873472_164.batch|1873673.batch|batch||13730588K|n106|0|9577424K|11750552K|n106|0|6777084K|55K|n106|0|55K|12:13:52|n106|0|12:13:52|1|1|12:26:33|COMPLETED|0:0|2.19M|0|0|0|16000Mn|0|17065.30M|n106|0|17065.30M|34.39M|n106|0|34.39M||||cpu=1,mem=16000M,node=1 -1873472_165|1873674|MV_P02300_batch-egprnnYFaJtPtnECYfGiKf|all||||||||||||||||||1|05:32:32|COMPLETED|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1 -1873472_165.batch|1873674.batch|batch||13735224K|n106|0|9574752K|11756988K|n106|0|6773916K|52K|n106|0|52K|05:21:46|n106|0|05:21:46|1|1|05:32:32|COMPLETED|0:0|2.18M|0|0|0|16000Mn|0|15997.17M|n106|0|15997.17M|37.74M|n106|0|37.74M||||cpu=1,mem=16000M,node=1 -1873472_166|1873675|MV_P02300_batch-egprnnYFaJtPtnECYfGiKf|all||||||||||||||||||1|07:30:37|COMPLETED|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1 
-1873472_166.batch|1873675.batch|batch||13722476K|n106|0|8669768K|11742400K|n106|0|6702776K|53K|n106|0|53K|07:18:31|n106|0|07:18:31|1|1|07:30:37|COMPLETED|0:0|2.19M|0|0|0|16000Mn|0|14716.82M|n106|0|14716.82M|21.39M|n106|0|21.39M||||cpu=1,mem=16000M,node=1 -1873472_167|1873676|MV_P02300_batch-egprnnYFaJtPtnECYfGiKf|all||||||||||||||||||1|06:45:32|COMPLETED|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1 -1873472_167.batch|1873676.batch|batch||13686828K|n106|0|8596932K|11707132K|n106|0|6565180K|49K|n106|0|49K|06:35:26|n106|0|06:35:26|1|1|06:45:32|COMPLETED|0:0|2.19M|0|0|0|16000Mn|0|10274.82M|n106|0|10274.82M|39.37M|n106|0|39.37M||||cpu=1,mem=16000M,node=1 -1873472_168|1873677|MV_P02300_batch-egprnnYFaJtPtnECYfGiKf|all||||||||||||||||||1|16:00:06|TIMEOUT|0:0||Unknown|Unknown|Unknown|16000Mn||||||||||||cpu=1,mem=16000M,node=1|cpu=1,mem=16000M,node=1 -1873472_168.batch|1873677.batch|batch||13749848K|n106|0|8700272K|11771032K|n106|0|6734652K|46K|n106|0|46K|15:48:39|n106|0|15:48:39|1|1|16:00:07|CANCELLED|0:15|2.19M|0|0|0|16000Mn|0|10613.36M|n106|0|10613.36M|25.00M|n106|0|25.00M||||cpu=1,mem=16000M,node=1 - """ - rows = _scheduler.SlurmScheduler().parse_sacct(content) - complete = [ - row['status'] for row in rows if row['status'] == _constants.JOB_STATUS.COMPLETED - ] - fail = [row['status'] for row in rows if row['status'] == _constants.JOB_STATUS.CANCELLED] - self.assertEqual(6, len(complete)) - self.assertEqual(1, len(fail)) - - -class TestCancel(unittest.TestCase): - @mock.patch('mavis.schedule.scheduler.SlurmScheduler.command') - def test_single_job(self, patcher): - sched = _scheduler.SlurmScheduler() - job = _job.Job(SUBCOMMAND.VALIDATE, '', job_ident='1234') - sched.cancel(job) - self.assertEqual(_constants.JOB_STATUS.CANCELLED, job.status) - patcher.assert_called_with(['scancel', '1234']) - - @mock.patch('mavis.schedule.scheduler.SlurmScheduler.command') - def test_array_job(self, patcher): - sched = 
_scheduler.SlurmScheduler() - job = _job.ArrayJob(SUBCOMMAND.VALIDATE, 10, output_dir='', job_ident='1234') - sched.cancel(job) - self.assertEqual(_constants.JOB_STATUS.CANCELLED, job.status) - for task in job.task_list: - self.assertEqual(_constants.JOB_STATUS.CANCELLED, task.status) - patcher.assert_called_with(['scancel', '1234']) - - @mock.patch('mavis.schedule.scheduler.SlurmScheduler.command') - def test_array_job_task(self, patcher): - sched = _scheduler.SlurmScheduler() - job = _job.ArrayJob(SUBCOMMAND.VALIDATE, 10, output_dir='', job_ident='1234') - sched.cancel(job, task_ident=4) - self.assertEqual(_constants.JOB_STATUS.NOT_SUBMITTED, job.status) - for i, task in enumerate(job.task_list): - if i == 3: - self.assertEqual(_constants.JOB_STATUS.CANCELLED, task.status) - else: - self.assertEqual(_constants.JOB_STATUS.NOT_SUBMITTED, task.status) - patcher.assert_called_with(['scancel', '1234_4']) - - @mock.patch('mavis.schedule.scheduler.SlurmScheduler.command') - def test_bad_command(self, patcher): - patcher.side_effect = [subprocess.CalledProcessError(1, 'cmd')] - sched = _scheduler.SlurmScheduler() - job = _job.Job(SUBCOMMAND.VALIDATE, '', job_ident='1234') - with self.assertRaises(subprocess.CalledProcessError): - sched.cancel(job) - patcher.assert_called_with(['scancel', '1234']) diff --git a/tests/integration/schedule/test_torque.py b/tests/integration/schedule/test_torque.py deleted file mode 100644 index 50773687..00000000 --- a/tests/integration/schedule/test_torque.py +++ /dev/null @@ -1,441 +0,0 @@ -import subprocess -import unittest -from unittest import mock - -from mavis.schedule import scheduler as _scheduler -from mavis.schedule import constants as _constants -from mavis.schedule import job as _job -from mavis.constants import SUBCOMMAND - - -class TestParseQstat(unittest.TestCase): - - # TODO: single job running - # TODO: batch job running - # TODO: single job complete - - def test_single_job_complete(self): - content = """ -Job Id: 
9.torque01.bcgsc.ca - Job_Name = subtest.sh - Job_Owner = creisle@torque01.bcgsc.ca - resources_used.cput = 00:00:00 - resources_used.vmem = 346716kb - resources_used.walltime = 00:01:00 - resources_used.mem = 3624kb - resources_used.energy_used = 0 - job_state = C - queue = batch - server = torque01.bcgsc.ca - Checkpoint = u - ctime = Tue May 29 09:37:00 2018 - Error_Path = torque01.bcgsc.ca:/projects/trans_scratch/validations/workspa - ce/creisle/temp/subtest.sh.e9 - exec_host = torque01.bcgsc.ca/0 - Hold_Types = n - Join_Path = n - Keep_Files = n - Mail_Points = a - mtime = Tue May 29 09:38:01 2018 - Output_Path = torque01.bcgsc.ca:/projects/trans_scratch/validations/worksp - ace/creisle/temp/subtest.sh.o9 - Priority = 0 - qtime = Tue May 29 09:37:00 2018 - Rerunable = True - Resource_List.walltime = 01:00:00 - Resource_List.nodes = 1 - Resource_List.nodect = 1 - session_id = 25438 - Variable_List = PBS_O_QUEUE=batch,PBS_O_HOME=/home/creisle, - PBS_O_LOGNAME=creisle, - PBS_O_PATH=/home/creisle/applications/node-v10.1.0-linux-x64/bin:/hom - e/creisle/.npm-packages/bin:/home/creisle/bin:/home/creisle/applicatio - ns/centos06/python-3.6.1/bin:/projects/tumour_char/analysis_scripts/bi - n/pog:/gsc/software/linux-x86_64-centos6/git-2.12.0/bin/:/usr/local/bi - n:/usr/local/sbin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/p - rojects/trans_scratch/software/pipeline_commands/:/home/creisle/bin, - PBS_O_MAIL=/var/spool/mail/creisle,PBS_O_SHELL=/bin/bash, - PBS_O_LANG=en_US.UTF-8, - PBS_O_WORKDIR=/projects/trans_scratch/validations/workspace/creisle/t - emp,PBS_O_HOST=torque01.bcgsc.ca,PBS_O_SERVER=torque01.bcgsc.ca - euser = creisle - egroup = users - queue_type = E - comment = Job started on Tue May 29 at 09:37 - etime = Tue May 29 09:37:00 2018 - exit_status = 0 - submit_args = subtest.sh - start_time = Tue May 29 09:37:01 2018 - start_count = 1 - fault_tolerant = False - comp_time = Tue May 29 09:38:01 2018 - job_radix = 0 - total_runtime = 60.481239 - 
submit_host = torque01.bcgsc.ca - init_work_dir = /projects/trans_scratch/validations/workspace/creisle/temp - - request_version = 1 - - """ - rows = _scheduler.TorqueScheduler().parse_qstat(content) - self.assertEqual(1, len(rows)) - row = rows[0] - self.assertEqual(_constants.JOB_STATUS.COMPLETED, row['status']) - self.assertEqual('9.torque01.bcgsc.ca', row['job_ident']) - self.assertEqual('subtest.sh', row['name']) - self.assertIs(None, row['task_ident']) - self.assertEqual('', row['status_comment']) - - def test_array_job(self): - content = """ -Job Id: 48[1].torque01.bcgsc.ca - Job_Name = MA_mock-A47933_batch-JT3CUggKXNStHcoFXYaGR3-1 - Job_Owner = creisle@torque01.bcgsc.ca - job_state = C - queue = batch - server = torque01.bcgsc.ca - Checkpoint = u - ctime = Tue May 29 18:27:33 2018 - depend = afterokarray:43[].torque01.bcgsc.ca - Error_Path = torque01.bcgsc.ca:/projects/trans_scratch/validations/workspa - ce/creisle/temp/test_submission/output_torque/mock-A47933_diseased_tra - nscriptome/annotate/batch-JT3CUggKXNStHcoFXYaGR3-/job---.log-1 - Join_Path = oe - Keep_Files = n - Mail_Points = a - mtime = Tue May 29 18:27:33 2018 - Output_Path = torque01.bcgsc.ca:/projects/trans_scratch/validations/worksp - ace/creisle/temp/test_submission/output_torque/mock-A47933_diseased_tr - anscriptome/annotate/batch-JT3CUggKXNStHcoFXYaGR3-/job---.log-1 - Priority = 0 - qtime = Tue May 29 18:27:33 2018 - Rerunable = True - Resource_List.mem = 12000mb - Resource_List.walltime = 16:00:00 - Resource_List.nodes = 1 - Resource_List.nodect = 1 - Variable_List = PBS_ARRAYID=1,PBS_O_QUEUE=batch,PBS_O_HOME=/home/creisle, - PBS_O_LOGNAME=creisle, - PBS_O_PATH=/home/creisle/git/mavis/venv/bin:/home/creisle/application - s/node-v10.1.0-linux-x64/bin:/home/creisle/.npm-packages/bin:/home/cre - isle/bin:/home/creisle/applications/centos06/python-3.6.1/bin:/project - s/tumour_char/analysis_scripts/bin/pog:/gsc/software/linux-x86_64-cent - 
os6/git-2.12.0/bin/:/usr/local/bin:/usr/local/sbin:/usr/local/bin:/usr - /bin:/usr/local/sbin:/usr/sbin:/projects/trans_scratch/software/pipeli - ne_commands/:/home/creisle/bin,PBS_O_MAIL=/var/spool/mail/creisle, - PBS_O_SHELL=/bin/bash,PBS_O_LANG=en_US.UTF-8, - PBS_O_WORKDIR=/projects/trans_scratch/validations/workspace/creisle/t - emp/test_submission,PBS_O_HOST=torque01.bcgsc.ca, - PBS_O_SERVER=torque01.bcgsc.ca, - MANPATH=/home/creisle/.npm-packages/share/man:/home/creisle/applicati - ons/centos06/python-3.6.1/man:/usr/local/share/man:/usr/share/man/over - rides:/usr/share/man,XDG_SESSION_ID=1340,HOSTNAME=torque01.bcgsc.ca, - SHELL=/bin/bash,TERM=xterm-256color,HISTSIZE=1000,CLICOLOR=1, - SSH_CLIENT=10.9.202.242 35994 22,TMPDIR=/var/tmp/, - PYTHONUNBUFFERED=True,MAVIS_MIN_CLUSTERS_PER_FILE=2, - NODE_OPTIONS=--trace-warnings,SSH_TTY=/dev/pts/0,USER=creisle, - SVN_EDITOR=vim,LS_COLORS=di=34;01;47:mi=100;31;01:ln=36;01:ex=01;32, - MAVIS_SCHEDULER=TORQUE,VIRTUAL_ENV=/home/creisle/git/mavis/venv, - SACCT_FORMAT=jobid%-18\\,jobname%45\\,user%-8\\,reqmem\\,elapsed\\,state\\, - MaxRSS\\,AveRSS\\,Partition, - PATH=/home/creisle/git/mavis/venv/bin:/home/creisle/applications/node - -v10.1.0-linux-x64/bin:/home/creisle/.npm-packages/bin:/home/creisle/b - in:/home/creisle/applications/centos06/python-3.6.1/bin:/projects/tumo - ur_char/analysis_scripts/bin/pog:/gsc/software/linux-x86_64-centos6/gi - t-2.12.0/bin/:/usr/local/bin:/usr/local/sbin:/usr/local/bin:/usr/bin:/ - usr/local/sbin:/usr/sbin:/projects/trans_scratch/software/pipeline_com - mands/:/home/creisle/bin,MAIL=/var/spool/mail/creisle, - _=/usr/local/bin/qsub, - PWD=/projects/trans_scratch/validations/workspace/creisle/temp/test_s - ubmission,XMODIFIERS=@im=none,LANG=en_US.UTF-8, - MODULEPATH=/usr/share/Modules/modulefiles:/etc/modulefiles, - LOADEDMODULES=, - NODE_PATH=/home/creisle/.npm-packages/lib/node_modules, - SQUEUE_FORMAT=%.12i %9P %45j %.8u %.2t %.10M %.6D %.8m %.14l %.4c %.2 - 0R 
%E,HISTCONTROL=ignoredups,MAVIS_MAX_FILES=1,HOME=/home/creisle, - SHLVL=2,LOGNAME=creisle, - PYTHONPATH=/home/creisle/applications/centos06/python-3.6.1/bin:, - SSH_CONNECTION=10.9.202.242 35994 10.9.220.231 22, - ORIENTDB_HOME=/home/creisle/applications/orientdb/orientdb-community- - 2.2.34,MODULESHOME=/usr/share/Modules, - LESSOPEN=||/usr/bin/lesspipe.sh %s,BROWSER=/usr/bin/google-chrome, - NPM_PACKAGES=/home/creisle/.npm-packages, - XDG_RUNTIME_DIR=/run/user/1365, - BASH_FUNC_module()=() { eval `/usr/bin/modulecmd bash $*`\\ -} - euser = creisle - egroup = users - queue_type = E - comment = Job 48[].torque01.bcgsc.ca deleted because its dependency of arr - ay 43[].torque01.bcgsc.ca can never be satisfied - etime = Tue May 29 18:27:33 2018 - exit_status = 271 - submit_args = -j oe -l mem=12000mb -l walltime=16:00:00 -V -W depend=after - okarray:43[].torque01.bcgsc.ca -N MA_mock-A47933_batch-JT3CUggKXNStHco - FXYaGR3 -o /projects/trans_scratch/validations/workspace/creisle/temp/ - test_submission/output_torque/mock-A47933_diseased_transcriptome/annot - ate/batch-JT3CUggKXNStHcoFXYaGR3-/job---.log -t 1 /projects/trans_scra - tch/validations/workspace/creisle/temp/test_submission/output_torque/m - ock-A47933_diseased_transcriptome/annotate/submit.sh - job_array_id = 1 - fault_tolerant = False - job_radix = 0 - submit_host = torque01.bcgsc.ca - init_work_dir = /projects/trans_scratch/validations/workspace/creisle/temp - /test_submission - request_version = 1 - - """ - rows = _scheduler.TorqueScheduler().parse_qstat(content) - self.assertEqual(1, len(rows)) - row = rows[0] - self.assertEqual('48[].torque01.bcgsc.ca', row['job_ident']) - self.assertIs(1, row['task_ident']) - - # TODO: single job error - # TODO: batch job error - # TODO: single job exiting - # TODO: batch job exiting - - -class TestCancel(unittest.TestCase): - @mock.patch('mavis.schedule.scheduler.TorqueScheduler.command') - def test_single_job(self, patcher): - sched = _scheduler.TorqueScheduler() - 
job = _job.Job(SUBCOMMAND.VALIDATE, '', job_ident='1234') - sched.cancel(job) - self.assertEqual(_constants.JOB_STATUS.CANCELLED, job.status) - patcher.assert_called_with(['qdel', '1234']) - - @mock.patch('mavis.schedule.scheduler.TorqueScheduler.command') - def test_array_job(self, patcher): - sched = _scheduler.TorqueScheduler() - job = _job.ArrayJob(SUBCOMMAND.VALIDATE, 10, output_dir='', job_ident='1234') - sched.cancel(job) - self.assertEqual(_constants.JOB_STATUS.CANCELLED, job.status) - for task in job.task_list: - self.assertEqual(_constants.JOB_STATUS.CANCELLED, task.status) - patcher.assert_called_with(['qdel', '1234']) - - @mock.patch('mavis.schedule.scheduler.TorqueScheduler.command') - def test_array_job_task(self, patcher): - sched = _scheduler.TorqueScheduler() - job = _job.ArrayJob(SUBCOMMAND.VALIDATE, 10, output_dir='', job_ident='1234') - sched.cancel(job, task_ident='4') - self.assertEqual(_constants.JOB_STATUS.NOT_SUBMITTED, job.status) - for i, task in enumerate(job.task_list): - if i == 3: - self.assertEqual(_constants.JOB_STATUS.CANCELLED, task.status) - else: - self.assertEqual(_constants.JOB_STATUS.NOT_SUBMITTED, task.status) - patcher.assert_called_with(['qdel', '1234', '-t', '4']) - - @mock.patch('mavis.schedule.scheduler.TorqueScheduler.command') - def test_bad_command(self, patcher): - patcher.side_effect = [subprocess.CalledProcessError(1, 'cmd')] - sched = _scheduler.TorqueScheduler() - job = _job.Job(SUBCOMMAND.VALIDATE, '', job_ident='1234') - sched.cancel(job) - patcher.assert_called_with(['qdel', '1234']) - self.assertNotEqual(_constants.JOB_STATUS.CANCELLED, job.status) - - -class TestSubmit(unittest.TestCase): - @mock.patch('mavis.schedule.scheduler.TorqueScheduler.command') - def test_job(self, patcher): - patcher.side_effect = ['141.torque01.bcgsc.ca\n'] - job = _job.Job( - stage=SUBCOMMAND.VALIDATE, - queue='all', - output_dir='output_dir', - name='MV1', - memory_limit=1, - mail_user='me@example.com', - 
mail_type=_constants.MAIL_TYPE.ALL, - script='script.sh', - ) - - sched = _scheduler.TorqueScheduler() - sched.submit(job) - self.assertEqual('141.torque01.bcgsc.ca', job.job_ident) - patcher.assert_called_with( - [ - 'qsub', - '-j', - 'oe', - '-q', - 'all', - '-l', - 'mem=1mb', - '-l', - 'walltime=16:00:00', - '-V', - '-N', - 'MV1', - '-o', - 'output_dir/job-$PBS_JOBNAME-$PBS_JOBID.log', - '-m', - 'abef', - '-M', - 'me@example.com', - 'script.sh', - ] - ) - - @mock.patch('mavis.schedule.scheduler.TorqueScheduler.command') - def test_job_with_job_deps(self, patcher): - patcher.side_effect = ['141.torque01.bcgsc.ca\n'] - job = _job.Job( - stage=SUBCOMMAND.VALIDATE, - queue='all', - output_dir='output_dir', - name='MV1', - memory_limit=1, - mail_user='me@example.com', - mail_type=_constants.MAIL_TYPE.ALL, - script='script.sh', - dependencies=[ - _job.Job( - stage=SUBCOMMAND.VALIDATE, - output_dir='output_dir', - job_ident='1234.torque01.bcgsc.ca', - ), - _job.Job( - stage=SUBCOMMAND.VALIDATE, - output_dir='output_dir', - job_ident='54.torque01.bcgsc.ca', - ), - ], - ) - - sched = _scheduler.TorqueScheduler() - sched.submit(job) - self.assertEqual('141.torque01.bcgsc.ca', job.job_ident) - patcher.assert_called_with( - [ - 'qsub', - '-j', - 'oe', - '-q', - 'all', - '-l', - 'mem=1mb', - '-l', - 'walltime=16:00:00', - '-V', - '-W depend=afterok:1234.torque01.bcgsc.ca:54.torque01.bcgsc.ca', - '-N', - 'MV1', - '-o', - 'output_dir/job-$PBS_JOBNAME-$PBS_JOBID.log', - '-m', - 'abef', - '-M', - 'me@example.com', - 'script.sh', - ] - ) - - @mock.patch('mavis.schedule.scheduler.TorqueScheduler.command') - def test_job_with_mixed_deps(self, patcher): - patcher.side_effect = ['141.torque01.bcgsc.ca\n'] - job = _job.Job( - stage=SUBCOMMAND.VALIDATE, - queue='all', - output_dir='output_dir', - name='MV1', - memory_limit=1, - mail_user='me@example.com', - mail_type=_constants.MAIL_TYPE.ALL, - script='script.sh', - dependencies=[ - _job.Job( - stage=SUBCOMMAND.VALIDATE, - 
output_dir='output_dir', - job_ident='1234.torque01.bcgsc.ca', - ), - _job.Job( - stage=SUBCOMMAND.VALIDATE, - output_dir='output_dir', - job_ident='54.torque01.bcgsc.ca', - ), - _job.TorqueArrayJob( - stage=SUBCOMMAND.VALIDATE, - output_dir='output_dir', - job_ident='99[].torque01.bcgsc.ca', - task_list=5, - ), - ], - ) - - sched = _scheduler.TorqueScheduler() - sched.submit(job) - self.assertEqual('141.torque01.bcgsc.ca', job.job_ident) - patcher.assert_called_with( - [ - 'qsub', - '-j', - 'oe', - '-q', - 'all', - '-l', - 'mem=1mb', - '-l', - 'walltime=16:00:00', - '-V', - '-W depend=afterokarray:99[][5].torque01.bcgsc.ca,afterok:1234.torque01.bcgsc.ca:54.torque01.bcgsc.ca', - '-N', - 'MV1', - '-o', - 'output_dir/job-$PBS_JOBNAME-$PBS_JOBID.log', - '-m', - 'abef', - '-M', - 'me@example.com', - 'script.sh', - ] - ) - - @mock.patch('mavis.schedule.scheduler.TorqueScheduler.command') - def test_array(self, patcher): - patcher.side_effect = ['142[].torque01.bcgsc.ca\n'] - job = _job.TorqueArrayJob( - stage=SUBCOMMAND.VALIDATE, - queue='all', - output_dir='output_dir', - name='MV1', - memory_limit=1, - mail_user='me@example.com', - mail_type=_constants.MAIL_TYPE.ALL, - script='script.sh', - task_list=[1, 2, 3, 6, 9], - ) - - sched = _scheduler.TorqueScheduler(concurrency_limit=2) - sched.submit(job) - self.assertEqual('142[].torque01.bcgsc.ca', job.job_ident) - patcher.assert_called_with( - [ - 'qsub', - '-j', - 'oe', - '-q', - 'all', - '-l', - 'mem=1mb', - '-l', - 'walltime=16:00:00', - '-V', - '-N', - 'MV1', - '-o', - 'output_dir/job-$PBS_JOBNAME-$PBS_JOBID-$PBS_ARRAYID.log', - '-m', - 'abef', - '-M', - 'me@example.com', - '-t', - '1-3,6,9%2', - 'script.sh', - ] - ) diff --git a/tests/integration/test_args.py b/tests/integration/test_args.py index 33210a06..0dea660f 100644 --- a/tests/integration/test_args.py +++ b/tests/integration/test_args.py @@ -1,348 +1,398 @@ import argparse +import json import os -import unittest -from unittest.mock import patch import sys 
-from mavis.main import main as mavis_main +import tempfile +from unittest.mock import patch + +import pytest +from mavis import util from mavis.cluster import main as cluster_main +from mavis.main import main as mavis_main from mavis.validate import main as validate_main -from mavis import util -from . import ARGUMENT_ERROR from ..util import get_data -def expect_error(testcase, func, catchtype): +@pytest.fixture +def output_dir(): + temp_output = tempfile.mkdtemp() + yield temp_output + + +@pytest.fixture +def configpath(tmp_path): + p = tmp_path / "config.json" + return p + + +def expect_error(testcase, func, catchtype=None): try: func() - except catchtype as err: - return err - else: + except (SystemExit, Exception) as err: + if catchtype is None or isinstance(err, catchtype): + return err raise AssertionError('Did not throw the expected error', catchtype) -class TestCluster(unittest.TestCase): - def test_trans_multiple_annotations_no_masking(self): +class TestCluster: + def test_trans_multiple_annotations_no_masking(self, configpath, output_dir): + configpath.write_text( + json.dumps( + { + 'reference.annotations': [ + get_data('example_genes.json'), + get_data('mock_annotations.json'), + ], + 'libraries': { + 'translib': { + 'disease_status': 'diseased', + 'protocol': 'transcriptome', + 'assign': [get_data('mock_sv_events.tsv')], + } + }, + 'output_dir': output_dir, + } + ) + ) args = [ 'mavis', 'cluster', - '--annotations', - get_data('example_genes.json'), - get_data('mock_annotations.json'), '--library', 'translib', - '--protocol', - 'transcriptome', - '--disease_status', - 'diseased', - '--input', + '--inputs', get_data('mock_sv_events.tsv'), '--output', - 'outdir', + output_dir, + '--config', + str(configpath), ] with patch.object(cluster_main, 'main', util.DEVNULL): with patch.object(sys, 'argv', args): mavis_main() - def test_trans_multiple_annotations_with_masking(self): + def test_trans_multiple_annotations_with_masking(self, configpath, output_dir): 
+ configpath.write_text( + json.dumps( + { + 'libraries': { + 'translib': { + 'disease_status': 'diseased', + 'protocol': 'transcriptome', + 'assign': [get_data('mock_sv_events.tsv')], + } + }, + 'cluster.uninformative_filter': True, + 'reference.annotations': [ + get_data('example_genes.json'), + get_data('mock_annotations.json'), + ], + 'reference.masking': [get_data('mock_masking.tab')], + 'output_dir': output_dir, + } + ) + ) args = [ 'mavis', 'cluster', - '--annotations', - get_data('example_genes.json'), - get_data('mock_annotations.json'), '--library', 'translib', - '--protocol', - 'transcriptome', - '--disease_status', - 'diseased', - '--input', + '--inputs', get_data('mock_sv_events.tsv'), '--output', - 'outdir', - '--masking', - get_data('mock_masking.tab'), + output_dir, + '--config', + str(configpath), ] with patch.object(cluster_main, 'main', util.DEVNULL): with patch.object(sys, 'argv', args): mavis_main() - def test_error_missing_annotations_translib_uninform(self): - args = [ - 'mavis', - 'cluster', - '--library', - 'translib', - '--protocol', - 'transcriptome', - '--disease_status', - 'diseased', - '--input', - get_data('mock_sv_events.tsv'), - '--output', - 'outdir', - '--uninformative_filter', - 'True', - ] - with patch.object(cluster_main, 'main', util.DEVNULL): - with patch.object(sys, 'argv', args): - err = expect_error(self, mavis_main, SystemExit) - self.assertEqual(ARGUMENT_ERROR, err.code) - - def test_ok_missing_annotations_translib_nofilter(self): - args = [ - 'mavis', - 'cluster', - '--library', - 'translib', - '--protocol', - 'transcriptome', - '--disease_status', - 'diseased', - '--input', - get_data('mock_sv_events.tsv'), - '--output', - 'outdir', - ] + def test_error_missing_annotations_translib_uninform(self, configpath, output_dir): + configpath.write_text( + json.dumps( + { + 'libraries': { + 'translib': { + 'disease_status': 'diseased', + 'protocol': 'transcriptome', + 'assign': [get_data('mock_sv_events.tsv')], + } + }, + 
'cluster.uninformative_filter': True, + 'output_dir': output_dir, + } + ) + ) + args = ['mavis', 'cluster', '--library', 'translib', '--output', output_dir] with patch.object(cluster_main, 'main', util.DEVNULL): with patch.object(sys, 'argv', args): - mavis_main() - + expect_error(self, mavis_main) -class TestValidate(unittest.TestCase): - def test_error_missing_annotations_translib(self): - args = [ - 'mavis', - 'validate', - '--library', - 'translib', - '--protocol', - 'transcriptome', - '--bam_file', - get_data('mock_trans_reads_for_events.sorted.bam'), - '--stdev_fragment_size', - '50', - '--median_fragment_size', - '200', - '--input', - get_data('mock_sv_events.tsv'), - '--output', - 'outdir', - '--reference_genome', - get_data('mock_reference_genome.fa'), - '--aligner_reference', - get_data('mock_reference_genome.fa'), - '--read_length', - '125', - ] - with patch.object(validate_main, 'main', util.DEVNULL): - with patch.object(sys, 'argv', args): - err = expect_error(self, mavis_main, SystemExit) - self.assertEqual(ARGUMENT_ERROR, err.code) - def test_ok_missing_annotations_genome(self): +class TestValidate: + def test_error_missing_annotations_translib(self, configpath, output_dir): + configpath.write_text( + json.dumps( + { + 'libraries': { + 'translib': { + 'disease_status': 'diseased', + 'protocol': 'transcriptome', + 'assign': [get_data('mock_sv_events.tsv')], + 'bam_file': get_data('mock_trans_reads_for_events.sorted.bam'), + 'read_length': 125, + 'median_fragment_size': 200, + 'stdev_fragment_size': 50, + } + }, + 'cluster.uninformative_filter': True, + 'reference.reference_genome': [get_data('mock_reference_genome.fa')], + 'reference.aligner_reference': [get_data('mock_reference_genome.fa')], + 'output_dir': output_dir, + } + ) + ) args = [ 'mavis', 'validate', '--library', 'translib', - '--protocol', - 'genome', - '--bam_file', - get_data('mock_trans_reads_for_events.sorted.bam'), - '--stdev_fragment_size', - '50', - '--median_fragment_size', - 
'200', '--input', get_data('mock_sv_events.tsv'), '--output', - 'outdir', - '--reference_genome', - get_data('mock_reference_genome.fa'), - '--aligner_reference', - get_data('mock_reference_genome.fa'), - '--read_length', - '125', + output_dir, + '--config', + str(configpath), ] with patch.object(validate_main, 'main', util.DEVNULL): with patch.object(sys, 'argv', args): - mavis_main() + expect_error(self, mavis_main) - def test_ok_multi_ref_genome(self): + def test_ok_multi_ref_genome(self, configpath, output_dir): + configpath.write_text( + json.dumps( + { + 'libraries': { + 'translib': { + 'disease_status': 'diseased', + 'protocol': 'genome', + 'assign': [get_data('mock_sv_events.tsv')], + 'bam_file': get_data('mock_trans_reads_for_events.sorted.bam'), + 'read_length': 125, + 'median_fragment_size': 200, + 'stdev_fragment_size': 50, + } + }, + 'reference.annotations': [ + get_data('example_genes.json'), + get_data('mock_annotations.json'), + ], + 'cluster.uninformative_filter': True, + 'reference.reference_genome': [ + get_data('mock_reference_genome.fa'), + get_data('example_genes.fa'), + ], + 'reference.aligner_reference': [get_data('mock_reference_genome.fa')], + 'output_dir': output_dir, + } + ) + ) args = [ 'mavis', 'validate', '--library', 'translib', - '--protocol', - 'genome', - '--bam_file', - get_data('mock_trans_reads_for_events.sorted.bam'), - '--stdev_fragment_size', - '50', - '--median_fragment_size', - '200', '--input', get_data('mock_sv_events.tsv'), '--output', - 'outdir', - '--reference_genome', - get_data('mock_reference_genome.fa'), - get_data('example_genes.fa'), - '--aligner_reference', - get_data('mock_reference_genome.fa'), - '--read_length', - '125', + output_dir, + '--config', + str(configpath), ] with patch.object(validate_main, 'main', util.DEVNULL): with patch.object(sys, 'argv', args): mavis_main() - def test_error_multi_aligner_ref(self): - args = [ - 'mavis', - 'validate', - '--library', - 'translib', - '--protocol', - 'genome', - 
'--bam_file', - get_data('mock_trans_reads_for_events.sorted.bam'), - '--stdev_fragment_size', - '50', - '--median_fragment_size', - '200', - '--input', - get_data('mock_sv_events.tsv'), - '--output', - 'outdir', - '--reference_genome', - get_data('mock_reference_genome.fa'), - '--aligner_reference', - get_data('mock_reference_genome.fa'), - get_data('example_genes.fa'), - '--read_length', - '125', - ] - with patch.object(validate_main, 'main', util.DEVNULL): - with patch.object(sys, 'argv', args): - err = expect_error(self, mavis_main, SystemExit) - self.assertEqual(ARGUMENT_ERROR, err.code) - - def test_error_missing_aligner_ref(self): + def test_error_multi_aligner_ref(self, configpath, output_dir): + configpath.write_text( + json.dumps( + { + 'libraries': { + 'translib': { + 'disease_status': 'diseased', + 'protocol': 'genome', + 'assign': [get_data('mock_sv_events.tsv')], + 'bam_file': get_data('mock_trans_reads_for_events.sorted.bam'), + 'read_length': 125, + 'median_fragment_size': 200, + 'stdev_fragment_size': 50, + } + }, + 'reference.annotations': [ + get_data('example_genes.json'), + get_data('mock_annotations.json'), + ], + 'cluster.uninformative_filter': True, + 'reference.reference_genome': [ + get_data('mock_reference_genome.fa'), + get_data('example_genes.fa'), + ], + 'reference.aligner_reference': [ + get_data('mock_reference_genome.fa'), + get_data('example_genes.fa'), + ], + 'output_dir': output_dir, + } + ) + ) args = [ 'mavis', 'validate', '--library', 'translib', - '--protocol', - 'genome', - '--bam_file', - get_data('mock_trans_reads_for_events.sorted.bam'), - '--stdev_fragment_size', - '50', - '--median_fragment_size', - '200', '--input', get_data('mock_sv_events.tsv'), '--output', - 'outdir', - '--reference_genome', - get_data('mock_reference_genome.fa'), - '--read_length', - '125', + output_dir, + '--config', + str(configpath), ] with patch.object(validate_main, 'main', util.DEVNULL): with patch.object(sys, 'argv', args): - err = 
expect_error(self, mavis_main, SystemExit) - self.assertEqual(ARGUMENT_ERROR, err.code) + expect_error(self, mavis_main) - def test_error_missing_reference_genome(self): + def test_error_missing_aligner_ref(self, configpath, output_dir): + configpath.write_text( + json.dumps( + { + 'libraries': { + 'translib': { + 'disease_status': 'diseased', + 'protocol': 'genome', + 'assign': [get_data('mock_sv_events.tsv')], + 'bam_file': get_data('mock_trans_reads_for_events.sorted.bam'), + 'read_length': 125, + 'median_fragment_size': 200, + 'stdev_fragment_size': 50, + } + }, + 'reference.annotations': [ + get_data('example_genes.json'), + get_data('mock_annotations.json'), + ], + 'cluster.uninformative_filter': True, + 'reference.reference_genome': [ + get_data('mock_reference_genome.fa'), + get_data('example_genes.fa'), + ], + 'output_dir': output_dir, + } + ) + ) args = [ 'mavis', 'validate', '--library', 'translib', - '--protocol', - 'genome', - '--bam_file', - get_data('mock_trans_reads_for_events.sorted.bam'), - '--stdev_fragment_size', - '50', - '--median_fragment_size', - '200', '--input', get_data('mock_sv_events.tsv'), '--output', - 'outdir', - '--aligner_reference', - get_data('mock_reference_genome.fa'), - '--read_length', - '125', + output_dir, + '--config', + str(configpath), ] with patch.object(validate_main, 'main', util.DEVNULL): with patch.object(sys, 'argv', args): - err = expect_error(self, mavis_main, SystemExit) - self.assertEqual(ARGUMENT_ERROR, err.code) + expect_error(self, mavis_main) - def test_error_bad_aligner_ref(self): + def test_error_missing_reference_genome(self, configpath, output_dir): + configpath.write_text( + json.dumps( + { + 'libraries': { + 'translib': { + 'disease_status': 'diseased', + 'protocol': 'genome', + 'assign': [get_data('mock_sv_events.tsv')], + 'bam_file': get_data('mock_trans_reads_for_events.sorted.bam'), + 'read_length': 125, + 'median_fragment_size': 200, + 'stdev_fragment_size': 50, + } + }, + 
'reference.annotations': [ + get_data('example_genes.json'), + get_data('mock_annotations.json'), + ], + 'cluster.uninformative_filter': True, + 'reference.aligner_reference': [ + get_data('mock_reference_genome.fa'), + get_data('example_genes.fa'), + ], + 'output_dir': output_dir, + } + ) + ) args = [ 'mavis', 'validate', '--library', 'translib', - '--protocol', - 'genome', - '--bam_file', - get_data('mock_trans_reads_for_events.sorted.bam'), - '--stdev_fragment_size', - '50', - '--median_fragment_size', - '200', '--input', get_data('mock_sv_events.tsv'), '--output', - 'outdir', - '--reference_genome', - get_data('mock_reference_genome.fa'), - '--aligner_reference', - 'bad', - '--read_length', - '125', + output_dir, + '--config', + str(configpath), ] with patch.object(validate_main, 'main', util.DEVNULL): with patch.object(sys, 'argv', args): - err = expect_error(self, mavis_main, SystemExit) - self.assertEqual(ARGUMENT_ERROR, err.code) + expect_error(self, mavis_main) - def test_error_none_aligner_ref(self): + def test_error_bad_aligner_ref(self, configpath, output_dir): + configpath.write_text( + json.dumps( + { + 'libraries': { + 'translib': { + 'disease_status': 'diseased', + 'protocol': 'genome', + 'assign': [get_data('mock_sv_events.tsv')], + 'bam_file': get_data('mock_trans_reads_for_events.sorted.bam'), + 'read_length': 125, + 'median_fragment_size': 200, + 'stdev_fragment_size': 50, + } + }, + 'reference.annotations': [ + get_data('example_genes.json'), + get_data('mock_annotations.json'), + ], + 'cluster.uninformative_filter': True, + 'reference.reference_genome': [ + get_data('mock_reference_genome.fa'), + get_data('example_genes.fa'), + ], + 'reference.aligner_reference': [ + 'fake_path', + ], + 'output_dir': output_dir, + } + ) + ) args = [ 'mavis', 'validate', '--library', 'translib', - '--protocol', - 'genome', - '--bam_file', - get_data('mock_trans_reads_for_events.sorted.bam'), - '--stdev_fragment_size', - '50', - '--median_fragment_size', - 
'200', '--input', get_data('mock_sv_events.tsv'), '--output', - 'outdir', - '--reference_genome', - get_data('mock_reference_genome.fa'), - '--aligner_reference', - 'none', - '--read_length', - '125', + output_dir, + '--config', + str(configpath), ] with patch.object(validate_main, 'main', util.DEVNULL): with patch.object(sys, 'argv', args): - err = expect_error(self, mavis_main, SystemExit) - self.assertEqual(ARGUMENT_ERROR, err.code) + expect_error(self, mavis_main) diff --git a/tests/integration/test_checker.py b/tests/integration/test_checker.py deleted file mode 100644 index d77c4453..00000000 --- a/tests/integration/test_checker.py +++ /dev/null @@ -1,111 +0,0 @@ -import errno -import os -import unittest -from unittest.mock import mock_open, patch - -from mavis.schedule import job as _job -from mavis.schedule import pipeline as _pipeline - -MOCK_GENOME = 'mock-A36971' -MOCK_TRANS = 'mock-A47933' -ERROR_MESSAGE = """Traceback (most recent call last): - File "/home/dpaulino/gitrepo/mavis/venv/bin/mavis_run.py", line 6, in - exec(compile(open(__file__).read(), __file__, 'exec')) - File "/home/dpaulino/gitrepo/mavis/bin/mavis_run.py", line 7, in - from mavis.annotate import load_reference_genes, load_reference_genome, load_masking_regions, load_templates - File "/home/dpaulino/gitrepo/mavis/mavis/__init__.py", line 6, in - __version__ = get_version() - File "/home/dpaulino/gitrepo/mavis/mavis/util.py", line 32, in get_version - v = subprocess.check_output('cd {}; git describe'.format(os.path.dirname(__file__)), shell=True) - File "/projects/tumour_char/analysis_scripts/python/centos06/python-3.6.0/lib/python3.6/subprocess.py", line 336, in check_output - **kwargs).stdout - File "/projects/tumour_char/analysis_scripts/python/centos06/python-3.6.0/lib/python3.6/subprocess.py", line 418, in run - output=stdout, stderr=stderr) -subprocess.CalledProcessError: Command 'cd /home/dpaulino/gitrepo/mavis/mavis; git describe' returned non-zero exit status 127.""" - - -def 
mkdirs(newdir, mode=0o777): - """ - make directories and ignores if it already exists. - """ - try: - os.makedirs(newdir, mode) - except OSError as err: - # Reraise the error unless it's about an already existing directory - if err.errno != errno.EEXIST or not os.path.isdir(newdir): - raise err - - -class TestParseLogFile(unittest.TestCase): - def mock_log(self, content): - mockopen = mock_open(read_data=content) - with patch('builtins.open', mockopen), patch('os.path.isfile') as isfile, patch( - '__main__.open', mockopen - ): - isfile.return_value = True - return _job.LogFile.parse('log') - - def test_command_not_found_error(self): - log = self.mock_log( - "stty: standard input: Inappropriate ioctl for device\n" - "/opt/slurm/spool/slurmd/job814329/slurm_script: line 9: mavis: command not found\n" - ) - self.assertEqual(_job.LogFile.STATUS.CRASH, log.status) - - def test_python_index_error(self): - content = """ -Traceback (most recent call last): - File "/home/creisle/git/mavis/venv/bin/mavis", line 11, in - load_entry_point('mavis===v0.1.0-220-g3f65e68', 'console_scripts', 'mavis')() - File "/home/creisle/git/mavis/venv/lib/python3.6/site-packages/mavis-v0.1.0_220_g3f65e68-py3.6.egg/mavis/main.py", line 554, in main - check_completion(args.output) - File "/home/creisle/git/mavis/venv/lib/python3.6/site-packages/mavis-v0.1.0_220_g3f65e68-py3.6.egg/mavis/main.py", line 450, in check_completion - cur_time = check_single_job(d) - File "/home/creisle/git/mavis/venv/lib/python3.6/site-packages/mavis-v0.1.0_220_g3f65e68-py3.6.egg/mavis/main.py", line 429, in check_single_job - check_log(max(log_files, key=os.path.getctime)) - File "/home/creisle/git/mavis/venv/lib/python3.6/site-packages/mavis-v0.1.0_220_g3f65e68-py3.6.egg/mavis/main.py", line 359, in check_log - if 'error' in lines[-1].lower(): -IndexError: list index out of range""" - log = self.mock_log(content) - self.assertEqual(_job.LogFile.STATUS.CRASH, log.status) - - def test_python_keyerror(self): - content = 
"KeyError: ('cannot check membership column. column not found in header', 'protocol', {'break2_orientation', 'break1_chromosome', 'break1_orientation', 'tools', 'defuse_cluster_id', 'break1_position_end', 'event_type', 'defuse_split_read_count', 'break2_chromosome', 'break2_position_end', 'stranded', 'defuse_spanning_read_count', 'break2_strand', 'library', 'break1_position_start', 'defuse_probability', 'untemplated_seq', 'opposing_strands', 'break1_strand', 'break2_position_start'})" - log = self.mock_log(content) - self.assertEqual(_job.LogFile.STATUS.CRASH, log.status) - - def test_empty_log(self): - log = self.mock_log("") - self.assertEqual(_job.LogFile.STATUS.EMPTY, log.status) - log = self.mock_log("\n\n") - self.assertEqual(_job.LogFile.STATUS.EMPTY, log.status) - - def test_incomplete_log(self): - log = self.mock_log("other\n") - self.assertEqual(_job.LogFile.STATUS.INCOMPLETE, log.status) - log = self.mock_log("thing") - self.assertEqual(_job.LogFile.STATUS.INCOMPLETE, log.status) - - -class TestModule(unittest.TestCase): - def test_parse_run_time_none(self): - content = "" - mockopen = mock_open(read_data=content) - with patch('builtins.open', mockopen), patch('os.path.isfile') as isfile, patch( - '__main__.open', mockopen - ), patch('os.path.getmtime') as getmtime: - getmtime.return_value = 1 - isfile.return_value = True - result = _pipeline.parse_run_time('log') - self.assertEqual(-1, result) - - def test_parse_valid_run_time(self): - content = "[2018-03-06 15:25:46.153560] complete: MAVIS.COMPLETE\nrun time (hh/mm/ss): 0:06:41\nrun time (s): 1\n" - mockopen = mock_open(read_data=content) - with patch('builtins.open', mockopen), patch('os.path.isfile') as isfile, patch( - '__main__.open', mockopen - ), patch('os.path.getmtime') as getmtime: - getmtime.return_value = 1 - isfile.return_value = True - result = _pipeline.parse_run_time('log') - self.assertEqual(1, result) diff --git a/tests/integration/test_config.py b/tests/integration/test_config.py 
deleted file mode 100644 index bb1c5e1f..00000000 --- a/tests/integration/test_config.py +++ /dev/null @@ -1,64 +0,0 @@ -import unittest -from unittest.mock import mock_open, patch -import configparser - -from mavis.config import MavisConfig - - -STUB = """ -[reference] -template_metadata = tests/data/cytoBand.txt -annotations = tests/data/mock_annotations.json -masking = tests/data/mock_masking.tab -reference_genome = tests/data/mock_reference_genome.fa -aligner_reference = tests/data/mock_reference_genome.2bit -dgv_annotation = tests/data/mock_dgv_annotation.txt - -[mock-A36971] -read_length = 150 -median_fragment_size = 400 -stdev_fragment_size = 97 -bam_file = tests/data/mock_reads_for_events.sorted.bam -protocol = genome -inputs = mock_converted -strand_specific = False -disease_status=diseased - -[mock-A47933] -read_length = 75 -median_fragment_size = 188 -stdev_fragment_size = 50 -bam_file = tests/data/mock_trans_reads_for_events.sorted.bam -protocol = transcriptome -inputs = mock_converted -strand_specific = True -disease_status=diseased - -[convert] -assume_no_untemplated = True -# addfile twice to check this notation is ok (will collapse them anyway) -mock_converted = convert_tool_output - tests/data/mock_sv_events.tsv - tests/data/mock_sv_events.tsv - mavis - False -""" - - -class TestConfig(unittest.TestCase): - def mock_config(self, content=""): - with patch('configparser.ConfigParser.read', configparser.ConfigParser.read_string), patch( - 'os.path.isfile' - ) as isfile, patch('os.path.exists') as exists: - isfile.return_value = True - exists.return_value = True - return MavisConfig.read(content) - - def test_error_in_schedule(self): - with self.assertRaises(TypeError): - content = STUB + '\n[schedule]\nmail_type=\n' - print(content) - self.mock_config(content) - - def test_ok(self): - self.mock_config(STUB) diff --git a/tests/integration/test_mains.py b/tests/integration/test_mains.py deleted file mode 100644 index 4fa1478c..00000000 --- 
a/tests/integration/test_mains.py +++ /dev/null @@ -1,135 +0,0 @@ -import glob -import os -import re -import shutil -from tempfile import mkdtemp -import unittest -from unittest import mock - -from mavis.annotate.file_io import ( - load_reference_genes, - load_reference_genome, - load_templates, - ReferenceFile, - load_annotations, -) -from mavis.annotate.main import main as annotate_main -from mavis.cluster.main import main as cluster_main -from mavis.constants import DISEASE_STATUS, PROTOCOL -from mavis.validate.main import main as validate_main -import pysam - -from . import RUN_FULL -from ..util import get_data - -annotations = None -reference_genome = None -template_metadata = None -trans_bam_fh = None -genome_bam_fh = None -masking = mock.Mock(content={}) # do not mask - - -def setUpModule(): - global annotations, reference_genome, template_metadata, genome_bam_fh, trans_bam_fh, masking - print('setup start') - annotations = ReferenceFile('annotations', get_data('mock_annotations.json')) - reference_genome = ReferenceFile( - 'reference_genome', get_data('mock_reference_genome.fa'), eager_load=True - ) - template_metadata = ReferenceFile( - 'template_metadata', get_data('cytoBand.txt'), eager_load=True - ) - genome_bam_fh = pysam.AlignmentFile(get_data('mock_reads_for_events.sorted.bam')) - trans_bam_fh = pysam.AlignmentFile(get_data('mock_trans_reads_for_events.sorted.bam')) - print('setup loading is complete') - - -def tearDownModule(): - trans_bam_fh.close() - genome_bam_fh.close() - - -@unittest.skipIf( - not RUN_FULL, 'slower tests will not be run unless the environment variable RUN_FULL is given' -) -class TestPipeline(unittest.TestCase): - def setUp(self): - self.output = mkdtemp() - - def tearDown(self): - shutil.rmtree(self.output) - - @unittest.skipIf(not shutil.which('blat'), 'missing the blat command') - def test_mains(self): - # test the clustering - cluster_files = cluster_main( - [get_data('mock_sv_events.tsv')], - self.output, - False, - 
'mock-A36971', - PROTOCOL.GENOME, - DISEASE_STATUS.DISEASED, - limit_to_chr=[None], - log_args=True, - masking=masking, - cluster_clique_size=15, - cluster_radius=20, - uninformative_filter=True, - max_proximity=5000, - annotations=annotations, - min_clusters_per_file=5, - max_files=1, - ) - self.assertGreaterEqual(100, len(cluster_files)) - self.assertLessEqual(1, len(cluster_files)) - # next test the validate runs without errors - validate_main( - [cluster_files[0]], - self.output, - genome_bam_fh, - False, - 'mock-A36971', - PROTOCOL.GENOME, - median_fragment_size=427, - stdev_fragment_size=106, - read_length=150, - reference_genome=reference_genome, - annotations=annotations, - masking=masking, - aligner_reference=ReferenceFile( - 'aligner_reference', get_data('mock_reference_genome.2bit') - ), - ) - for suffix in [ - 'validation-passed.tab', - 'validation-failed.tab', - 'raw_evidence.bam', - 'raw_evidence.sorted.bam', - 'raw_evidence.sorted.bam.bai', - 'contigs.sorted.bam', - 'contigs.sorted.bam.bai', - 'contigs.bam', - 'igv.batch', - ]: - self.assertTrue(os.path.exists(os.path.join(self.output, suffix))) - - # test the annotation - annotate_main( - [os.path.join(self.output, 'validation-passed.tab')], - self.output, - 'mock-A36971', - PROTOCOL.GENOME, - reference_genome, - annotations, - template_metadata, - min_domain_mapping_match=0.95, - min_orf_size=300, - max_orf_cap=3, - ) - self.assertTrue(os.path.exists(os.path.join(self.output, 'annotations.tab'))) - self.assertTrue(os.path.exists(os.path.join(self.output, 'annotations.fusion-cdna.fa'))) - drawings_dir = os.path.join(self.output, 'drawings') - self.assertTrue(os.path.exists(drawings_dir)) - self.assertLessEqual(1, len(glob.glob(os.path.join(drawings_dir, '*.svg')))) - self.assertLessEqual(1, len(glob.glob(os.path.join(drawings_dir, '*.legend.json')))) diff --git a/tests/mini-tutorial.config.json b/tests/mini-tutorial.config.json new file mode 100644 index 00000000..00fd4d50 --- /dev/null +++ 
b/tests/mini-tutorial.config.json @@ -0,0 +1,64 @@ +{ + "annotate.draw_fusions_only": false, + "convert": { + "mock_converted": { + "inputs": [ + "tests/data/mock_sv_events.tsv" + ], + "file_type": "mavis", + "assume_no_untemplated": true + } + }, + "cluster.uninformative_filter": true, + "cluster.limit_to_chr": null, + "cluster.min_clusters_per_file": 5, + "libraries": { + "mock-A47933": { + "assign": [ + "tests/data/mock_trans_sv_events.tsv" + ], + "bam_file": "tests/data/mock_trans_reads_for_events.sorted.bam", + "disease_status": "diseased", + "median_fragment_size": 188, + "protocol": "transcriptome", + "read_length": 75, + "stdev_fragment_size": 50, + "strand_specific": true + }, + "mock-A36971": { + "assign": [ + "mock_converted" + ], + "bam_file": "tests/data/mock_reads_for_events.sorted.bam", + "disease_status": "diseased", + "median_fragment_size": 400, + "protocol": "genome", + "read_length": 150, + "stdev_fragment_size": 97, + "strand_specific": false + } + }, + "output_dir": "output_dir", + "reference.aligner_reference": [ + "tests/data/mock_reference_genome.2bit" + ], + "reference.annotations": [ + "tests/data/mock_annotations.json" + ], + "reference.dgv_annotation": [ + "tests/data/mock_dgv_annotation.txt" + ], + "reference.masking": [ + "tests/data/mock_masking.tab" + ], + "reference.reference_genome": [ + "tests/data/mock_reference_genome.fa" + ], + "reference.template_metadata": [ + "tests/data/cytoBand.txt" + ], + "summary.filter_min_remapped_reads": 5, + "summary.filter_min_spanning_reads": 5, + "summary.filter_min_linking_split_reads": 1, + "summary.filter_min_flanking_reads": 10 +} diff --git a/mavis/schedule/__init__.py b/tests/snakemake/__init__.py similarity index 100% rename from mavis/schedule/__init__.py rename to tests/snakemake/__init__.py diff --git a/tests/snakemake/test_mini_workflow.py b/tests/snakemake/test_mini_workflow.py new file mode 100644 index 00000000..37e81b56 --- /dev/null +++ b/tests/snakemake/test_mini_workflow.py @@ 
-0,0 +1,55 @@ +import json +import os +import shutil +import sys +import tempfile +from unittest.mock import patch + +import pytest + +from snakemake import main as snakemake_main + +from ..util import glob_exists, package_relative_file + + +@pytest.fixture +def output_dir(): + temp_output = tempfile.mkdtemp() + + os.makedirs(os.path.join(temp_output, 'mavis/schemas')) + + with open(package_relative_file('tests/mini-tutorial.config.json'), 'r') as fh: + config = json.load(fh) + config['output_dir'] = os.path.join(temp_output, 'output_dir') + with open(os.path.join(temp_output, 'mini-tutorial.config.json'), 'w') as fh: + fh.write(json.dumps(config)) + yield temp_output + shutil.rmtree(temp_output) + + +def test_workflow(output_dir): + argv = [ + 'snakemake', + '-s', + package_relative_file('Snakefile'), + '-j', + '1', + '--configfile', + os.path.join(output_dir, 'mini-tutorial.config.json'), + '-d', + package_relative_file(), + ] + with patch.object(sys, 'argv', argv): + try: + snakemake_main() + assert glob_exists(os.path.join(output_dir, 'summary', 'MAVIS.COMPLETE')) + assert glob_exists(os.path.join(output_dir, 'pairing', 'MAVIS.COMPLETE')) + assert glob_exists(os.path.join(output_dir, 'mock-A47933', 'cluster', 'MAVIS.COMPLETE')) + assert glob_exists(os.path.join(output_dir, 'mock-A47933', 'validate', '*', 'MAVIS.COMPLETE')) + assert glob_exists(os.path.join(output_dir, 'mock-A47933', 'annotate', '*', 'MAVIS.COMPLETE')) + assert glob_exists(os.path.join(output_dir, 'mock-A36971', 'cluster', 'MAVIS.COMPLETE')) + assert glob_exists(os.path.join(output_dir, 'mock-A36971', 'validate', '*', 'MAVIS.COMPLETE')) + assert glob_exists(os.path.join(output_dir, 'mock-A36971', 'annotate', '*', 'MAVIS.COMPLETE')) + except SystemExit as err: + if err.code != 0: + raise err diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py deleted file mode 100644 index 0edf3629..00000000 --- a/tests/unit/test_config.py +++ /dev/null @@ -1,67 +0,0 @@ -import unittest -from 
argparse import ArgumentTypeError - -from mavis.config import float_fraction, nameable_string - - -class TestFloatFraction(unittest.TestCase): - def test_bad_string(self): - with self.assertRaises(ArgumentTypeError): - float_fraction('a') - - def test_float_too_big(self): - with self.assertRaises(ArgumentTypeError): - float_fraction('1.1') - - def test_float_negative_error(self): - with self.assertRaises(ArgumentTypeError): - float_fraction('-0.1') - - def test_zero_ok(self): - self.assertEqual(0, float_fraction('0')) - - def test_one_ok(self): - self.assertEqual(1, float_fraction('1')) - - -class TestNoReservedChars(unittest.TestCase): - def test_semicolon_error(self): - with self.assertRaises(TypeError): - nameable_string('thing;thing') - - def test_comma_error(self): - with self.assertRaises(TypeError): - nameable_string('thing,thing') - - def test_underscore_error(self): - with self.assertRaises(TypeError): - nameable_string('thing_thing') - - def test_space_error(self): - with self.assertRaises(TypeError): - nameable_string(' ') - - with self.assertRaises(TypeError): - nameable_string('thing thing') - - def test_ok(self): - lib = 'libName' - self.assertEqual('libName', nameable_string(lib)) - - def test_number_start_error(self): - with self.assertRaises(TypeError): - nameable_string('1thing') - - with self.assertRaises(TypeError): - nameable_string('1') - - def test_empty_error(self): - with self.assertRaises(TypeError): - nameable_string('') - - def test_none_error(self): - with self.assertRaises(TypeError): - nameable_string('none') - - with self.assertRaises(TypeError): - nameable_string(None) diff --git a/tests/util.py b/tests/util.py index 089985bf..55db2d11 100644 --- a/tests/util.py +++ b/tests/util.py @@ -1,7 +1,31 @@ +import glob import os DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') +def package_relative_file(*paths): + return os.path.abspath(os.path.join(os.path.dirname(__file__), '..', *paths)) + + def get_data(*paths): return 
os.path.join(DATA_DIR, *paths) + + +def glob_exists(*pos, strict=False, n=1): + globexpr = os.path.join(*pos) + file_list = glob.glob(globexpr) + if strict and len(file_list) == n: + return file_list[0] if len(file_list) == 1 else file_list + elif not strict and len(file_list) > 0: + return file_list + else: + print(globexpr) + print(file_list) + return False + + +def glob_not_exists(*pos): + globexpr = os.path.join(*pos) + file_list = glob.glob(globexpr) + return not file_list From 57e0254eba0b2fcdf74e5513f4fca38049d14289 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 19 Apr 2021 10:31:07 -0700 Subject: [PATCH 002/137] Include branch coverage --- .github/workflows/build.yml | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1601aeba..ded37cdc 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -45,14 +45,26 @@ jobs: - name: run short tests with pytest run: | export PATH=$PATH:$(pwd):$(pwd)/bwa - pytest tests -v --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov mavis --cov-report term --cov-report xml --durations=10 + pytest tests -v \ + --junitxml=junit/test-results-${{ matrix.python-version }}.xml \ + --cov mavis \ + --cov-report term-missing \ + --cov-report xml \ + --durations=10 \ + --cov-branch env: RUN_FULL: 0 if: github.event_name != 'pull_request' - name: run full tests with pytest run: | export PATH=$PATH:$(pwd):$(pwd)/bwa - pytest tests -v --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov mavis --cov-report term --cov-report xml --durations=10 + pytest tests -v \ + --junitxml=junit/test-results-${{ matrix.python-version }}.xml \ + --cov mavis \ + --cov-report term-missing \ + --cov-report xml \ + --durations=10 \ + --cov-branch env: RUN_FULL: 1 if: github.event_name == 'pull_request' @@ -72,4 +84,4 @@ jobs: env_vars: OS,PYTHON name: codecov-umbrella fail_ci_if_error: true - if: 
matrix.python-version == 3.8 + if: matrix.python-version == 3.7 && github.event_name == 'pull_request' From d99112a9907bc9ab9fb02dcac2259caf5f3b16ce Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 19 Apr 2021 11:01:46 -0700 Subject: [PATCH 003/137] Increase test timeout for slow tests due to increased coverage metrics --- tests/integration/test_assemble.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_assemble.py b/tests/integration/test_assemble.py index f206ca5e..e732cfbc 100644 --- a/tests/integration/test_assemble.py +++ b/tests/integration/test_assemble.py @@ -3,15 +3,14 @@ import unittest import timeout_decorator - from mavis.assemble import Contig, assemble, filter_contigs -from mavis.interval import Interval from mavis.constants import reverse_complement -from mavis.validate.constants import DEFAULTS +from mavis.interval import Interval from mavis.util import LOG +from mavis.validate.constants import DEFAULTS -from . import MockObject, RUN_FULL from ..util import get_data +from . 
import RUN_FULL, MockObject class TestFilterContigs(unittest.TestCase): @@ -629,7 +628,7 @@ def test_assemble_short_contig(self): print(len(contig.seq), contig.remap_score(), contig.seq) self.assertTrue({target, reverse_complement(target)} & {c.seq for c in contigs}) - @timeout_decorator.timeout(60) + @timeout_decorator.timeout(120) @unittest.skipIf( not RUN_FULL, 'slower tests will not be run unless the environment variable RUN_FULL is given', From 770c29832a6a67d92be651b66bfebf43819a6be3 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 19 Apr 2021 15:09:35 -0700 Subject: [PATCH 004/137] Add type annotations --- mavis/align.py | 13 +- mavis/annotate/constants.py | 42 ------ mavis/annotate/file_io.py | 46 +----- mavis/annotate/main.py | 5 +- mavis/breakpoint.py | 94 ++++++------ mavis/cluster/constants.py | 46 ------ mavis/config.py | 16 +-- mavis/illustrate/constants.py | 68 +-------- mavis/illustrate/scatter.py | 8 +- mavis/interval.py | 9 +- mavis/main.py | 1 - mavis/pairing/constants.py | 46 +----- mavis/pairing/pairing.py | 3 +- mavis/schemas/config.json | 2 +- mavis/summary/constants.py | 55 +------ mavis/tools/__init__.py | 48 ++++--- mavis/tools/chimerascan.py | 7 +- mavis/tools/vcf.py | 6 +- mavis/util.py | 16 +-- mavis/validate/base.py | 210 ++++++++++++++++++--------- mavis/validate/call.py | 13 +- mavis/validate/constants.py | 262 ---------------------------------- mavis/validate/evidence.py | 102 +++++++------ mavis/validate/main.py | 10 +- 24 files changed, 349 insertions(+), 779 deletions(-) delete mode 100644 mavis/cluster/constants.py diff --git a/mavis/align.py b/mavis/align.py index 8fccc613..cdf0a287 100644 --- a/mavis/align.py +++ b/mavis/align.py @@ -1,33 +1,32 @@ """ Should take in a sam file from a aligner like bwa aln or bwa mem and convert it into a """ -from copy import copy import itertools import os import re import subprocess import warnings +from copy import copy import pysam from .bam import cigar as _cigar from .bam import 
read as _read -from .breakpoint import BreakpointPair, Breakpoint +from .breakpoint import Breakpoint, BreakpointPair from .constants import ( CIGAR, COLUMNS, - MavisNamespace, + NA_MAPPING_QUALITY, ORIENT, - reverse_complement, STRAND, SVTYPE, - NA_MAPPING_QUALITY, + MavisNamespace, + reverse_complement, ) from .error import InvalidRearrangement from .interval import Interval from .util import DEVNULL - SUPPORTED_ALIGNER = MavisNamespace( BWA_MEM='bwa mem', BLAT='blat', __name__='mavis.align.SUPPORTED_ALIGNER' ) @@ -229,9 +228,9 @@ def convert_to_duplication(alignment, reference_genome): ), untemplated_seq=alignment.untemplated_seq[dup_len:], opposing_strands=alignment.opposing_strands, - data=alignment.data, read1=alignment.read1, read2=alignment.read2, + **alignment.data ) return result return alignment diff --git a/mavis/annotate/constants.py b/mavis/annotate/constants.py index a7645efd..d2fbf5c2 100644 --- a/mavis/annotate/constants.py +++ b/mavis/annotate/constants.py @@ -5,50 +5,8 @@ from ..constants import MavisNamespace, float_fraction from ..util import WeakMavisNamespace - PASS_FILENAME = 'annotations.tab' -DEFAULTS = WeakMavisNamespace() -""" -- [annotation_filters](/configuration/settings/#annotation_filters) -- [max_orf_cap](/configuration/settings/#max_orf_cap) -- [min_domain_mapping_match](/configuration/settings/#min_domain_mapping_match) -- [min_orf_size](/configuration/settings/#min_orf_size) -""" -DEFAULTS.add( - 'min_domain_mapping_match', - 0.9, - cast_type=float_fraction, - defn='a number between 0 and 1 representing the minimum percent match a domain must map to the fusion transcript ' - 'to be displayed', -) -DEFAULTS.add( - 'min_orf_size', - 300, - defn='the minimum length (in base pairs) to retain a putative open reading frame (ORF)', -) -DEFAULTS.add( - 'max_orf_cap', - 3, - defn='the maximum number of ORFs to return (best putative ORFs will be retained)', -) -DEFAULTS.add( - 'annotation_filters', - 
'choose_more_annotated,choose_transcripts_by_priority', - defn='a comma separated list of filters to apply to putative annotations', -) -DEFAULTS.add( - 'draw_fusions_only', - True, - cast_type=tab.cast_boolean, - defn='flag to indicate if events which do not produce a fusion transcript should produce illustrations', -) -DEFAULTS.add( - 'draw_non_synonymous_cdna_only', - True, - cast_type=tab.cast_boolean, - defn='flag to indicate if events which are synonymous at the cdna level should produce illustrations', -) SPLICE_TYPE = MavisNamespace( RETAIN='retained intron', diff --git a/mavis/annotate/file_io.py b/mavis/annotate/file_io.py index 2683806f..ed4f45cb 100644 --- a/mavis/annotate/file_io.py +++ b/mavis/annotate/file_io.py @@ -11,55 +11,11 @@ from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, STRAND, translate from ..interval import Interval -from ..util import DEVNULL, LOG, WeakMavisNamespace, filepath +from ..util import DEVNULL, LOG, filepath from .base import BioInterval, ReferenceName from .genomic import Exon, Gene, PreTranscript, Template, Transcript from .protein import Domain, Translation -REFERENCE_DEFAULTS = WeakMavisNamespace() -REFERENCE_DEFAULTS.add( - 'template_metadata', - [], - cast_type=filepath, - listable=True, - defn='file containing the cytoband template information. 
Used for illustrations only', -) -REFERENCE_DEFAULTS.add( - 'masking', - [], - cast_type=filepath, - listable=True, - defn='file containing regions for which input events overlapping them are dropped prior to validation', -) -REFERENCE_DEFAULTS.add( - 'annotations', - [], - cast_type=filepath, - listable=True, - defn='path to the reference annotations of genes, transcript, exons, domains, etc', -) -REFERENCE_DEFAULTS.add( - 'aligner_reference', - None, - cast_type=filepath, - nullable=True, - defn='path to the aligner reference file used for aligning the contig sequences', -) -REFERENCE_DEFAULTS.add( - 'dgv_annotation', - [], - cast_type=filepath, - listable=True, - defn='Path to the dgv reference processed to look like the cytoband file.', -) -REFERENCE_DEFAULTS.add( - 'reference_genome', - [], - cast_type=filepath, - listable=True, - defn='Path to the human reference genome fasta file', -) - def load_masking_regions(*filepaths): """ diff --git a/mavis/annotate/main.py b/mavis/annotate/main.py index 61c7fd57..ffd54bd7 100644 --- a/mavis/annotate/main.py +++ b/mavis/annotate/main.py @@ -6,9 +6,9 @@ from ..constants import COLUMNS, PRIME, PROTOCOL, sort_columns from ..error import DrawingFitError, NotSpecifiedError -from ..illustrate.constants import DEFAULTS as ILLUSTRATION_DEFAULTS from ..illustrate.constants import DiagramSettings from ..illustrate.diagram import draw_sv_summary_diagram +from ..schemas import DEFAULTS from ..util import LOG, generate_complete_stamp, mkdirp, read_inputs from .constants import PASS_FILENAME from .file_io import ReferenceFile @@ -167,8 +167,9 @@ def main( ) # now try generating the svg + illustration_defaults = get_by_prefix(DEFAULTS, 'illustrate.') drawing_config = DiagramSettings( - **{k: v for k, v in kwargs.items() if k in ILLUSTRATION_DEFAULTS} + **{k: v for k, v in kwargs.items() if k in illustration_defaults} ) header_req = { diff --git a/mavis/breakpoint.py b/mavis/breakpoint.py index 63384d4d..2f903cd6 100644 --- 
a/mavis/breakpoint.py +++ b/mavis/breakpoint.py @@ -1,7 +1,9 @@ from __future__ import division + from copy import copy as _copy +from typing import Callable, Dict, List, Optional, Set, Tuple -from .constants import CIGAR, COLUMNS, DNA_ALPHABET, ORIENT, reverse_complement, STRAND, SVTYPE +from .constants import CIGAR, COLUMNS, DNA_ALPHABET, ORIENT, STRAND, SVTYPE, reverse_complement from .error import InvalidRearrangement, NotSpecifiedError from .interval import Interval @@ -12,6 +14,11 @@ class for storing information about a SV breakpoint coordinates are given as 1-indexed """ + orient: str + chr: str + strand: str + seq: str + @property def key(self): return (self.chr, self.start, self.end, self.orient, self.strand) @@ -73,18 +80,23 @@ def to_dict(self): class BreakpointPair: - """""" - - def __getattr__(self, attr): - data = object.__getattribute__(self, 'data') - try: - return data[COLUMNS[attr]] - except (KeyError, AttributeError): - try: - return data[attr] - except KeyError: - pass - raise AttributeError(attr) + break1: Breakpoint + break2: Breakpoint + stranded: bool + opposing_strands: bool + untemplated_seq: Optional[str] + data: Dict + + # def __getattr__(self, attr): + # data = object.__getattribute__(self, 'data') + # try: + # return data[COLUMNS[attr]] + # except (KeyError, AttributeError): + # try: + # return data[attr] + # except KeyError: + # pass + # raise AttributeError(attr) def __getitem__(self, index): try: @@ -128,26 +140,26 @@ def __lt__(self, other): return self.untemplated_seq < other.untemplated_seq @property - def interchromosomal(self): + def interchromosomal(self) -> bool: """bool: True if the breakpoints are on different chromosomes, False otherwise""" if self.break1.chr == self.break2.chr: return False return True @property - def LL(self): + def LL(self) -> bool: return self.break1.orient == ORIENT.LEFT and self.break2.orient == ORIENT.LEFT @property - def LR(self): + def LR(self) -> bool: return self.break1.orient == ORIENT.LEFT 
and self.break2.orient == ORIENT.RIGHT @property - def RL(self): + def RL(self) -> bool: return self.break1.orient == ORIENT.RIGHT and self.break2.orient == ORIENT.LEFT @property - def RR(self): + def RR(self) -> bool: return self.break1.orient == ORIENT.RIGHT and self.break2.orient == ORIENT.RIGHT def copy(self): @@ -160,22 +172,21 @@ def copy(self): def __init__( self, - b1, - b2, - stranded=False, - opposing_strands=None, - untemplated_seq=None, - data=None, + b1: Breakpoint, + b2: Breakpoint, + stranded: bool = False, + opposing_strands: Optional[bool] = None, + untemplated_seq: Optional[str] = None, **kwargs ): """ Args: - b1 (Breakpoint): the first breakpoint - b2 (Breakpoint): the second breakpoint - stranded (bool): if not stranded then +/- is equivalent to -/+ - opposing_strands (bool): are the strands at the breakpoint opposite? i.e. +/- instead of +/+ - untemplated_seq (str): seq between the breakpoints that is not part of either breakpoint - data (dict): optional dictionary of attributes associated with this pair + b1: the first breakpoint + b2: the second breakpoint + stranded: if not stranded then +/- is equivalent to -/+ + opposing_strands: are the strands at the breakpoint opposite? i.e. 
+/- instead of +/+ + untemplated_seq: seq between the breakpoints that is not part of either breakpoint + data: optional dictionary of attributes associated with this pair Note: untemplated_seq should always be given wrt to the positive/forward reference strand @@ -192,7 +203,7 @@ def __init__( self.break1 = b1 self.break2 = b2 self.stranded = stranded - self.opposing_strands = opposing_strands + self.opposing_strands = opposing_strands # type: ignore if self.break1.orient != ORIENT.NS and self.break2.orient != ORIENT.NS: if self.opposing_strands is not None: @@ -209,13 +220,7 @@ def __init__( self.opposing_strands = self.break1.orient == self.break2.orient # between break1 and break2 not in either self.untemplated_seq = untemplated_seq - self.data = {} - if data is not None: - self.data.update(data) - conflicts = set(data.keys()) & set(kwargs.keys()) - if conflicts: - raise TypeError('data got multiple values for data elements:', conflicts) - self.data.update(kwargs) + self.data = kwargs if self.break1.strand != STRAND.NS and self.break2.strand != STRAND.NS: opposing = self.break1.strand != self.break2.strand @@ -275,16 +280,16 @@ def flatten(self): return row @classmethod - def classify(cls, pair, distance=None): + def classify(cls, pair, distance: Optional[Callable] = None) -> Set[str]: """ uses the chr, orientations and strands to determine the possible structural_variant types that this pair could support Args: pair (BreakpointPair): the pair to classify - distance (Callable): if defined, will be passed to net size to use in narrowing the list of putative types (del vs ins) + distance: if defined, will be passed to net size to use in narrowing the list of putative types (del vs ins) Returns: - List[SVTYPE]: a list of possible SVTYPE + a list of possible SVTYPE Example: >>> bpp = BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True) @@ -327,6 +332,7 @@ def classify(cls, pair, distance=None): return {SVTYPE.DEL, SVTYPE.INS} elif 
pair.break1.orient == ORIENT.RIGHT or pair.break2.orient == ORIENT.LEFT: return {SVTYPE.DUP} + raise InvalidRearrangement(pair) else: # interchromosomal if pair.opposing_strands: if pair.LR or pair.RL: @@ -337,7 +343,7 @@ def classify(cls, pair, distance=None): raise InvalidRearrangement(pair) return {SVTYPE.TRANS} - def net_size(self, distance=lambda x, y: Interval(abs(x - y))): + def net_size(self, distance=lambda x, y: Interval(abs(x - y))) -> Interval: """ Returns the size of the event for a given pair. Mainly applicable to indels """ @@ -358,7 +364,7 @@ def net_size(self, distance=lambda x, y: Interval(abs(x - y))): return size @property - def is_putative_indel(self): + def is_putative_indel(self) -> bool: if self.interchromosomal or self.opposing_strands or self.break1.orient == ORIENT.RIGHT: return False return True @@ -521,7 +527,7 @@ def untemplated_shift(self, reference_genome): ) return (break2_shift, break1_shift) - def get_bed_repesentation(self): + def get_bed_repesentation(self) -> List[Tuple[str, int, int, Optional[str]]]: bed = [] if self.interchromosomal: bed.append( diff --git a/mavis/cluster/constants.py b/mavis/cluster/constants.py deleted file mode 100644 index 107100bf..00000000 --- a/mavis/cluster/constants.py +++ /dev/null @@ -1,46 +0,0 @@ -from ..util import WeakMavisNamespace - - -DEFAULTS = WeakMavisNamespace() -""" -- [cluster_initial_size_limit](/configuration/settings/#cluster_initial_size_limit) -- [cluster_radius](/configuration/settings/#cluster_radius) -- [limit_to_chr](/configuration/settings/#limit_to_chr) -- [max_files](/configuration/settings/#max_files) -- [max_proximity](/configuration/settings/#max_proximity) -- [min_clusters_per_file](/configuration/settings/#min_clusters_per_file) -- [uninformative_filter](/configuration/settings/#uninformative_filter) -""" -DEFAULTS.add( - 'min_clusters_per_file', 50, defn='the minimum number of breakpoint pairs to output to a file' -) -DEFAULTS.add( - 'max_files', 200, defn='The maximum 
number of files to output from clustering/splitting' -) -DEFAULTS.add( - 'cluster_initial_size_limit', - 25, - defn='the maximum cumulative size of both breakpoints for breakpoint pairs to be used in the initial clustering ' - 'phase (combining based on overlap)', -) -DEFAULTS.add('cluster_radius', 100, defn='maximum distance allowed between paired breakpoint pairs') -DEFAULTS.add( - 'max_proximity', - 5000, - defn='the maximum distance away from an annotation before the region in considered to be uninformative', -) -DEFAULTS.add( - 'uninformative_filter', - False, - defn='flag that determines if breakpoint pairs which are not within max_proximity to any annotations are filtered ' - 'out prior to clustering', -) -DEFAULTS.add( - 'limit_to_chr', - [str(x) for x in range(1, 23)] + ['X', 'Y'], - cast_type=str, - listable=True, - nullable=True, - defn='A list of chromosome names to use. BreakpointPairs on other chromosomes will be filtered' - 'out. For example \'1 2 3 4\' would filter out events/breakpoint pairs on any chromosomes but 1, 2, 3, and 4', -) diff --git a/mavis/config.py b/mavis/config.py index f38483bb..480eb301 100644 --- a/mavis/config.py +++ b/mavis/config.py @@ -5,6 +5,8 @@ import snakemake import tab +from snakemake.exceptions import WorkflowError +from snakemake.utils import validate as snakemake_validate from .annotate.file_io import ReferenceFile from .bam import stats @@ -124,14 +126,16 @@ def validate_config(config: Dict, bam_stats: Optional[bool] = False, stage: str schema = 'config' if stage != SUBCOMMAND.OVERLAY else 'overlay' try: - snakemake.utils.validate( - config, os.path.join(os.path.dirname(__file__), f'schemas/{schema}.json') + snakemake_validate( + config, + os.path.join(os.path.dirname(__file__), f'schemas/{schema}.json'), + set_default=True, ) except Exception as err: short_msg = '. 
'.join( [line for line in str(err).split('\n') if line.strip()][:3] ) # these can get super long - raise snakemake.WorkflowError(short_msg) + raise WorkflowError(short_msg) required = [] if ( @@ -146,7 +150,7 @@ def validate_config(config: Dict, bam_stats: Optional[bool] = False, stage: str for req in required: if req not in config: - raise snakemake.WorkflowError(f'missing required property: {req}') + raise WorkflowError(f'missing required property: {req}') if schema == 'config': conversion_dir = os.path.join(config['output_dir'], 'converted_outputs') @@ -218,7 +222,3 @@ def get_metavar(arg_type): elif arg_type == filepath: return 'FILEPATH' return None - - -def get_by_prefix(config, prefix): - return {k.replace(prefix, ''): v for k, v in config.items() if k.startswith(prefix)} diff --git a/mavis/illustrate/constants.py b/mavis/illustrate/constants.py index 59e60121..670e5364 100644 --- a/mavis/illustrate/constants.py +++ b/mavis/illustrate/constants.py @@ -1,64 +1,7 @@ from colour import Color -from ..constants import GIEMSA_STAIN, float_fraction -from ..util import WeakMavisNamespace - -DEFAULTS = WeakMavisNamespace() -""" -- [breakpoint_color](/configuration/settings/#breakpoint_color) -- [domain_color](/configuration/settings/#domain_color) -- [domain_mismatch_color](/configuration/settings/#domain_mismatch_color) -- [domain_name_regex_filter](/configuration/settings/#domain_name_regex_filter) -- [domain_scaffold_color](/configuration/settings/#domain_scaffold_color) -- [drawing_width_iter_increase](/configuration/settings/#drawing_width_iter_increase) -- [gene1_color_selected](/configuration/settings/#gene1_color_selected) -- [gene1_color](/configuration/settings/#gene1_color) -- [gene2_color_selected](/configuration/settings/#gene2_color_selected) -- [gene2_color](/configuration/settings/#gene2_color) -- [label_color](/configuration/settings/#label_color) -- [mask_fill](/configuration/settings/#mask_fill) -- 
[mask_opacity](/configuration/settings/#mask_opacity) -- [max_drawing_retries](/configuration/settings/#max_drawing_retries) -- [novel_exon_color](/configuration/settings/#novel_exon_color) -- [scaffold_color](/configuration/settings/#scaffold_color) -- [splice_color](/configuration/settings/#splice_color) -- [width](/configuration/settings/#width) -""" -DEFAULTS.add('width', 1000, defn='The drawing width in pixels') -DEFAULTS.add( - 'domain_name_regex_filter', - r'^PF\d+$', - defn='The regular expression used to select domains to be displayed (filtered by name)', -) -DEFAULTS.add( - 'max_drawing_retries', - 5, - defn='The maximum number of retries for attempting a drawing. Each iteration the width is extended. If it ' - 'is still insufficient after this number a gene-level only drawing will be output', -) -DEFAULTS.add('scaffold_color', '#000000', defn='The color used for the gene/transcripts scaffolds') -DEFAULTS.add('gene1_color_selected', '#518dc5', defn='The color of the first gene') -DEFAULTS.add('gene2_color_selected', '#4c9677', defn='The color of the second gene') -DEFAULTS.add('gene1_color', '#657e91', defn='The color of genes near the first gene') -DEFAULTS.add('gene2_color', '#325556', defn='The color of genes near the second gene') -DEFAULTS.add('label_color', '#000000', defn='The label color') -DEFAULTS.add('domain_color', '#ccccb3', defn='Domain fill color') -DEFAULTS.add('domain_mismatch_color', '#b2182b', defn='Domain fill color on 0%% match') -DEFAULTS.add('novel_exon_color', '#5D3F6A', defn='Novel Exon fill color') -DEFAULTS.add('splice_color', '#000000', defn='Splicing lines color') -DEFAULTS.add('breakpoint_color', '#000000', defn='Breakpoint outline color') -DEFAULTS.add('mask_fill', '#ffffff', defn='Color of mask (for deleted region etc.)') -DEFAULTS.add('mask_opacity', 0.7, defn='opacity of the mask layer', cast_type=float_fraction) -DEFAULTS.add('domain_scaffold_color', '#000000', defn='The color of the domain scaffold') -DEFAULTS.add( - 
'drawing_width_iter_increase', - 500, - defn='The amount (in pixels) by which to increase the drawing width upon failure to fit', -) -DEFAULTS.add( - 'exon_min_focus_size', - 10, - defn='minimum size of an exon for it to be granted a label or min exon width', -) + +from ..constants import GIEMSA_STAIN +from ..schemas import DEFAULTS, get_by_prefix class DiagramSettings: @@ -68,10 +11,11 @@ class DiagramSettings: def __init__(self, **kwargs): inputs = {} - inputs.update(DEFAULTS.items()) + defaults = get_by_prefix(DEFAULTS, 'illustrate.') + inputs.update(defaults) inputs.update(kwargs) for arg, val in inputs.items(): - if arg not in DEFAULTS: + if arg not in defaults: raise KeyError('unrecognized argument', arg) setattr(self, arg, val) self.min_width = 10 # no element (exon, gene, etc can be less than this wide) diff --git a/mavis/illustrate/scatter.py b/mavis/illustrate/scatter.py index 5e0078df..976a6a85 100644 --- a/mavis/illustrate/scatter.py +++ b/mavis/illustrate/scatter.py @@ -1,9 +1,8 @@ import os -from ..bam.read import sequenced_strand, pileup -from ..util import LOG, DEVNULL +from ..bam.read import pileup, sequenced_strand from ..interval import Interval -from ..validate.constants import DEFAULTS as VALIDATION_DEFAULTS +from ..util import DEVNULL, LOG def bam_to_scatter( @@ -16,6 +15,7 @@ def bam_to_scatter( axis_name=None, ymax=None, min_mapping_quality=0, + strand_determining_read=2, ymax_color='#FF0000', ): """ @@ -50,7 +50,7 @@ def read_filter(read): if strand is None: return False try: - return sequenced_strand(read, VALIDATION_DEFAULTS.strand_determining_read) != strand + return sequenced_strand(read, strand_determining_read) != strand except ValueError: return True diff --git a/mavis/interval.py b/mavis/interval.py index 941ee707..c78e3aa8 100644 --- a/mavis/interval.py +++ b/mavis/interval.py @@ -1,7 +1,12 @@ +from typing import Callable, Optional + + class Interval: - """""" + start: int + end: int + freq: int = 1 - def __init__(self, start, 
end=None, freq=1, number_type=None): + def __init__(self, start: int, end: Optional[int] = None, freq: int = 1, number_type=None): """ Args: start (int): the start of the interval (inclusive) diff --git a/mavis/main.py b/mavis/main.py index 9c6d678c..dfa9f127 100644 --- a/mavis/main.py +++ b/mavis/main.py @@ -269,7 +269,6 @@ def main(argv=None): with open(args.outputfile, 'w') as fh: fh.write(json.dumps(config, sort_keys=True, indent=' ')) else: - print(args) overlay_main( buffer_length=args.buffer_length, gene_name=args.gene_name, diff --git a/mavis/pairing/constants.py b/mavis/pairing/constants.py index 73c9d8ca..eebeffec 100644 --- a/mavis/pairing/constants.py +++ b/mavis/pairing/constants.py @@ -1,46 +1,12 @@ from ..constants import CALL_METHOD, MavisNamespace -from ..util import WeakMavisNamespace - - -DEFAULTS = WeakMavisNamespace() -""" -- [contig_call_distance](/configuration/settings/#contig_call_distance) -- [flanking_call_distance](/configuration/settings/#flanking_call_distance) -- [spanning_call_distance](/configuration/settings/#spanning_call_distance) -- [split_call_distance](/configuration/settings/#split_call_distance) -""" -DEFAULTS.add( - 'flanking_call_distance', - 50, - defn='the maximum distance allowed between breakpoint pairs (called by flanking pairs) in order for them to pair', -) -DEFAULTS.add( - 'split_call_distance', - 20, - defn='the maximum distance allowed between breakpoint pairs (called by split reads) in order for them to pair', -) -DEFAULTS.add( - 'contig_call_distance', - 10, - defn='the maximum distance allowed between breakpoint pairs (called by contig) in order for them to pair', -) -DEFAULTS.add( - 'spanning_call_distance', - 20, - defn='the maximum distance allowed between breakpoint pairs (called by spanning reads) in order for them to pair', -) -DEFAULTS.add( - 'input_call_distance', - 20, - defn='the maximum distance allowed between breakpoint pairs (called by input tools, not validated) in order for them to pair', -) 
+from ..schemas import DEFAULTS PAIRING_DISTANCES = MavisNamespace( **{ - CALL_METHOD.FLANK: DEFAULTS.flanking_call_distance, - CALL_METHOD.SPAN: DEFAULTS.spanning_call_distance, - CALL_METHOD.SPLIT: DEFAULTS.split_call_distance, - CALL_METHOD.CONTIG: DEFAULTS.contig_call_distance, - CALL_METHOD.INPUT: DEFAULTS.input_call_distance, + CALL_METHOD.FLANK: DEFAULTS['pairing.flanking_call_distance'], + CALL_METHOD.SPAN: DEFAULTS['pairing.spanning_call_distance'], + CALL_METHOD.SPLIT: DEFAULTS['pairing.split_call_distance'], + CALL_METHOD.CONTIG: DEFAULTS['pairing.contig_call_distance'], + CALL_METHOD.INPUT: DEFAULTS['pairing.input_call_distance'], } ) diff --git a/mavis/pairing/pairing.py b/mavis/pairing/pairing.py index e94bde37..316598c2 100644 --- a/mavis/pairing/pairing.py +++ b/mavis/pairing/pairing.py @@ -1,11 +1,10 @@ -from .constants import DEFAULTS, PAIRING_DISTANCES - from ..annotate.variant import determine_prime from ..breakpoint import Breakpoint from ..constants import CALL_METHOD, COLUMNS, ORIENT, PRIME, PROTOCOL, STRAND from ..error import NotSpecifiedError from ..interval import Interval from ..util import DEVNULL +from .constants import PAIRING_DISTANCES def product_key(bpp): diff --git a/mavis/schemas/config.json b/mavis/schemas/config.json index c50a8632..9769754c 100644 --- a/mavis/schemas/config.json +++ b/mavis/schemas/config.json @@ -287,7 +287,7 @@ "description": "The drawing width in pixels", "type": "integer" }, - "illustratebreakpoint_color": { + "illustrate.breakpoint_color": { "default": "#000000", "description": "Breakpoint outline color", "type": "string" diff --git a/mavis/summary/constants.py b/mavis/summary/constants.py index 4d087d7a..27de3695 100644 --- a/mavis/summary/constants.py +++ b/mavis/summary/constants.py @@ -1,60 +1,7 @@ -from ..constants import MavisNamespace, float_fraction -from ..util import WeakMavisNamespace +from ..constants import MavisNamespace - -DEFAULTS = WeakMavisNamespace() HOMOPOLYMER_MIN_LENGTH = 3 -""" -- 
[filter_cdna_synon](/configuration/settings/#filter_cdna_synon) -- [filter_min_flanking_reads](/configuration/settings/#filter_min_flanking_reads) -- [filter_min_linking_split_reads](/configuration/settings/#filter_min_linking_split_reads) -- [filter_min_remapped_reads](/configuration/settings/#filter_min_remapped_reads) -- [filter_min_spanning_reads](/configuration/settings/#filter_min_spanning_reads) -- [filter_min_split_reads](/configuration/settings/#filter_min_split_reads) -- [filter_protein_synon](/configuration/settings/#filter_protein_synon) -- [filter_min_complexity](/configuration/settings/#filter_min_complexity) -- [filter_trans_homopolymers](/configuration/settings/#filter_trans_homopolymers) -""" -DEFAULTS.add( - 'filter_min_remapped_reads', 5, defn='Minimum number of remapped reads for a call by contig' -) -DEFAULTS.add( - 'filter_min_spanning_reads', - 5, - defn='Minimum number of spanning reads for a call by spanning reads', -) -DEFAULTS.add( - 'filter_min_flanking_reads', - 10, - defn='Minimum number of flanking pairs for a call by flanking pairs', -) -DEFAULTS.add( - 'filter_min_split_reads', 5, defn='Minimum number of split reads for a call by split reads' -) -DEFAULTS.add( - 'filter_min_linking_split_reads', - 1, - defn='Minimum number of linking split reads for a call by split reads', -) -DEFAULTS.add('filter_cdna_synon', True, defn='Filter all annotations synonymous at the cdna level') -DEFAULTS.add( - 'filter_protein_synon', False, defn='Filter all annotations synonymous at the protein level' -) -DEFAULTS.add( - 'filter_trans_homopolymers', - True, - defn='Filter all single bp ins/del/dup events that are in a homopolymer region of at least ' - '{} bps and are not paired to a genomic event'.format(HOMOPOLYMER_MIN_LENGTH), -) -DEFAULTS.add( - 'filter_min_complexity', - 0.2, - cast_type=float_fraction, - defn='Filter event calls based on call sequence complexity', -) - - PAIRING_STATE = MavisNamespace( EXP='expressed', NO_EXP='not expressed', 
diff --git a/mavis/tools/__init__.py b/mavis/tools/__init__.py index 94719b81..307f4af5 100644 --- a/mavis/tools/__init__.py +++ b/mavis/tools/__init__.py @@ -1,30 +1,30 @@ import itertools +from typing import Callable, Dict, List -from shortuuid import uuid import tab +from shortuuid import uuid from ..breakpoint import Breakpoint, BreakpointPair from ..constants import COLUMNS, ORIENT, STRAND, SVTYPE from ..error import InvalidRearrangement from ..util import DEVNULL, read_bpp_from_input_file - -from .constants import SUPPORTED_TOOL, TRACKING_COLUMN, TOOL_SVTYPE_MAPPING -from .transabyss import convert_row as _parse_transabyss -from .cnvnator import convert_row as _parse_cnvnator -from .vcf import convert_file as read_vcf from .breakdancer import convert_file as _convert_breakdancer_file -from .starfusion import convert_row as _parse_starfusion from .chimerascan import convert_row as _parse_chimerascan +from .cnvnator import convert_row as _parse_cnvnator +from .constants import SUPPORTED_TOOL, TOOL_SVTYPE_MAPPING, TRACKING_COLUMN +from .starfusion import convert_row as _parse_starfusion +from .transabyss import convert_row as _parse_transabyss +from .vcf import convert_file as read_vcf def convert_tool_output( - fnames, - file_type=SUPPORTED_TOOL.MAVIS, - stranded=False, - log=DEVNULL, - collapse=True, - assume_no_untemplated=True, -): + fnames: List[str], + file_type: str = SUPPORTED_TOOL.MAVIS, + stranded: bool = False, + log: Callable = DEVNULL, + collapse: bool = True, + assume_no_untemplated: bool = True, +) -> List[BreakpointPair]: """ Reads output from a given SV caller and converts to a set of MAVIS breakpoint pairs. 
Also collapses duplicates """ @@ -36,7 +36,7 @@ def convert_tool_output( ) ) if collapse: - collapse_mapping = {} + collapse_mapping: Dict[BreakpointPair, List[BreakpointPair]] = {} for bpp in result: collapse_mapping.setdefault(bpp, []).append(bpp) log('collapsed', len(result), 'to', len(collapse_mapping), 'calls') @@ -62,7 +62,9 @@ def convert_tool_output( return result -def _convert_tool_row(row, file_type, stranded, assume_no_untemplated=True): +def _convert_tool_row( + row: Dict, file_type: str, stranded: bool, assume_no_untemplated: bool = True +) -> List[BreakpointPair]: """ converts a row parsed from an input file to the appropriate column names for it to be converted to MAVIS style row """ @@ -214,8 +216,8 @@ def _convert_tool_row(row, file_type, stranded, assume_no_untemplated=True): opposing_strands=oppose, untemplated_seq=untemplated_seq, event_type=event_type, - data={COLUMNS.tools: file_type, COLUMNS.tracking_id: std_row[COLUMNS.tracking_id]}, stranded=stranded, + **{COLUMNS.tools: file_type, COLUMNS.tracking_id: std_row[COLUMNS.tracking_id]} ) for col, value in std_row.items(): @@ -237,12 +239,12 @@ def _convert_tool_row(row, file_type, stranded, assume_no_untemplated=True): def _convert_tool_output( - input_file, - file_type=SUPPORTED_TOOL.MAVIS, - stranded=False, - log=DEVNULL, - assume_no_untemplated=True, -): + input_file: str, + file_type: str = SUPPORTED_TOOL.MAVIS, + stranded: bool = False, + log: Callable = DEVNULL, + assume_no_untemplated: bool = True, +) -> List[BreakpointPair]: log('reading:', input_file) result = [] rows = None diff --git a/mavis/tools/chimerascan.py b/mavis/tools/chimerascan.py index fa71a8cb..1d4be17b 100644 --- a/mavis/tools/chimerascan.py +++ b/mavis/tools/chimerascan.py @@ -1,9 +1,10 @@ -from ..constants import COLUMNS, ORIENT +from typing import Dict -from .constants import TRACKING_COLUMN, SUPPORTED_TOOL +from ..constants import COLUMNS, ORIENT +from .constants import SUPPORTED_TOOL, TRACKING_COLUMN -def 
convert_row(row): +def convert_row(row: Dict) -> Dict: """ transforms the chimerscan output into the common format for expansion. Maps the input column names to column names which MAVIS can read diff --git a/mavis/tools/vcf.py b/mavis/tools/vcf.py index 5a5ec4e7..4ffa1e2b 100644 --- a/mavis/tools/vcf.py +++ b/mavis/tools/vcf.py @@ -1,14 +1,14 @@ import re +from typing import Dict, List, Tuple from pysam import VariantFile from ..constants import COLUMNS, ORIENT, SVTYPE from ..util import DEVNULL - from .constants import SUPPORTED_TOOL -def parse_bnd_alt(alt): +def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: """ parses the alt statement from vcf files using the specification in vcf 4.2/4.2. @@ -74,7 +74,7 @@ def parse_bnd_alt(alt): raise NotImplementedError('alt specification in unexpected format', alt) -def convert_record(record, record_mapping={}, log=DEVNULL): +def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: """ converts a vcf record diff --git a/mavis/util.py b/mavis/util.py index dee6b1e9..db673fa2 100644 --- a/mavis/util.py +++ b/mavis/util.py @@ -1,21 +1,21 @@ -from argparse import Namespace -from datetime import datetime import errno -from functools import partial -from glob import glob import itertools +import logging import os import re -import time -import logging import sys +import time +from argparse import Namespace +from datetime import datetime +from functools import partial +from glob import glob from braceexpand import braceexpand -from tab import tab from shortuuid import uuid +from tab import tab from .breakpoint import Breakpoint, BreakpointPair -from .constants import COLUMNS, ORIENT, PROTOCOL, sort_columns, STRAND, SVTYPE, MavisNamespace +from .constants import COLUMNS, ORIENT, PROTOCOL, STRAND, SVTYPE, MavisNamespace, sort_columns from .error import InvalidRearrangement from .interval import Interval diff --git a/mavis/validate/base.py b/mavis/validate/base.py index 911e4984..49220ae9 100644 --- 
a/mavis/validate/base.py +++ b/mavis/validate/base.py @@ -1,29 +1,42 @@ import itertools import logging +from abc import abstractproperty +from typing import Dict, List, Optional, Set, Tuple + +import pysam from ..assemble import assemble from ..bam import cigar as _cigar from ..bam import read as _read from ..bam.cache import BamCache -from ..breakpoint import BreakpointPair -from ..constants import ( - CIGAR, - COLUMNS, - NA_MAPPING_QUALITY, - ORIENT, - PROTOCOL, - PYSAM_READ_FLAGS, - STRAND, - SVTYPE, - reverse_complement, -) +from ..breakpoint import Breakpoint, BreakpointPair +from ..constants import (CIGAR, COLUMNS, NA_MAPPING_QUALITY, ORIENT, PROTOCOL, + PYSAM_READ_FLAGS, STRAND, SVTYPE, reverse_complement) from ..error import NotSpecifiedError from ..interval import Interval +from ..schemas import DEFAULTS from ..util import DEVNULL -from .constants import DEFAULTS class Evidence(BreakpointPair): + bam_cache: BamCache + classification: Optional[str] + reference_genome: Dict + read_length: int + stdev_fragment_size: int + median_fragment_size: int + split_reads: Tuple[Set, Set] + flanking_pairs: Set + compatible_flanking_pairs: Set + spanning_reads: Set + counts: List[int] + contigs: List + half_mapped: Tuple[Set, Set] + compatible_window1: Optional[Interval] + compatible_window2: Optional[Interval] + config: Dict + assenmbly_max_kmer_size: int + @property def min_expected_fragment_size(self): # cannot be negative @@ -48,6 +61,30 @@ def max_expected_fragment_size(self): ) ) + @abstractproperty + def strand_determining_read(self): + pass + + @abstractproperty + def outer_window1(self): + pass + + @abstractproperty + def outer_window2(self): + pass + + @abstractproperty + def inner_window1(self): + pass + + @abstractproperty + def inner_window2(self): + pass + + @abstractproperty + def min_mapping_quality(self): + pass + def __init__( self, break1, @@ -60,9 +97,10 @@ def __init__( stranded=False, opposing_strands=None, untemplated_seq=None, - data={}, 
classification=None, - **kwargs + config=DEFAULTS, + assembly_max_kmer_size=None, + **kwargs, ): """ Args: @@ -77,6 +115,7 @@ def __init__( # initialize the breakpoint pair self.bam_cache = bam_cache self.stranded = stranded and bam_cache.stranded + self.config = config BreakpointPair.__init__( self, break1, @@ -84,7 +123,7 @@ def __init__( stranded=stranded, opposing_strands=opposing_strands, untemplated_seq=untemplated_seq, - **data + **kwargs, ) # check that the breakpoints are within the reference length if reference_genome: @@ -110,16 +149,9 @@ def __init__( len(reference_genome[self.break2.chr].seq), ) ) - defaults = dict() - for arg in kwargs: - if arg not in DEFAULTS: - raise AttributeError('unrecognized attribute', arg) - defaults.update(DEFAULTS.items()) - kwargs.setdefault('assembly_max_kmer_size', int(read_length * 0.7)) - defaults.update(kwargs) # input arguments should override the defaults - for arg, val in defaults.items(): - setattr(self, arg, val) - + self.assembly_max_kmer_size = ( + assembly_max_kmer_size if assembly_max_kmer_size is not None else int(read_length * 0.7) + ) self.bam_cache = bam_cache self.classification = classification self.reference_genome = reference_genome @@ -163,11 +195,11 @@ def __init__( pass @staticmethod - def distance(start, end): + def distance(start: int, end: int): return Interval(abs(end - start)) @staticmethod - def traverse(start, distance, direction): + def traverse(start: int, distance: int, direction: str) -> Interval: if direction == ORIENT.LEFT: return Interval(start - distance) return Interval(start + distance) @@ -195,7 +227,9 @@ def standardize_read(self, read): ) prefix = 0 try: - cigar, prefix = _cigar.extend_softclipping(cigar, self.min_anchor_exact) + cigar, prefix = _cigar.extend_softclipping( + cigar, self.config['validate.min_anchor_exact'] + ) except AttributeError: pass read.cigar = _cigar.join(cigar) @@ -229,11 +263,8 @@ def compatible_type(self): return SVTYPE.INS return None - def 
compute_fragment_size(self, read, mate): + def compute_fragment_size(self, read: pysam.AlignedSegment, mate: pysam.AlignedSegment): """ - Args: - read (pysam.AlignedSegment): - mate (pysam.AlignedSegment): Returns: Interval: interval representing the range of possible fragment sizes for this read pair """ @@ -251,7 +282,7 @@ def supporting_reads(self): result.update(self.spanning_reads) return result - def collect_spanning_read(self, read): + def collect_spanning_read(self, read: pysam.AlignedSegment): """ spanning read: a read covering BOTH breakpoints @@ -259,7 +290,7 @@ def collect_spanning_read(self, read): here since they will be collected already Args: - read (pysam.AlignedSegment): the putative spanning read + read: the putative spanning read Returns: bool: @@ -302,14 +333,16 @@ def collect_spanning_read(self, read): return True return False - def collect_compatible_flanking_pair(self, read, mate, compatible_type): + def collect_compatible_flanking_pair( + self, read: pysam.AlignedSegment, mate: pysam.AlignedSegment, compatible_type: str + ) -> bool: """ checks if a given read meets the minimum quality criteria to be counted as evidence as stored as support for this event Args: - read (pysam.AlignedSegment): the read to add - mate (pysam.AlignedSegment): the mate + read: the read to add + mate: the mate compatible_type (SVTYPE): the type we are collecting for Returns: @@ -394,14 +427,14 @@ def collect_compatible_flanking_pair(self, read, mate, compatible_type): return False - def collect_flanking_pair(self, read, mate): + def collect_flanking_pair(self, read: pysam.AlignedSegment, mate: pysam.AlignedSegment): """ checks if a given read meets the minimum quality criteria to be counted as evidence as stored as support for this event Args: - read (pysam.AlignedSegment): the read to add - mate (pysam.AlignedSegment): the mate + read: the read to add + mate: the mate Returns: bool: @@ -505,11 +538,11 @@ def collect_flanking_pair(self, read, mate): return False - 
def collect_half_mapped(self, read, mate): + def collect_half_mapped(self, read: pysam.AlignedSegment, mate: pysam.AlignedSegment): """ Args: - read (pysam.AlignedSegment): the read to add - mate (pysam.AlignedSegment): the unmapped mate + read: the read to add + mate: the unmapped mate Returns: bool: @@ -534,12 +567,12 @@ def collect_half_mapped(self, read, mate): added = True return added - def collect_split_read(self, read, first_breakpoint): + def collect_split_read(self, read: pysam.AlignedSegment, first_breakpoint: bool): """ adds a split read if it passes the criteria filters and raises a warning if it does not Args: - read (pysam.AlignedSegment): the read to add + read: the read to add first_breakpoint (bool): add to the first breakpoint (or second if false) Returns: bool: @@ -605,7 +638,10 @@ def collect_split_read(self, read, first_breakpoint): len(read.query_sequence), ) - if len(primary) < self.min_anchor_exact or len(clipped) < self.min_softclipping: + if ( + len(primary) < self.config['validate.min_anchor_exact'] + or len(clipped) < self.config['validate.min_softclipping'] + ): # split read does not meet the minimum anchor criteria return False if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( @@ -614,14 +650,17 @@ def collect_split_read(self, read, first_breakpoint): read = self.standardize_read(read) # data quality filters if ( - _cigar.alignment_matches(read.cigar) >= self.min_sample_size_to_apply_percentage - and _cigar.match_percent(read.cigar) < self.min_anchor_match + _cigar.alignment_matches(read.cigar) + >= self.config['validate.min_sample_size_to_apply_percentage'] + and _cigar.match_percent(read.cigar) < self.config['validate.min_anchor_match'] ): return False # too poor quality of an alignment if ( - _cigar.longest_exact_match(read.cigar) < self.min_anchor_exact - and _cigar.longest_fuzzy_match(read.cigar, self.fuzzy_mismatch_number) - < self.min_anchor_fuzzy + _cigar.longest_exact_match(read.cigar) < 
self.config['validate.min_anchor_exact'] + and _cigar.longest_fuzzy_match( + read.cigar, self.config['validate.fuzzy_mismatch_number'] + ) + < self.config['validate.min_anchor_fuzzy'] ): return False # too poor quality of an alignment else: @@ -636,14 +675,14 @@ def collect_split_read(self, read, first_breakpoint): putative_alignments = None # figure out how much of the read must match when remaped min_match_tgt = read.cigar[-1][1] if breakpoint.orient == ORIENT.LEFT else read.cigar[0][1] - min_match_tgt = min(min_match_tgt * self.min_anchor_match, min_match_tgt - 1) / len( - read.query_sequence - ) + min_match_tgt = min( + min_match_tgt * self.config['validate.min_anchor_match'], min_match_tgt - 1 + ) / len(read.query_sequence) if not self.opposing_strands: # same strand sc_align = _read.nsb_align( opposite_breakpoint_ref, read.query_sequence, - min_consecutive_match=self.min_anchor_exact, + min_consecutive_match=self.config['validate.min_anchor_exact'], min_match=min_match_tgt, min_overlap_percent=min_match_tgt, ) # split half to this side @@ -657,7 +696,7 @@ def collect_split_read(self, read, first_breakpoint): revcomp_sc_align = _read.nsb_align( opposite_breakpoint_ref, revcomp_sc_align, - min_consecutive_match=self.min_anchor_exact, + min_consecutive_match=self.config['validate.min_anchor_exact'], min_match=min_match_tgt, min_overlap_percent=min_match_tgt, ) @@ -683,7 +722,9 @@ def collect_split_read(self, read, first_breakpoint): alignment.next_reference_id = read.next_reference_id alignment.mapping_quality = NA_MAPPING_QUALITY try: - cigar, offset = _cigar.extend_softclipping(alignment.cigar, self.min_anchor_exact) + cigar, offset = _cigar.extend_softclipping( + alignment.cigar, self.config['validate.min_anchor_exact'] + ) alignment.cigar = cigar alignment.reference_start = alignment.reference_start + offset except AttributeError: @@ -705,27 +746,31 @@ def collect_split_read(self, read, first_breakpoint): alignment.template_length = 0 if ( 
_cigar.alignment_matches(alignment.cigar) - >= self.min_sample_size_to_apply_percentage - and _cigar.match_percent(alignment.cigar) < self.min_anchor_match + >= self.config['validate.min_sample_size_to_apply_percentage'] + and _cigar.match_percent(alignment.cigar) < self.config['validate.min_anchor_match'] ): continue if ( - _cigar.longest_exact_match(alignment.cigar) < self.min_anchor_exact - and _cigar.longest_fuzzy_match(alignment.cigar, self.fuzzy_mismatch_number) - < self.min_anchor_fuzzy + _cigar.longest_exact_match(alignment.cigar) + < self.config['validate.min_anchor_exact'] + and _cigar.longest_fuzzy_match( + alignment.cigar, self.config['validate.fuzzy_mismatch_number'] + ) + < self.config['validate.min_anchor_fuzzy'] ): continue - if self.max_sc_preceeding_anchor is not None: + if self.config['validate.max_sc_preceeding_anchor'] is not None: if opposite_breakpoint.orient == ORIENT.LEFT: if ( alignment.cigar[0][0] == CIGAR.S - and alignment.cigar[0][1] > self.max_sc_preceeding_anchor + and alignment.cigar[0][1] > self.config['validate.max_sc_preceeding_anchor'] ): continue elif opposite_breakpoint.orient == ORIENT.RIGHT: if ( alignment.cigar[-1][0] == CIGAR.S - and alignment.cigar[-1][1] > self.max_sc_preceeding_anchor + and alignment.cigar[-1][1] + > self.config['validate.max_sc_preceeding_anchor'] ): continue alignment.set_key() # set the hash key before we add the read as evidence @@ -747,7 +792,7 @@ def collect_split_read(self, read, first_breakpoint): ) # add to the opposite breakpoint return True - def decide_sequenced_strand(self, reads): + def decide_sequenced_strand(self, reads: Set[pysam.AlignedSegment]): """ given a set of reads, determines the sequenced strand (if possible) and then returns the majority strand found @@ -780,9 +825,9 @@ def decide_sequenced_strand(self, reads): else: ratio = strand_calls[STRAND.POS] / (strand_calls[STRAND.NEG] + strand_calls[STRAND.POS]) neg_ratio = 1 - ratio - if ratio >= self.assembly_strand_concordance: + if 
ratio >= self.config['validate.assembly_strand_concordance']: return STRAND.POS - elif neg_ratio >= self.assembly_strand_concordance: + elif neg_ratio >= self.config['validate.assembly_strand_concordance']: return STRAND.NEG raise ValueError( 'Could not determine the strand. Equivocal POS/(NEG + POS) ratio', @@ -910,9 +955,9 @@ def assemble_contig(self, log=DEVNULL): build_strand[STRAND.NEG] + build_strand[STRAND.POS] ) neg_ratio = 1 - ratio - if ratio >= self.assembly_strand_concordance: + if ratio >= self.config['validate.assembly_strand_concordance']: flipped_build = False - elif neg_ratio >= self.assembly_strand_concordance: + elif neg_ratio >= self.config['validate.assembly_strand_concordance']: flipped_build = True else: continue @@ -1180,3 +1225,26 @@ def get_bed_repesentation(self): bed.append((self.break2.chr, self.outer_window2[0] - 1, self.outer_window2[1], name)) bed.append((self.break2.chr, self.inner_window2[0] - 1, self.inner_window2[1], name)) return bed + + def generate_window(self, breakpoint: Breakpoint) -> Interval: + """ + given some input breakpoint uses the current evidence setting to determine an + appropriate window/range of where one should search for supporting reads + + Args: + breakpoint (Breakpoint): the breakpoint we are generating the evidence window for + read_length (int): the read length + call_error (int): + adds a buffer to the calculations if confidence in the breakpoint calls is low can increase this + Returns: + Interval: the range where reads should be read from the bam looking for evidence for this event + """ + call_error = self.config['validate.call_error'] + start = breakpoint.start - self.max_expected_fragment_size - call_error + 1 + end = breakpoint.end + self.max_expected_fragment_size + call_error - 1 + + if breakpoint.orient == ORIENT.LEFT: + end = breakpoint.end + call_error + self.read_length - 1 + elif breakpoint.orient == ORIENT.RIGHT: + start = breakpoint.start - call_error - self.read_length + 1 + return 
Interval(max([1, start]), max([end, 1])) diff --git a/mavis/validate/call.py b/mavis/validate/call.py index 2c69d981..a2f49bfc 100644 --- a/mavis/validate/call.py +++ b/mavis/validate/call.py @@ -1,10 +1,10 @@ import itertools import math import statistics +from typing import Optional, Set -from ..align import SplitAlignment, call_read_events, call_paired_read_event, convert_to_duplication +from ..align import SplitAlignment, call_paired_read_event, call_read_events, convert_to_duplication from ..bam import read as _read - from ..breakpoint import Breakpoint, BreakpointPair from ..constants import ( CALL_METHOD, @@ -25,6 +25,15 @@ class for holding evidence and the related calls since we can't freeze the evide just a reference to the evidence object and decisions on class, exact breakpoints, etc """ + spanning_reads: Set + flanking_pairs: Set + break1_split_reads: Set + break2_split_reads: Set + compatible_flanking_pairs: Set + compatible_type: str + contig: Optional + contig_alignment: Optional + @property def has_compatible(self): return False if self.compatible_type is None else True diff --git a/mavis/validate/constants.py b/mavis/validate/constants.py index a1a84ed5..a8c884d6 100644 --- a/mavis/validate/constants.py +++ b/mavis/validate/constants.py @@ -1,263 +1 @@ -from ..align import SUPPORTED_ALIGNER -from ..constants import float_fraction -from ..util import WeakMavisNamespace - PASS_FILENAME = 'validation-passed.tab' - -DEFAULTS = WeakMavisNamespace() -""" -see [settings section](/configuration/settings) -""" -DEFAULTS.add( - 'min_call_complexity', - 0.10, - cast_type=float_fraction, - defn='The minimum complexity score for a call sequence. Is an average for non-contig calls. Filters ' - 'low complexity contigs before alignment. 
see [contig_complexity](#contig_complexity)', -) -DEFAULTS.add( - 'aligner', - SUPPORTED_ALIGNER.BLAT, - cast_type=SUPPORTED_ALIGNER, - defn='the aligner to use to map the contigs/reads back to the reference e.g blat or bwa', -) -DEFAULTS.add( - 'assembly_kmer_size', - 0.74, - cast_type=float_fraction, - defn='The percent of the read length to make kmers for assembly', -) -DEFAULTS.add( - 'assembly_max_paths', - 8, - defn='the maximum number of paths to resolve. This is used to limit when there is a messy assembly graph to ' - 'resolve. The assembly will pre-calculate the number of paths (or putative assemblies) and stop if it is greater ' - 'than the given setting.', -) -DEFAULTS.add( - 'assembly_min_uniq', - 0.10, - cast_type=float_fraction, - defn='Minimum percent uniq required to keep separate assembled contigs. If contigs are more similar then the lower scoring, then shorter, contig is dropped', -) -DEFAULTS.add( - 'assembly_min_exact_match_to_remap', - 15, - defn='The minimum length of exact matches to initiate remapping a read to a contig', -) -DEFAULTS.add( - 'assembly_min_edge_trim_weight', - 3, - defn='this is used to simplify the DeBruijn graph before path finding. 
Edges with less than this frequency will ' - 'be discarded if they are non-cutting, at a fork, or the end of a path', -) -DEFAULTS.add( - 'assembly_min_remap_coverage', - 0.9, - cast_type=float_fraction, - defn='Minimum fraction of the contig sequence which the remapped sequences must align over', -) -DEFAULTS.add( - 'assembly_min_remapped_seq', - 3, - defn='The minimum input sequences that must remap for an assembled contig to be used', -) -DEFAULTS.add( - 'assembly_strand_concordance', - 0.51, - cast_type=float_fraction, - defn='When the number of remapped reads from each strand are compared, the ratio must be above this number to ' - 'decide on the strand', -) -DEFAULTS.add( - 'blat_min_identity', - 0.9, - cast_type=float_fraction, - defn='The minimum percent identity match required for blat results when aligning contigs', -) -DEFAULTS.add( - 'blat_limit_top_aln', 10, defn='Number of results to return from blat (ranking based on score)' -) -DEFAULTS.add('call_error', 10, defn='buffer zone for the evidence window') -DEFAULTS.add( - 'contig_aln_max_event_size', - 50, - defn='relates to determining breakpoints when pairing contig alignments. For any given read in a putative pair ' - 'the soft clipping is extended to include any events of greater than this size. The softclipping is added to the ' - 'side of the alignment as indicated by the breakpoint we are assigning pairs to', -) -DEFAULTS.add( - 'contig_aln_merge_inner_anchor', - 20, - defn='the minimum number of consecutive exact match base pairs to not merge events within a contig alignment', -) -DEFAULTS.add( - 'contig_aln_merge_outer_anchor', - 15, - defn='minimum consecutively aligned exact matches to anchor an end for merging internal events', -) -DEFAULTS.add( - 'contig_aln_min_anchor_size', - 50, - defn='the minimum number of aligned bases for a contig (M or =) in order to simplify. 
Do not have to be consecutive.', -) -DEFAULTS.add( - 'contig_aln_min_query_consumption', - 0.9, - cast_type=float_fraction, - defn='minimum fraction of the original query sequence that must be used by the read(s) of the alignment', -) -DEFAULTS.add( - 'contig_aln_min_extend_overlap', - 10, - defn='minimum number of bases the query coverage interval must be extended by in order to pair alignments as a single split alignment', -) -DEFAULTS.add( - 'contig_aln_min_score', - 0.9, - cast_type=float_fraction, - defn='minimum score for a contig to be used as evidence in a call by contig', -) -DEFAULTS.add( - 'fetch_min_bin_size', - 50, - defn='the minimum size of any bin for reading from a bam file. Increasing this number will result in smaller bins ' - 'being merged or less bins being created (depending on the fetch method)', -) -DEFAULTS.add( - 'fetch_reads_bins', - 5, - defn='number of bins to split an evidence window into to ensure more even sampling of high coverage regions', -) -DEFAULTS.add( - 'fetch_reads_limit', - 3000, - defn='maximum number of reads, cap, to loop over for any given evidence window', -) -DEFAULTS.add( - 'trans_fetch_reads_limit', - 12000, - cast_type=int, - nullable=True, - defn='Related to [fetch_reads_limit](#fetch_reads_limit). Overrides fetch_reads_limit for transcriptome libraries when set. ' - 'If this has a value of None then fetch_reads_limit will be used for transcriptome libraries instead', -) -DEFAULTS.add( - 'filter_secondary_alignments', - True, - defn='filter secondary alignments when gathering read evidence', -) -DEFAULTS.add( - 'fuzzy_mismatch_number', - 1, - defn='The number of events/mismatches allowed to be considered a fuzzy match', -) -DEFAULTS.add( - 'max_sc_preceeding_anchor', - 6, - defn='when remapping a softclipped read this determines the amount of softclipping allowed on the side opposite of ' - 'where we expect it. 
For example for a softclipped read on a breakpoint with a left orientation this limits the ' - 'amount of softclipping that is allowed on the right. If this is set to None then there is no limit on softclipping', -) -DEFAULTS.add( - 'min_anchor_exact', - 6, - defn='Applies to re-aligning softclipped reads to the opposing breakpoint. The minimum ' - 'number of consecutive exact matches to anchor a read to initiate targeted realignment', -) -DEFAULTS.add( - 'min_anchor_fuzzy', - 10, - defn='Applies to re-aligning softclipped reads to the opposing breakpoint. The minimum ' - 'length of a fuzzy match to anchor a read to initiate targeted realignment', -) -DEFAULTS.add( - 'min_anchor_match', - 0.9, - cast_type=float_fraction, - defn='Minimum percent match for a read to be kept as evidence', -) -DEFAULTS.add( - 'min_double_aligned_to_estimate_insertion_size', - 2, - defn='The minimum number of reads which map soft-clipped to both breakpoints to assume the size of the ' - 'untemplated sequence between the breakpoints is at most the read length - 2 * min_softclipping', -) -DEFAULTS.add( - 'min_flanking_pairs_resolution', - 10, - defn='the minimum number of flanking reads required to call a breakpoint by flanking evidence', -) -DEFAULTS.add( - 'min_linking_split_reads', - 2, - defn='The minimum number of split reads which aligned to both breakpoints', -) -DEFAULTS.add( - 'min_mapping_quality', 5, defn='the minimum mapping quality of reads to be used as evidence' -) -DEFAULTS.add( - 'trans_min_mapping_quality', - 0, - cast_type=int, - nullable=True, - defn='Related to [min_mapping_quality](#min_mapping_quality). Overrides the min_mapping_quality if the library is a transcriptome ' - 'and this is set to any number not None. 
If this value is None, min_mapping_quality is used for transcriptomes as' - 'well as genomes', -) -DEFAULTS.add( - 'min_non_target_aligned_split_reads', - 1, - defn='The minimum number of split reads aligned to a breakpoint by the input bam and no forced by local ' - 'alignment to the target region to call a breakpoint by split read evidence', -) -DEFAULTS.add( - 'min_sample_size_to_apply_percentage', - 10, - defn='Minimum number of aligned bases to compute a match percent. ' - 'If there are less than this number of aligned bases (match or mismatch) the percent comparator is not used', -) -DEFAULTS.add( - 'min_softclipping', - 6, - defn='minimum number of soft-clipped bases required for a read to be used as soft-clipped evidence', -) -DEFAULTS.add( - 'min_spanning_reads_resolution', - 5, - defn='Minimum number of spanning reads required to call an event by spanning evidence', -) -DEFAULTS.add( - 'min_splits_reads_resolution', - 3, - defn='minimum number of split reads required to call a breakpoint by split reads', -) -DEFAULTS.add( - 'stdev_count_abnormal', - 3.0, - defn='the number of standard deviations away from the normal considered expected and therefore not qualifying as ' - 'flanking reads', -) -DEFAULTS.add( - 'strand_determining_read', - 2, - defn='1 or 2. The read in the pair which determines if (assuming a stranded protocol) the first or second read in ' - 'the pair matches the strand sequenced', -) -DEFAULTS.add( - 'outer_window_min_event_size', - 125, - defn='the minimum size of an event in order for flanking read evidence to be collected', -) -DEFAULTS.add( - 'write_evidence_files', - True, - defn='write the intermediate bam and bed files containing the raw evidence collected and ' - 'contigs aligned. Not required for subsequent steps but can be useful in debugging and deep investigation of events', -) -DEFAULTS.add( - 'clean_aligner_files', - False, - defn='Remove the aligner output files after the validation stage is complete. 
Not' - ' required for subsequent steps but can be useful in debugging and deep investigation of events', -) diff --git a/mavis/validate/evidence.py b/mavis/validate/evidence.py index 0d065d7e..8719210c 100644 --- a/mavis/validate/evidence.py +++ b/mavis/validate/evidence.py @@ -1,28 +1,49 @@ import itertools +from typing import Optional + +import pysam -from .base import Evidence from ..align import SplitAlignment, call_read_events -from ..bam import cigar as _cigar from ..annotate.variant import overlapping_transcripts +from ..bam import cigar as _cigar from ..breakpoint import Breakpoint -from ..constants import ORIENT, PROTOCOL, STRAND, SVTYPE, CIGAR +from ..constants import CIGAR, ORIENT, PROTOCOL, STRAND, SVTYPE from ..interval import Interval +from ..schemas import DEFAULTS +from .base import Evidence class GenomeEvidence(Evidence): + outer_window1: Interval + outer_window2: Interval + inner_window1: Interval + inner_window2: Interval + compatible_window1: Interval + compatible_window2: Interval + protocol: str + + @property + def min_mapping_quality(self): + return self.config['validate.min_mapping_quality'] + + @property + def fetch_reads_limit(self): + return self.config['validate.fetch_reads_limit'] + def __init__(self, *pos, **kwargs): Evidence.__init__(self, *pos, **kwargs) self.protocol = PROTOCOL.GENOME self.outer_window1 = self.generate_window(self.break1) self.outer_window2 = self.generate_window(self.break2) + call_error = self.config['validate.call_error'] self.inner_window1 = Interval( - max([self.break1.start - self.call_error - self.read_length + 1, 1]), - self.break1.end + self.call_error + self.read_length - 1, + max([self.break1.start - call_error - self.read_length + 1, 1]), + self.break1.end + call_error + self.read_length - 1, ) self.inner_window2 = Interval( - max([self.break2.start - self.call_error - self.read_length + 1, 1]), - self.break2.end + self.call_error + self.read_length - 1, + max([self.break2.start - call_error - 
self.read_length + 1, 1]), + self.break2.end + call_error + self.read_length - 1, ) if SVTYPE.INS in self.putative_event_types(): @@ -64,42 +85,32 @@ def __init__(self, *pos, **kwargs): self.compatible_window1 = self.generate_window(compt_break1) self.compatible_window2 = self.generate_window(compt_break2) - def generate_window(self, breakpoint): - """ - given some input breakpoint uses the current evidence setting to determine an - appropriate window/range of where one should search for supporting reads - - Args: - breakpoint (Breakpoint): the breakpoint we are generating the evidence window for - read_length (int): the read length - call_error (int): - adds a buffer to the calculations if confidence in the breakpoint calls is low can increase this - Returns: - Interval: the range where reads should be read from the bam looking for evidence for this event - """ - start = breakpoint.start - self.max_expected_fragment_size - self.call_error + 1 - end = breakpoint.end + self.max_expected_fragment_size + self.call_error - 1 - - if breakpoint.orient == ORIENT.LEFT: - end = breakpoint.end + self.call_error + self.read_length - 1 - elif breakpoint.orient == ORIENT.RIGHT: - start = breakpoint.start - self.call_error - self.read_length + 1 - return Interval(max([1, start]), max([end, 1])) - - def compute_fragment_size(self, read, mate=None): + def compute_fragment_size( + self, read: pysam.AlignedSegment, mate: Optional[pysam.AlignedSegment] = None + ): return Interval(abs(read.template_length)) class TranscriptomeEvidence(Evidence): + outer_window1: Interval + outer_window2: Interval + inner_window1: Interval + inner_window2: Interval + compatible_window1: Interval + compatible_window2: Interval + protocol: str + + @property + def min_mapping_quality(self): + return self.config['validate.trans_min_mapping_quality'] + + @property + def fetch_reads_limit(self): + return self.config['validate.trans_fetch_reads_limit'] + def __init__(self, annotations, *pos, **kwargs): 
Evidence.__init__(self, *pos, **kwargs) - # set the transcriptome specific overrides - if self.trans_min_mapping_quality is not None: - self.min_mapping_quality = self.trans_min_mapping_quality - if self.trans_fetch_reads_limit is not None: - self.fetch_reads_limit = self.trans_fetch_reads_limit - self.protocol = PROTOCOL.TRANS # get the list of overlapping transcripts self.overlapping_transcripts = overlapping_transcripts( @@ -108,7 +119,8 @@ def __init__(self, annotations, *pos, **kwargs): self.outer_window1 = self.generate_window(self.break1) self.outer_window2 = self.generate_window(self.break2) - tgt = self.call_error + self.read_length - 1 + call_error = self.config['validate.call_error'] + tgt = call_error + self.read_length - 1 self.inner_window1 = self.traverse(self.break1.end, tgt, ORIENT.RIGHT) | self.traverse( self.break1.start, tgt, ORIENT.LEFT @@ -156,7 +168,14 @@ def __init__(self, annotations, *pos, **kwargs): self.compatible_window1 = self.generate_window(compt_break1) self.compatible_window2 = self.generate_window(compt_break2) - def traverse(self, start, distance, direction, strand=STRAND.NS, chrom=None): + def traverse( # type: ignore + self, + start: int, + distance: int, + direction: str, + strand: str = STRAND.NS, + chrom: Optional[str] = None, + ): """ given some genomic position and a distance. 
Uses the input transcripts to compute all possible genomic end positions at that distance if intronic @@ -166,7 +185,6 @@ def traverse(self, start, distance, direction, strand=STRAND.NS, chrom=None): start (int): the genomic start position distance (int): the amount of exonic/intergenic units to traverse direction (ORIENT): the direction wrt to the positive/forward reference strand to traverse - transcripts (List[PreTranscript]): list of transcripts to use """ transcripts = self._select_transcripts(chrom, strand) is_left = True if direction == ORIENT.LEFT else False @@ -230,7 +248,7 @@ def _select_transcripts(self, chrom=None, strand=STRAND.NS): result.append(transcript) return result - def distance(self, start, end, strand=STRAND.NS, chrom=None): + def distance(self, start: int, end: int, strand: str = STRAND.NS, chrom: Optional[str] = None): """ give the current list of transcripts, computes the putative exonic/intergenic distance given two genomic positions. Intronic positions are ignored @@ -265,7 +283,7 @@ def distance(self, start, end, strand=STRAND.NS, chrom=None): return Interval.from_iterable(inter) return Evidence.distance(start, end) - def generate_window(self, breakpoint): + def generate_window(self, breakpoint: Breakpoint): """ given some input breakpoint uses the current evidence setting to determine an appropriate window/range of where one should search for supporting reads @@ -282,7 +300,7 @@ def generate_window(self, breakpoint): Returns: Interval: the range where reads should be read from the bam looking for evidence for this event """ - window = GenomeEvidence.generate_window(self, breakpoint) + window = Evidence.generate_window(self, breakpoint) tgt_left = Evidence.distance(window.start, breakpoint.start) # amount to expand to the left tgt_right = Evidence.distance(breakpoint.end, window.end) # amount to expand to the right window1 = self.traverse( diff --git a/mavis/validate/main.py b/mavis/validate/main.py index 6dc14203..65855c37 100644 --- 
a/mavis/validate/main.py +++ b/mavis/validate/main.py @@ -15,8 +15,8 @@ from ..bam import cigar as _cigar from ..bam.cache import BamCache from ..breakpoint import BreakpointPair -from ..config import get_by_prefix from ..constants import CALL_METHOD, COLUMNS, PROTOCOL +from ..schemas import get_by_prefix from ..util import ( LOG, filter_on_overlap, @@ -105,11 +105,11 @@ def main( opposing_strands=bpp.opposing_strands, stranded=bpp.stranded, untemplated_seq=bpp.untemplated_seq, - data=bpp.data, stdev_fragment_size=config['libraries'][library]['stdev_fragment_size'], read_length=config['libraries'][library]['read_length'], median_fragment_size=config['libraries'][library]['median_fragment_size'], - **get_by_prefix(config, 'validate.') + config=config, + **bpp.data ) evidence_clusters.append(evidence) except ValueError as err: @@ -127,12 +127,12 @@ def main( opposing_strands=bpp.opposing_strands, stranded=bpp.stranded, untemplated_seq=bpp.untemplated_seq, - data=bpp.data, stdev_fragment_size=config['libraries'][library]['stdev_fragment_size'], read_length=config['libraries'][library]['read_length'], median_fragment_size=config['libraries'][library]['median_fragment_size'], strand_determining_read=config['libraries'][library]['strand_determining_read'], - **get_by_prefix(config, 'validate.') + config=config, + **bpp.data ) evidence_clusters.append(evidence) except ValueError as err: From 089c0327b554bcecb24058293607a50a0e3a4544 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 19 Apr 2021 23:37:14 -0700 Subject: [PATCH 005/137] use config not constants - replace the constants objects with the input config object --- mavis/align.py | 20 +++-- mavis/annotate/main.py | 7 +- mavis/annotate/variant.py | 9 +- mavis/breakpoint.py | 73 +++++++++++++++- mavis/pairing/main.py | 7 +- mavis/schemas/__init__.py | 31 +++++++ mavis/summary/summary.py | 44 +++++----- mavis/validate/base.py | 94 +++++++++------------ mavis/validate/call.py | 38 +++++---- 
mavis/validate/evidence.py | 4 +- mavis/validate/main.py | 7 +- tests/end_to_end/test_convert.py | 4 +- tests/integration/test_align.py | 38 +++++---- tests/integration/test_assemble.py | 16 ++-- tests/integration/test_breakpoint.py | 7 +- tests/integration/test_illustrate.py | 37 ++++---- tests/integration/test_pairing.py | 24 +++--- tests/integration/test_validate.py | 38 +++++---- tests/integration/test_validate_call.py | 92 +++++++++++--------- tests/integration/test_validate_evidence.py | 49 +++++++---- tests/setup_subprocess_cov.py | 10 +++ tests/unit/test_breakpoint.py | 21 ----- tests/unit/test_summary.py | 8 +- tests/unit/test_tool.py | 27 +++--- 24 files changed, 410 insertions(+), 295 deletions(-) create mode 100644 mavis/schemas/__init__.py create mode 100644 tests/setup_subprocess_cov.py diff --git a/mavis/align.py b/mavis/align.py index cdf0a287..f5f493a7 100644 --- a/mavis/align.py +++ b/mavis/align.py @@ -534,11 +534,13 @@ def select_contig_alignments(evidence, reads_by_query): def filter_pass(alignment): return not any( [ - alignment.query_consumption() < evidence.contig_aln_min_query_consumption, - alignment.score() < evidence.contig_aln_min_score, + alignment.query_consumption() + < evidence.config['validate.contig_aln_min_query_consumption'], + alignment.score() < evidence.config['validate.contig_aln_min_score'], alignment.mapping_quality() == Interval(0), alignment.read2 is not None - and alignment.query_overlap_extension() < evidence.contig_aln_min_extend_overlap, + and alignment.query_overlap_extension() + < evidence.config['validate.contig_aln_min_extend_overlap'], ] ) @@ -563,8 +565,8 @@ def supports_primary_event(alignment): read = evidence.standardize_read(raw_read) read.cigar = _cigar.merge_internal_events( read.cigar, - inner_anchor=evidence.contig_aln_merge_inner_anchor, - outer_anchor=evidence.contig_aln_merge_outer_anchor, + inner_anchor=evidence.config['validate.contig_aln_merge_inner_anchor'], + 
outer_anchor=evidence.config['validate.contig_aln_merge_outer_anchor'], ) read = evidence.standardize_read( read @@ -581,8 +583,8 @@ def supports_primary_event(alignment): _read.convert_events_to_softclipping( read, evidence.break1.orient, - max_event_size=evidence.contig_aln_max_event_size, - min_anchor_size=evidence.contig_aln_min_anchor_size, + max_event_size=evidence.config['validate.contig_aln_max_event_size'], + min_anchor_size=evidence.config['validate.contig_aln_min_anchor_size'], ) ) if evidence.break1.orient == evidence.break2.orient: @@ -591,8 +593,8 @@ def supports_primary_event(alignment): _read.convert_events_to_softclipping( read, evidence.break2.orient, - max_event_size=evidence.contig_aln_max_event_size, - min_anchor_size=evidence.contig_aln_min_anchor_size, + max_event_size=evidence.config['validate.contig_aln_max_event_size'], + min_anchor_size=evidence.config['validate.contig_aln_min_anchor_size'], ) ) diff --git a/mavis/annotate/main.py b/mavis/annotate/main.py index ffd54bd7..6103ea44 100644 --- a/mavis/annotate/main.py +++ b/mavis/annotate/main.py @@ -8,7 +8,7 @@ from ..error import DrawingFitError, NotSpecifiedError from ..illustrate.constants import DiagramSettings from ..illustrate.diagram import draw_sv_summary_diagram -from ..schemas import DEFAULTS +from ..schemas import DEFAULTS, get_by_prefix from ..util import LOG, generate_complete_stamp, mkdirp, read_inputs from .constants import PASS_FILENAME from .file_io import ReferenceFile @@ -167,10 +167,7 @@ def main( ) # now try generating the svg - illustration_defaults = get_by_prefix(DEFAULTS, 'illustrate.') - drawing_config = DiagramSettings( - **{k: v for k, v in kwargs.items() if k in illustration_defaults} - ) + drawing_config = DiagramSettings(**get_by_prefix(config, 'illustrate.')) header_req = { COLUMNS.break1_strand, diff --git a/mavis/annotate/variant.py b/mavis/annotate/variant.py index 050242fa..64b3732e 100644 --- a/mavis/annotate/variant.py +++ b/mavis/annotate/variant.py @@ 
-47,14 +47,9 @@ def __init__( opposing_strands=bpp.opposing_strands, stranded=bpp.stranded, untemplated_seq=bpp.untemplated_seq, + **bpp.data, + **kwargs ) - self.data.update(bpp.data) - if data is not None: - conflicts = set(kwargs.keys()) & set(data.keys()) - self.data.update(data) - if conflicts: - raise TypeError('got multiple values for data elements:', conflicts) - self.data.update(kwargs) # match transcript to breakpoint if reveresed if bpp.break1.key[0:3] < bpp.break2.key[0:3]: diff --git a/mavis/breakpoint.py b/mavis/breakpoint.py index 2f903cd6..dfdda6a5 100644 --- a/mavis/breakpoint.py +++ b/mavis/breakpoint.py @@ -139,6 +139,74 @@ def __lt__(self, other): return True return self.untemplated_seq < other.untemplated_seq + @property + def library(self) -> Optional[str]: + return self.data.get(COLUMNS.library) + + @property + def cdna_synon(self) -> Optional[bool]: + return self.data.get(COLUMNS.cdna_synon) + + @property + def contig_remapped_reads(self) -> Optional[int]: + return self.data.get(COLUMNS.contig_remapped_reads) + + @property + def disease_status(self) -> Optional[str]: + return self.data.get(COLUMNS.disease_status) + + @property + def event_type(self) -> Optional[str]: + return self.data.get(COLUMNS.event_type) + + @property + def inferred_pairing(self) -> Optional[str]: + return self.data.get(COLUMNS.inferred_pairing) + + @property + def pairing(self) -> Optional[str]: + return self.data.get(COLUMNS.pairing) + + @property + def protocol(self) -> Optional[str]: + return self.data.get(COLUMNS.protocol) + + @property + def fusion_cdna_coding_start(self) -> Optional[int]: + return self.data.get(COLUMNS.fusion_cdna_coding_start) + + @property + def fusion_cdna_coding_end(self) -> Optional[int]: + return self.data.get(COLUMNS.fusion_cdna_coding_end) + + @property + def fusion_sequence_fasta_id(self) -> Optional[str]: + return self.data.get(COLUMNS.fusion_sequence_fasta_id) + + @property + def fusion_splicing_pattern(self) -> Optional[str]: + return 
self.data.get(COLUMNS.fusion_splicing_pattern) + + @property + def linking_split_reads(self) -> Optional[int]: + return self.data.get(COLUMNS.linking_split_reads) + + @property + def repeat_count(self) -> Optional[int]: + return self.data.get(COLUMNS.repeat_count) + + @property + def tracking_id(self) -> Optional[str]: + return self.data.get(COLUMNS.tracking_id) + + @property + def cluster_id(self) -> Optional[str]: + return self.data.get(COLUMNS.cluster_id) + + @property + def annotation_id(self) -> Optional[str]: + return self.data.get(COLUMNS.annotation_id) + @property def interchromosomal(self) -> bool: """bool: True if the breakpoints are on different chromosomes, False otherwise""" @@ -242,6 +310,9 @@ def __init__( # try classifying to make sure it's a valid combination BreakpointPair.classify(self) + def column(self, colname: str): + return self.data.get(COLUMNS[colname]) + def __str__(self): return 'BPP({}, {}{}{})'.format( str(self.break1), @@ -332,7 +403,7 @@ def classify(cls, pair, distance: Optional[Callable] = None) -> Set[str]: return {SVTYPE.DEL, SVTYPE.INS} elif pair.break1.orient == ORIENT.RIGHT or pair.break2.orient == ORIENT.LEFT: return {SVTYPE.DUP} - raise InvalidRearrangement(pair) + return {SVTYPE.DEL, SVTYPE.INS, SVTYPE.DUP} else: # interchromosomal if pair.opposing_strands: if pair.LR or pair.RL: diff --git a/mavis/pairing/main.py b/mavis/pairing/main.py index debd823d..8a6dbe1a 100644 --- a/mavis/pairing/main.py +++ b/mavis/pairing/main.py @@ -15,7 +15,6 @@ def main( output: str, config: Dict, start_time=int(time.time()), - **kwargs, ): """ Args: @@ -82,8 +81,10 @@ def main( category = (bpp.break1.chr, bpp.break2.chr, bpp.opposing_strands, bpp.event_type) bpp.data[COLUMNS.product_id] = product_key(bpp) calls_by_cat.setdefault(category, []).append(bpp) - if bpp.gene1 or bpp.gene2: - calls_by_ann.setdefault((bpp.transcript1, bpp.transcript2), []).append(bpp) + if bpp.data.get(COLUMNS.gene1) or bpp.data.get(COLUMNS.gene2): + 
calls_by_ann.setdefault( + (bpp.data.get(COLUMNS.transcript1), bpp.data.get(COLUMNS.transcript2)), [] + ).append(bpp) bpp.data[COLUMNS.pairing] = '' bpp.data[COLUMNS.inferred_pairing] = '' diff --git a/mavis/schemas/__init__.py b/mavis/schemas/__init__.py new file mode 100644 index 00000000..a0568bac --- /dev/null +++ b/mavis/schemas/__init__.py @@ -0,0 +1,31 @@ +import collections +import os + +from snakemake.utils import validate as snakemake_validate + + +class ImmutableDict(collections.Mapping): + def __init__(self, data): + self._data = data + + def __getitem__(self, key): + return self._data[key] + + def __len__(self): + return len(self._data) + + def __iter__(self): + return iter(self._data) + + +def get_by_prefix(config, prefix): + return {k.replace(prefix, ''): v for k, v in config.items() if k.startswith(prefix)} + + +DEFAULTS = {} +snakemake_validate( + DEFAULTS, + os.path.join(os.path.dirname(__file__), 'config.json'), + set_default=True, +) +DEFAULTS = ImmutableDict(DEFAULTS) diff --git a/mavis/summary/summary.py b/mavis/summary/summary.py index 524bbe95..a3ab1ecc 100644 --- a/mavis/summary/summary.py +++ b/mavis/summary/summary.py @@ -30,13 +30,13 @@ def sort_key(bpp): result.extend( [ - 0 if bpp.transcript1 in best_transcripts else 1, - 0 if bpp.transcript2 in best_transcripts else 1, - sum([bpp.transcript1 is None, bpp.transcript2 is None]), - string_ranks[bpp.gene1], - string_ranks[bpp.gene2], - string_ranks[bpp.transcript1], - string_ranks[bpp.transcript2], + 0 if bpp.data['transcript1'] in best_transcripts else 1, + 0 if bpp.data['transcript2'] in best_transcripts else 1, + sum([bpp.data['transcript1'] is None, bpp.data['transcript2'] is None]), + string_ranks[bpp.data['gene1']], + string_ranks[bpp.data['gene2']], + string_ranks[bpp.data['transcript1']], + string_ranks[bpp.data['transcript2']], ] ) return tuple(result) @@ -281,35 +281,39 @@ def filter_by_evidence( filtered = [] removed = [] for bpp in bpps: - if bpp.call_method == 
CALL_METHOD.CONTIG: + if bpp.column('call_method') == CALL_METHOD.CONTIG: # inherently the breakpoints have been linked if int(bpp.contig_remapped_reads) < filter_min_remapped_reads: removed.append(bpp) continue - elif bpp.call_method == CALL_METHOD.SPAN: + elif bpp.column('call_method') == CALL_METHOD.SPAN: if bpp.spanning_reads < filter_min_spanning_reads: removed.append(bpp) continue - elif bpp.call_method == CALL_METHOD.SPLIT: - linking_split_reads = bpp.linking_split_reads + elif bpp.column('call_method') == CALL_METHOD.SPLIT: + linking_split_reads = bpp.column('linking_split_reads') if bpp.event_type == SVTYPE.INS: - linking_split_reads += bpp.flanking_pairs + linking_split_reads += bpp.column('flanking_pairs') if any( [ - bpp.break1_split_reads + bpp.break1_split_reads_forced < filter_min_split_reads, - bpp.break2_split_reads + bpp.break2_split_reads_forced < filter_min_split_reads, + bpp.column('break1_split_reads') + bpp.column('break1_split_reads_forced') + < filter_min_split_reads, + bpp.column('break2_split_reads') + bpp.column('break2_split_reads_forced') + < filter_min_split_reads, linking_split_reads < filter_min_linking_split_reads, - bpp.break1_split_reads < 1, - bpp.break2_split_reads < 1, + bpp.column('break1_split_reads') < 1, + bpp.column('break2_split_reads') < 1, ] ): removed.append(bpp) continue - elif bpp.call_method == CALL_METHOD.FLANK: - if bpp.flanking_pairs < filter_min_flanking_reads: + elif bpp.column('call_method') == CALL_METHOD.FLANK: + if bpp.column('flanking_pairs') < filter_min_flanking_reads: removed.append(bpp) continue - elif bpp.call_method != CALL_METHOD.INPUT: - raise AssertionError('unexpected value for call_method: {}'.format(bpp.call_method)) + elif bpp.column('call_method') != CALL_METHOD.INPUT: + raise AssertionError( + 'unexpected value for call_method: {}'.format(bpp.column('call_method')) + ) filtered.append(bpp) return filtered, removed diff --git a/mavis/validate/base.py b/mavis/validate/base.py index 
49220ae9..3b2da0aa 100644 --- a/mavis/validate/base.py +++ b/mavis/validate/base.py @@ -1,6 +1,6 @@ import itertools import logging -from abc import abstractproperty +from abc import abstractmethod from typing import Dict, List, Optional, Set, Tuple import pysam @@ -45,7 +45,7 @@ def min_expected_fragment_size(self): max( [ self.median_fragment_size - - self.stdev_fragment_size * self.stdev_count_abnormal, + - self.stdev_fragment_size * self.config['validate.stdev_count_abnormal'], 0, ] ), @@ -57,31 +57,14 @@ def min_expected_fragment_size(self): def max_expected_fragment_size(self): return int( round( - self.median_fragment_size + self.stdev_fragment_size * self.stdev_count_abnormal, 0 + self.median_fragment_size + + self.stdev_fragment_size * self.config['validate.stdev_count_abnormal'], + 0, ) ) - @abstractproperty - def strand_determining_read(self): - pass - - @abstractproperty - def outer_window1(self): - pass - - @abstractproperty - def outer_window2(self): - pass - - @abstractproperty - def inner_window1(self): - pass - - @abstractproperty - def inner_window2(self): - pass - - @abstractproperty + @property + @abstractmethod def min_mapping_quality(self): pass @@ -100,6 +83,7 @@ def __init__( classification=None, config=DEFAULTS, assembly_max_kmer_size=None, + strand_determining_read=2, **kwargs, ): """ @@ -115,7 +99,8 @@ def __init__( # initialize the breakpoint pair self.bam_cache = bam_cache self.stranded = stranded and bam_cache.stranded - self.config = config + self.config = dict(**DEFAULTS) + self.config.update(config) BreakpointPair.__init__( self, break1, @@ -154,12 +139,13 @@ def __init__( ) self.bam_cache = bam_cache self.classification = classification - self.reference_genome = reference_genome - self.read_length = read_length - self.stdev_fragment_size = stdev_fragment_size - self.median_fragment_size = median_fragment_size self.compatible_window1 = None self.compatible_window2 = None + self.median_fragment_size = median_fragment_size + 
self.read_length = read_length + self.reference_genome = reference_genome + self.stdev_fragment_size = stdev_fragment_size + self.strand_determining_read = strand_determining_read if self.classification is not None and self.classification not in BreakpointPair.classify( self @@ -235,8 +221,8 @@ def standardize_read(self, read): read.cigar = _cigar.join(cigar) read.cigar = _cigar.merge_internal_events( read.cigar, - inner_anchor=self.contig_aln_merge_inner_anchor, - outer_anchor=self.contig_aln_merge_outer_anchor, + inner_anchor=self.config['validate.contig_aln_merge_inner_anchor'], + outer_anchor=self.config['validate.contig_aln_merge_outer_anchor'], ) read.reference_start = read.reference_start + prefix @@ -876,22 +862,22 @@ def assemble_contig(self, log=DEVNULL): log('assembly size of {} sequences'.format(len(assembly_sequences) // 2)) - kmer_size = self.read_length * self.assembly_kmer_size + kmer_size = self.read_length * self.config['validate.assembly_kmer_size'] remap_min_overlap = max( - self.read_length - self.assembly_min_exact_match_to_remap, kmer_size + self.read_length - self.config['validate.assembly_min_exact_match_to_remap'], kmer_size ) contigs = assemble( assembly_sequences, kmer_size, - min_edge_trim_weight=self.assembly_min_edge_trim_weight, - assembly_max_paths=self.assembly_max_paths, + min_edge_trim_weight=self.config['validate.assembly_min_edge_trim_weight'], + assembly_max_paths=self.config['validate.assembly_max_paths'], min_contig_length=self.read_length, log=log, remap_min_overlap=remap_min_overlap, - remap_min_exact_match=self.assembly_min_exact_match_to_remap, - assembly_min_uniq=self.assembly_min_uniq, - min_complexity=self.min_call_complexity, + remap_min_exact_match=self.config['validate.assembly_min_exact_match_to_remap'], + assembly_min_uniq=self.config['validate.assembly_min_uniq'], + min_complexity=self.config['validate.min_call_complexity'], ) # add the input reads @@ -970,8 +956,8 @@ def assemble_contig(self, log=DEVNULL): for 
contig in sorted(contigs, key=lambda x: (x.remap_score() * -1, x.seq)): # filter on evidence level if ( - contig.remap_score() < self.assembly_min_remapped_seq - or contig.remap_coverage() < self.assembly_min_remap_coverage + contig.remap_score() < self.config['validate.assembly_min_remapped_seq'] + or contig.remap_coverage() < self.config['validate.assembly_min_remap_coverage'] ): continue if self.stranded and self.bam_cache.stranded: @@ -995,7 +981,7 @@ def cache_if_true(read): return True elif any( [ - self.filter_secondary_alignments and read.is_secondary, + self.config['validate.filter_secondary_alignments'] and read.is_secondary, read.mapping_quality < self.min_mapping_quality, ] ): @@ -1028,7 +1014,7 @@ def filter_if_true(read): if not cache_if_true(read): if any( [ - self.filter_secondary_alignments and read.is_secondary, + self.config['validate.filter_secondary_alignments'] and read.is_secondary, read.mapping_quality < self.min_mapping_quality, ] ): @@ -1049,9 +1035,9 @@ def filter_if_true(read): '{0}'.format(self.break1.chr), self.outer_window1[0], self.outer_window1[1], - read_limit=self.fetch_reads_limit, - sample_bins=self.fetch_reads_bins, - min_bin_size=self.fetch_min_bin_size, + read_limit=self.config['validate.fetch_reads_limit'], + sample_bins=self.config['validate.fetch_reads_bins'], + min_bin_size=self.config['validate.fetch_min_bin_size'], cache=True, cache_if=cache_if_true, filter_if=filter_if_true, @@ -1080,9 +1066,9 @@ def filter_if_true(read): '{0}'.format(self.break2.chr), self.outer_window2[0], self.outer_window2[1], - read_limit=self.fetch_reads_limit, - sample_bins=self.fetch_reads_bins, - min_bin_size=self.fetch_min_bin_size, + read_limit=self.config['validate.fetch_reads_limit'], + sample_bins=self.config['validate.fetch_reads_bins'], + min_bin_size=self.config['validate.fetch_min_bin_size'], cache=True, cache_if=cache_if_true, filter_if=filter_if_true, @@ -1132,9 +1118,9 @@ def filter_if_true(read): '{0}'.format(self.break1.chr), 
self.compatible_window1[0], self.compatible_window1[1], - read_limit=self.fetch_reads_limit, - sample_bins=self.fetch_reads_bins, - min_bin_size=self.fetch_min_bin_size, + read_limit=self.config['validate.fetch_reads_limit'], + sample_bins=self.config['validate.fetch_reads_bins'], + min_bin_size=self.config['validate.fetch_min_bin_size'], cache=True, cache_if=cache_if_true, filter_if=filter_if_true, @@ -1146,9 +1132,9 @@ def filter_if_true(read): '{0}'.format(self.break2.chr), self.compatible_window2[0], self.compatible_window2[1], - read_limit=self.fetch_reads_limit, - sample_bins=self.fetch_reads_bins, - min_bin_size=self.fetch_min_bin_size, + read_limit=self.config['validate.fetch_reads_limit'], + sample_bins=self.config['validate.fetch_reads_bins'], + min_bin_size=self.config['validate.fetch_min_bin_size'], cache=True, cache_if=cache_if_true, filter_if=filter_if_true, diff --git a/mavis/validate/call.py b/mavis/validate/call.py index a2f49bfc..c5a8048e 100644 --- a/mavis/validate/call.py +++ b/mavis/validate/call.py @@ -87,7 +87,7 @@ def __init__( event_type, self.compatible_type = self.compatible_type, event_type putative_types = BreakpointPair.classify(self, source_evidence.distance) - self.event_type = SVTYPE.enforce(event_type) + self.data[COLUMNS.event_type] = SVTYPE.enforce(event_type) if event_type not in putative_types | {self.compatible_type}: raise ValueError( 'event_type is not compatible with the breakpoint call', @@ -634,8 +634,9 @@ def _call_by_spanning_reads(source_evidence, consumed_evidence): event = convert_to_duplication(event, source_evidence.reference_genome) if all( [ - event.query_consumption() >= source_evidence.contig_aln_min_query_consumption, - event.score() >= source_evidence.contig_aln_min_score, + event.query_consumption() + >= source_evidence.config['validate.contig_aln_min_query_consumption'], + event.score() >= source_evidence.config['validate.contig_aln_min_score'], ] ): spanning_calls.setdefault(event, set()).add(read) @@ 
-643,7 +644,7 @@ def _call_by_spanning_reads(source_evidence, consumed_evidence): for event, reads in spanning_calls.items(): if any( [ - len(reads) < source_evidence.min_spanning_reads_resolution, + len(reads) < source_evidence.config['validate.min_spanning_reads_resolution'], source_evidence.opposing_strands != event.opposing_strands, ] ): @@ -734,12 +735,15 @@ def call_events(source_evidence): try: call = _call_by_flanking_pairs(source_evidence, event_type, type_consumed_evidence) - if len(call.flanking_pairs) < source_evidence.min_flanking_pairs_resolution: + if ( + len(call.flanking_pairs) + < source_evidence.config['validate.min_flanking_pairs_resolution'] + ): errors.add( 'flanking call ({}) failed to supply the minimum evidence required ({} < {})'.format( event_type, len(call.flanking_pairs), - source_evidence.min_flanking_pairs_resolution, + source_evidence.config['validate.min_flanking_pairs_resolution'], ) ) else: @@ -883,7 +887,7 @@ def _compute_coverage_intervals(pairs): ] else: break - if len(selected_flanking_pairs) < evidence.min_flanking_pairs_resolution: + if len(selected_flanking_pairs) < evidence.config['validate.min_flanking_pairs_resolution']: raise AssertionError( 'insufficient flanking pairs ({}) to call {} by flanking reads'.format( len(selected_flanking_pairs), event_type @@ -940,7 +944,7 @@ def _compute_coverage_intervals(pairs): if call.has_compatible: call.add_flanking_support(evidence.compatible_flanking_pairs, is_compatible=True) - if len(call.flanking_pairs) < evidence.min_flanking_pairs_resolution: + if len(call.flanking_pairs) < evidence.config['validate.min_flanking_pairs_resolution']: raise AssertionError( 'insufficient flanking pairs ({}) to call {} by flanking reads'.format( len(call.flanking_pairs), event_type @@ -973,7 +977,7 @@ def _call_by_split_reads(evidence, event_type, consumed_evidence=None): pass putative_positions = list(pos_dict.keys()) for pos in putative_positions: - if len(pos_dict[pos]) < 
evidence.min_splits_reads_resolution: + if len(pos_dict[pos]) < evidence.config['validate.min_splits_reads_resolution']: del pos_dict[pos] else: count = 0 @@ -982,7 +986,7 @@ def _call_by_split_reads(evidence, event_type, consumed_evidence=None): PYSAM_READ_FLAGS.TARGETED_ALIGNMENT ): count += 1 - if count < evidence.min_non_target_aligned_split_reads: + if count < evidence.config['validate.min_non_target_aligned_split_reads']: del pos_dict[pos] linked_pairings = [] @@ -1000,15 +1004,15 @@ def _call_by_split_reads(evidence, event_type, consumed_evidence=None): links += 1 if (read.query_name, read.query_sequence) in reads: tgt_align += 1 - if links < evidence.min_linking_split_reads: + if links < evidence.config['validate.min_linking_split_reads']: continue deletion_size = second - first - 1 - if tgt_align >= evidence.min_double_aligned_to_estimate_insertion_size: + if tgt_align >= evidence.config['validate.min_double_aligned_to_estimate_insertion_size']: # we can estimate the fragment size - max_insert = evidence.read_length - 2 * evidence.min_softclipping + max_insert = evidence.read_length - 2 * evidence.config['validate.min_softclipping'] if event_type == SVTYPE.INS and max_insert < deletion_size: continue - elif links >= evidence.min_double_aligned_to_estimate_insertion_size: + elif links >= evidence.config['validate.min_double_aligned_to_estimate_insertion_size']: if deletion_size > evidence.max_expected_fragment_size and event_type == SVTYPE.INS: continue @@ -1095,12 +1099,12 @@ def _call_by_split_reads(evidence, event_type, consumed_evidence=None): if not any( [ len(call.break1_split_read_names(both=True)) - < evidence.min_splits_reads_resolution, + < evidence.config['validate.min_splits_reads_resolution'], len(call.break2_split_read_names(both=True)) - < evidence.min_splits_reads_resolution, + < evidence.config['validate.min_splits_reads_resolution'], len(call.break1_split_read_names()) < 1, len(call.break2_split_read_names()) < 1, - linking_reads < 
evidence.min_linking_split_reads, + linking_reads < evidence.config['validate.min_linking_split_reads'], call.event_type != event_type, ] ): diff --git a/mavis/validate/evidence.py b/mavis/validate/evidence.py index 8719210c..f371a72e 100644 --- a/mavis/validate/evidence.py +++ b/mavis/validate/evidence.py @@ -31,8 +31,8 @@ def fetch_reads_limit(self): return self.config['validate.fetch_reads_limit'] def __init__(self, *pos, **kwargs): + kwargs[COLUMNS.protocol] = PROTOCOL.GENOME Evidence.__init__(self, *pos, **kwargs) - self.protocol = PROTOCOL.GENOME self.outer_window1 = self.generate_window(self.break1) self.outer_window2 = self.generate_window(self.break2) @@ -109,9 +109,9 @@ def fetch_reads_limit(self): return self.config['validate.trans_fetch_reads_limit'] def __init__(self, annotations, *pos, **kwargs): + kwargs[COLUMNS.protocol] = PROTOCOL.TRANS Evidence.__init__(self, *pos, **kwargs) - self.protocol = PROTOCOL.TRANS # get the list of overlapping transcripts self.overlapping_transcripts = overlapping_transcripts( annotations, self.break1 diff --git a/mavis/validate/main.py b/mavis/validate/main.py index 65855c37..1136ff81 100644 --- a/mavis/validate/main.py +++ b/mavis/validate/main.py @@ -37,7 +37,6 @@ def main( library: str, config: Dict, start_time=int(time.time()), - **kwargs ): """ Args: @@ -225,11 +224,11 @@ def main( aligner_fa_input_file=contig_aligner_fa, aligner_output_file=contig_aligner_output, clean_files=config['validate.clean_aligner_files'], - aligner=kwargs.get('aligner', config['validate.aligner']), + aligner=config['validate.aligner'], aligner_reference=config['reference.aligner_reference'][0], aligner_output_log=contig_aligner_log, - blat_min_identity=kwargs.get('blat_min_identity', config['validate.blat_min_identity']), - blat_limit_top_aln=kwargs.get('blat_limit_top_aln', config['validate.blat_limit_top_aln']), + blat_min_identity=config['validate.blat_min_identity'], + blat_limit_top_aln=config['validate.blat_limit_top_aln'], log=LOG, 
) for evidence in evidence_clusters: diff --git a/tests/end_to_end/test_convert.py b/tests/end_to_end/test_convert.py index ff58b064..2b110802 100644 --- a/tests/end_to_end/test_convert.py +++ b/tests/end_to_end/test_convert.py @@ -44,7 +44,7 @@ def run_main(self, inputfile, file_type, strand_specific=False): self.assertTrue(unique_exists(outputfile)) result = {} for pair in read_bpp_from_input_file(outputfile): - result.setdefault(pair.tracking_id, []).append(pair) + result.setdefault(pair.data['tracking_id'], []).append(pair) return result def test_chimerascan(self): @@ -58,6 +58,8 @@ def test_delly(self): # test the contents were converted successfully self.assertEqual(1, len(result['delly-DUP00000424'])) bpp = result['delly-DUP00000424'][0] + print(bpp.data) + print(bpp) self.assertEqual(SVTYPE.DUP, bpp.event_type) self.assertEqual('1', bpp.break1.chr) self.assertEqual('1', bpp.break2.chr) diff --git a/tests/integration/test_align.py b/tests/integration/test_align.py index c9590a0a..45c9cb1a 100644 --- a/tests/integration/test_align.py +++ b/tests/integration/test_align.py @@ -2,20 +2,20 @@ import unittest from unittest import mock +import mavis.bam.cigar as _cigar from mavis import align from mavis.annotate.file_io import load_reference_genome from mavis.assemble import Contig from mavis.bam.cache import BamCache -import mavis.bam.cigar as _cigar +from mavis.bam.read import SamRead from mavis.breakpoint import Breakpoint, BreakpointPair -from mavis.constants import CIGAR, ORIENT, reverse_complement, STRAND, SVTYPE +from mavis.constants import CIGAR, ORIENT, STRAND, SVTYPE, reverse_complement from mavis.interval import Interval +from mavis.schemas import DEFAULTS from mavis.validate.evidence import GenomeEvidence -from mavis.validate.constants import DEFAULTS -from mavis.bam.read import SamRead -from . import MockBamFileHandle, MockObject, MockLongString, MockRead from ..util import get_data +from . 
import MockBamFileHandle, MockLongString, MockObject, MockRead REFERENCE_GENOME = None @@ -66,9 +66,11 @@ def test_blat_contigs(self): read_length=40, stdev_fragment_size=25, median_fragment_size=100, - stdev_count_abnormal=2, - min_splits_reads_resolution=1, - min_flanking_pairs_resolution=1, + config={ + 'validate.stdev_count_abnormal': 2, + 'validate.min_splits_reads_resolution': 1, + 'validate.min_flanking_pairs_resolution': 1, + }, ) ev.contigs = [ Contig( @@ -110,9 +112,11 @@ def test_bwa_contigs(self): read_length=40, stdev_fragment_size=25, median_fragment_size=100, - stdev_count_abnormal=2, - min_splits_reads_resolution=1, - min_flanking_pairs_resolution=1, + config={ + 'validate.stdev_count_abnormal': 2, + 'validate.min_splits_reads_resolution': 1, + 'validate.min_flanking_pairs_resolution': 1, + }, ) ev.contigs = [ Contig( @@ -717,13 +721,11 @@ def test_inversion_and_deletion(self): break2=MockObject(orient=ORIENT.RIGHT, chr='3'), contigs=[MockObject(seq=s, alignments=set())], standardize_read=lambda x: x, - contig_aln_max_event_size=DEFAULTS.contig_aln_max_event_size, - contig_aln_merge_inner_anchor=5, - contig_aln_merge_outer_anchor=DEFAULTS.contig_aln_merge_outer_anchor, - contig_aln_min_query_consumption=0.9, - contig_aln_min_extend_overlap=DEFAULTS.contig_aln_min_extend_overlap, - contig_aln_min_anchor_size=DEFAULTS.contig_aln_min_anchor_size, - contig_aln_min_score=DEFAULTS.contig_aln_min_score, + config={ + **DEFAULTS, + 'validate.contig_aln_merge_inner_anchor': 5, + 'validate.contig_aln_min_query_consumption': 0.9, + }, outer_window1=Interval(1000, 1200), outer_window2=Interval(2000, 2200), LR=False, diff --git a/tests/integration/test_assemble.py b/tests/integration/test_assemble.py index e732cfbc..9903f8ff 100644 --- a/tests/integration/test_assemble.py +++ b/tests/integration/test_assemble.py @@ -6,8 +6,8 @@ from mavis.assemble import Contig, assemble, filter_contigs from mavis.constants import reverse_complement from mavis.interval import 
Interval +from mavis.schemas import DEFAULTS from mavis.util import LOG -from mavis.validate.constants import DEFAULTS from ..util import get_data from . import RUN_FULL, MockObject @@ -334,12 +334,12 @@ def test_multiple_events(self): assemblies = assemble( sequences, kmer_size, - min_edge_trim_weight=DEFAULTS.assembly_min_edge_trim_weight, + min_edge_trim_weight=DEFAULTS['validate.assembly_min_edge_trim_weight'], remap_min_match=0.95, remap_min_overlap=75 * 0.9, min_contig_length=75, - remap_min_exact_match=DEFAULTS.assembly_min_exact_match_to_remap, - assembly_max_paths=DEFAULTS.assembly_max_paths, + remap_min_exact_match=DEFAULTS['validate.assembly_min_exact_match_to_remap'], + assembly_max_paths=DEFAULTS['validate.assembly_max_paths'], assembly_min_uniq=0.01, log=self.log, ) @@ -359,17 +359,17 @@ def test_multiple_events(self): def test_large_assembly(self): # simply testing that this will complete before the timeout sequences = self.large_assembly_seq - kmer_size = 150 * DEFAULTS.assembly_kmer_size + kmer_size = 150 * DEFAULTS['validate.assembly_kmer_size'] print('read inputs') contigs = assemble( sequences, kmer_size, - min_edge_trim_weight=DEFAULTS.assembly_min_edge_trim_weight, - assembly_max_paths=DEFAULTS.assembly_max_paths, + min_edge_trim_weight=DEFAULTS['validate.assembly_min_edge_trim_weight'], + assembly_max_paths=DEFAULTS['validate.assembly_max_paths'], min_contig_length=150, log=LOG, remap_min_exact_match=30, - assembly_min_uniq=DEFAULTS.assembly_min_uniq, + assembly_min_uniq=DEFAULTS['validate.assembly_min_uniq'], ) for contig in contigs: print(len(contig.seq), contig.remap_score()) diff --git a/tests/integration/test_breakpoint.py b/tests/integration/test_breakpoint.py index b3cdce42..659cf486 100644 --- a/tests/integration/test_breakpoint.py +++ b/tests/integration/test_breakpoint.py @@ -1,15 +1,14 @@ import unittest +from functools import partial from mavis.annotate.file_io import load_reference_genome from mavis.breakpoint import Breakpoint, 
BreakpointPair -from mavis.constants import CIGAR, ORIENT, reverse_complement, STRAND +from mavis.constants import CIGAR, ORIENT, STRAND, reverse_complement from mavis.interval import Interval from mavis.validate.evidence import TranscriptomeEvidence -from mavis.validate.constants import DEFAULTS -from functools import partial -from . import MockRead, MockObject, get_example_genes from ..util import get_data +from . import MockObject, MockRead, get_example_genes REFERENCE_GENOME = None REF_CHR = 'fake' diff --git a/tests/integration/test_illustrate.py b/tests/integration/test_illustrate.py index 2589cfaa..ce377207 100644 --- a/tests/integration/test_illustrate.py +++ b/tests/integration/test_illustrate.py @@ -1,22 +1,19 @@ +import os import random import unittest -import os +from mavis.annotate import fusion, genomic, protein, variant from mavis.annotate.base import BioInterval from mavis.annotate.file_io import load_templates -from mavis.annotate import genomic -from mavis.annotate import protein -from mavis.annotate import variant -from mavis.annotate import fusion from mavis.breakpoint import Breakpoint, BreakpointPair from mavis.constants import ORIENT, PROTOCOL, STRAND, SVTYPE -from mavis.illustrate.constants import DiagramSettings, DEFAULTS +from mavis.illustrate.constants import DEFAULTS, DiagramSettings from mavis.illustrate.diagram import ( + HEX_BLACK, + HEX_WHITE, draw_multi_transcript_overlay, draw_sv_summary_diagram, generate_interval_mapping, - HEX_BLACK, - HEX_WHITE, ) from mavis.illustrate.elements import draw_genes, draw_legend, draw_template, draw_ustranscript from mavis.illustrate.scatter import ScatterPlot @@ -24,8 +21,8 @@ from mavis.interval import Interval from svgwrite import Drawing -from . import build_transcript, MockObject, MockString, OUTPUT_SVG from ..util import get_data +from . 
import OUTPUT_SVG, MockObject, MockString, build_transcript TEMPLATE_METADATA = None DEFAULTS.domain_name_regex_filter = r'.*' @@ -65,7 +62,7 @@ def test_generate_gene_mapping_err(self): # _generate_interval_mapping [genomic.IntergenicRegion(11:77361962_77361962+)] 1181.39453125 5 30 None 77356962 77366962) ir = genomic.IntergenicRegion('11', 5000, 5000, STRAND.POS) tgt_width = 1000 - d = DiagramSettings() + d = DiagramSettings(domain_name_regex_filter=r'.*') d.gene_min_buffer = 10 # (self, canvas, gene, width, height, fill, label='', reference_genome=None) draw_genes(d, self.canvas, [ir], tgt_width, []) @@ -91,7 +88,7 @@ def test_draw_genes(self): y = genomic.Gene('1', 5000, 7000, strand=STRAND.NEG) z = genomic.Gene('1', 1500, 2500, strand=STRAND.POS) - d = DiagramSettings() + d = DiagramSettings(domain_name_regex_filter=r'.*') breakpoints = [Breakpoint('1', 1100, 1200, orient=ORIENT.RIGHT)] g = draw_genes( d, @@ -121,7 +118,7 @@ def test_draw_genes(self): self.assertEqual(breakpoints[0], g.labels['B1']) def test_draw_ustranscript(self): - d = DiagramSettings() + d = DiagramSettings(domain_name_regex_filter=r'.*') # domains = [protein.Domain()] d1 = protein.Domain('first', [(55, 61), (71, 73)]) d2 = protein.Domain('second', [(10, 20), (30, 34)]) @@ -172,7 +169,7 @@ def test_draw_ustranscript(self): self.assertEqual(d2.name, g.labels['D2']) def test_draw_consec_exons(self): - d = DiagramSettings() + d = DiagramSettings(domain_name_regex_filter=r'.*') # domains = [protein.Domain()] t = build_transcript( gene=None, @@ -209,7 +206,7 @@ def test_dynamic_label_color(self): self.assertEqual(HEX_BLACK, dynamic_label_color(HEX_WHITE)) def test_draw_legend(self): - d = DiagramSettings() + d = DiagramSettings(domain_name_regex_filter=r'.*') swatches = [ ('#000000', 'black'), ('#FF0000', 'red'), @@ -233,7 +230,7 @@ def test_draw_legend(self): ) def test_draw_layout_single_transcript(self): - d = DiagramSettings() + d = DiagramSettings(domain_name_regex_filter=r'.*') d1 = 
protein.Domain('first', [(55, 61), (71, 73)]) d2 = protein.Domain('second', [(10, 20), (30, 34)]) g1 = genomic.Gene('1', 150, 1000, strand=STRAND.POS) @@ -275,7 +272,7 @@ def test_draw_layout_single_transcript(self): self.assertEqual(expected_height, canvas.attribs['height']) def test_draw_layout_single_genomic(self): - d = DiagramSettings() + d = DiagramSettings(domain_name_regex_filter=r'.*') d1 = protein.Domain('first', [(55, 61), (71, 73)]) d2 = protein.Domain('second', [(10, 20), (30, 34)]) g1 = genomic.Gene('1', 150, 1000, strand=STRAND.POS) @@ -339,7 +336,7 @@ def test_draw_layout_single_genomic(self): canvas.saveas('test_draw_layout_single_genomic.svg') def test_draw_layout_translocation(self): - d = DiagramSettings() + d = DiagramSettings(domain_name_regex_filter=r'.*') d1 = protein.Domain('first', [(55, 61), (71, 73)]) d2 = protein.Domain('second', [(10, 20), (30, 34)]) g1 = genomic.Gene('1', 150, 1000, strand=STRAND.POS) @@ -406,7 +403,7 @@ def test_draw_layout_translocation(self): def test_draw_template(self): # def draw_template(self, canvas, template, target_width, height, labels=None, colors=None): - d = DiagramSettings() + d = DiagramSettings(domain_name_regex_filter=r'.*') canvas = Drawing(size=(1000, 50)) t = genomic.Template( '1', @@ -428,7 +425,7 @@ def test_draw_template(self): self.assertEqual(2, len(canvas.elements)) def test_draw_translocation_with_template(self): - d = DiagramSettings() + d = DiagramSettings(domain_name_regex_filter=r'.*') d1 = protein.Domain('PF0001', [(55, 61), (71, 73)]) d2 = protein.Domain('PF0002', [(10, 20), (30, 34)]) g1 = genomic.Gene(TEMPLATE_METADATA['1'], 150, 1000, strand=STRAND.POS, aliases=['HUGO2']) @@ -547,7 +544,7 @@ def test_draw_overlay(self): gene=gene, domains=[], ) - d = DiagramSettings() + d = DiagramSettings(domain_name_regex_filter=r'.*') for i, t in enumerate(gene.transcripts): t.name = 'transcript {}'.format(i + 1) scatterx = [x + 100 for x in range(gene.start, gene.end + 1, 400)] diff --git 
a/tests/integration/test_pairing.py b/tests/integration/test_pairing.py index d1fe8044..3a0064f8 100644 --- a/tests/integration/test_pairing.py +++ b/tests/integration/test_pairing.py @@ -12,7 +12,7 @@ def setUp(self): Breakpoint('1', 1), Breakpoint('1', 10), opposing_strands=True, - data={ + **{ COLUMNS.event_type: SVTYPE.DEL, COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: None, @@ -23,7 +23,7 @@ def setUp(self): Breakpoint('1', 1), Breakpoint('1', 10), opposing_strands=True, - data={ + **{ COLUMNS.event_type: SVTYPE.DEL, COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: None, @@ -100,7 +100,7 @@ def test_mixed_protocol_fusions_same_sequence(self): Breakpoint('1', 1), Breakpoint('1', 10), opposing_strands=True, - data={ + **{ COLUMNS.event_type: SVTYPE.DEL, COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: None, @@ -115,7 +115,7 @@ def test_mixed_protocol_fusions_same_sequence(self): Breakpoint('1', 50), Breakpoint('1', 60), opposing_strands=True, - data={ + **{ COLUMNS.event_type: SVTYPE.DEL, COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: None, @@ -136,7 +136,7 @@ def test_mixed_protocol_fusions_same_sequence_diff_translation(self): Breakpoint('1', 1), Breakpoint('1', 10), opposing_strands=True, - data={ + **{ COLUMNS.event_type: SVTYPE.DEL, COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: 'a', @@ -151,7 +151,7 @@ def test_mixed_protocol_fusions_same_sequence_diff_translation(self): Breakpoint('1', 50), Breakpoint('1', 60), opposing_strands=True, - data={ + **{ COLUMNS.event_type: SVTYPE.DEL, COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: 'a', @@ -169,7 +169,7 @@ def test_mixed_protocol_fusions_different_sequence(self): Breakpoint('1', 1), Breakpoint('1', 10), opposing_strands=True, - data={ + **{ COLUMNS.event_type: SVTYPE.DEL, COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: 
'a', @@ -184,7 +184,7 @@ def test_mixed_protocol_fusions_different_sequence(self): Breakpoint('1', 50), Breakpoint('1', 60), opposing_strands=True, - data={ + **{ COLUMNS.event_type: SVTYPE.DEL, COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: 'b', @@ -202,7 +202,7 @@ def test_mixed_protocol_one_predicted_one_match(self): Breakpoint('1', 350, orient=ORIENT.LEFT), Breakpoint('1', 400, orient=ORIENT.RIGHT), opposing_strands=False, - data={ + **{ COLUMNS.event_type: SVTYPE.DEL, COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: None, @@ -215,7 +215,7 @@ def test_mixed_protocol_one_predicted_one_match(self): Breakpoint('1', 350, orient=ORIENT.LEFT), Breakpoint('1', 400, orient=ORIENT.RIGHT), opposing_strands=False, - data={ + **{ COLUMNS.event_type: SVTYPE.DEL, COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: None, @@ -239,7 +239,7 @@ def test_mixed_protocol_one_predicted_one_mismatch(self): Breakpoint('1', 350, orient=ORIENT.LEFT), Breakpoint('1', 400, orient=ORIENT.RIGHT), opposing_strands=False, - data={ + **{ COLUMNS.event_type: SVTYPE.DEL, COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: None, @@ -252,7 +252,7 @@ def test_mixed_protocol_one_predicted_one_mismatch(self): Breakpoint('1', 350, orient=ORIENT.LEFT), Breakpoint('1', 400, orient=ORIENT.RIGHT), opposing_strands=False, - data={ + **{ COLUMNS.event_type: SVTYPE.DEL, COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: None, diff --git a/tests/integration/test_validate.py b/tests/integration/test_validate.py index a1527541..3458aa68 100644 --- a/tests/integration/test_validate.py +++ b/tests/integration/test_validate.py @@ -1,16 +1,17 @@ import unittest from mavis.annotate.file_io import load_reference_genome +from mavis.bam import cigar as _cigar from mavis.bam.cache import BamCache +from mavis.bam.read import SamRead from mavis.breakpoint import Breakpoint -from mavis.constants import 
ORIENT, PYSAM_READ_FLAGS, NA_MAPPING_QUALITY -from mavis.validate.evidence import GenomeEvidence +from mavis.constants import NA_MAPPING_QUALITY, ORIENT, PYSAM_READ_FLAGS +from mavis.schemas import DEFAULTS from mavis.validate.base import Evidence -from mavis.bam.read import SamRead -from mavis.bam import cigar as _cigar +from mavis.validate.evidence import GenomeEvidence -from . import mock_read_pair, MockRead, RUN_FULL, MockObject, MockLongString from ..util import get_data +from . import RUN_FULL, MockLongString, MockObject, MockRead, mock_read_pair REFERENCE_GENOME = None @@ -57,11 +58,13 @@ def genome_evidence(self, break1, break2, opposing_strands): read_length=125, stdev_fragment_size=100, median_fragment_size=380, - stdev_count_abnormal=3, - min_flanking_pairs_resolution=3, - max_sc_preceeding_anchor=3, - outer_window_min_event_size=0, - min_mapping_quality=20, + config={ + 'validate.stdev_count_abnormal': 3, + 'validate.min_flanking_pairs_resolution': 3, + 'validate.max_sc_preceeding_anchor': 3, + 'validate.outer_window_min_event_size': 0, + 'validate.min_mapping_quality': 20, + }, ) print(ge.min_expected_fragment_size, ge.max_expected_fragment_size) print(ge.break1.chr, ge.outer_window1) @@ -565,9 +568,11 @@ def setUp(self): read_length=125, stdev_fragment_size=100, median_fragment_size=380, - stdev_count_abnormal=3, - min_flanking_pairs_resolution=3, - assembly_min_edge_trim_weight=3, + config={ + 'validate.stdev_count_abnormal': 3, + 'validate.min_flanking_pairs_resolution': 3, + 'validate.assembly_min_edge_trim_weight': 3, + }, ) def test_collect_split_read(self): @@ -738,8 +743,11 @@ def setUp(self): ) }, bam_cache=MockObject(get_read_reference_name=lambda x: x.reference_name), - contig_aln_merge_inner_anchor=10, - contig_aln_merge_outer_anchor=20, + config={ + 'validate.contig_aln_merge_inner_anchor': 10, + 'validate.contig_aln_merge_outer_anchor': 20, + **DEFAULTS, + }, ) def test_bwa_mem(self): diff --git a/tests/integration/test_validate_call.py 
b/tests/integration/test_validate_call.py index a7dcec29..5be1962a 100644 --- a/tests/integration/test_validate_call.py +++ b/tests/integration/test_validate_call.py @@ -4,10 +4,10 @@ from mavis.align import call_paired_read_event, select_contig_alignments from mavis.annotate.file_io import load_reference_genome from mavis.annotate.genomic import PreTranscript, Transcript +from mavis.bam import cigar as _cigar from mavis.bam.cache import BamCache -from mavis.bam.read import sequenced_strand, SamRead, read_pair_type from mavis.bam.cigar import convert_string_to_cigar -from mavis.bam import cigar as _cigar +from mavis.bam.read import SamRead, read_pair_type, sequenced_strand from mavis.breakpoint import Breakpoint, BreakpointPair from mavis.constants import CALL_METHOD, CIGAR, ORIENT, PYSAM_READ_FLAGS, STRAND, SVTYPE from mavis.interval import Interval @@ -15,8 +15,8 @@ from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence, TranscriptomeEvidence -from . import mock_read_pair, MockBamFileHandle, MockRead, get_example_genes, MockLongString from ..util import get_data +from . 
import MockBamFileHandle, MockLongString, MockRead, get_example_genes, mock_read_pair REFERENCE_GENOME = None @@ -448,12 +448,14 @@ def build_genome_evidence(self, b1, b2, opposing_strands=False): read_length=100, median_fragment_size=200, stdev_fragment_size=50, - stdev_count_abnormal=3, - min_flanking_pairs_resolution=1, - min_splits_reads_resolution=1, - min_spanning_reads_resolution=3, - min_linking_split_reads=1, - min_call_complexity=0, + config={ + 'validate.stdev_count_abnormal': 3, + 'validate.min_flanking_pairs_resolution': 1, + 'validate.min_splits_reads_resolution': 1, + 'validate.min_spanning_reads_resolution': 3, + 'validate.min_linking_split_reads': 1, + 'validate.min_call_complexity': 0, + }, ) return evidence @@ -850,12 +852,14 @@ def setUp(self): read_length=40, stdev_fragment_size=25, median_fragment_size=100, - stdev_count_abnormal=2, - min_splits_reads_resolution=1, - min_flanking_pairs_resolution=1, - min_linking_split_reads=1, - min_spanning_reads_resolution=3, - min_call_complexity=0, + config={ + 'validate.stdev_count_abnormal': 2, + 'validate.min_splits_reads_resolution': 1, + 'validate.min_flanking_pairs_resolution': 1, + 'validate.min_linking_split_reads': 1, + 'validate.min_spanning_reads_resolution': 3, + 'validate. min_call_complexity': 0, + }, ) self.dup = GenomeEvidence( Breakpoint('fake', 50, orient=ORIENT.RIGHT), @@ -866,12 +870,14 @@ def setUp(self): read_length=40, stdev_fragment_size=25, median_fragment_size=100, - stdev_count_abnormal=2, - min_splits_reads_resolution=1, - min_flanking_pairs_resolution=1, - min_linking_split_reads=1, - min_spanning_reads_resolution=3, - min_call_complexity=0, + config={ + 'validate.stdev_count_abnormal': 2, + 'validate.min_splits_reads_resolution': 1, + 'validate.min_flanking_pairs_resolution': 1, + 'validate.min_linking_split_reads': 1, + 'validate.min_spanning_reads_resolution': 3, + 'validate. 
min_call_complexity': 0, + }, ) def test_empty(self): @@ -1108,10 +1114,12 @@ def test_call_by_split_reads_consume_flanking(self): read_length=125, stdev_fragment_size=100, median_fragment_size=380, - stdev_count_abnormal=3, - min_flanking_pairs_resolution=1, - min_splits_reads_resolution=1, - min_linking_split_reads=1, + config={ + 'validate.stdev_count_abnormal': 3, + 'validate.min_flanking_pairs_resolution': 1, + 'validate.min_splits_reads_resolution': 1, + 'validate.min_linking_split_reads': 1, + }, ) evidence.split_reads[0].add( MockRead( @@ -1195,9 +1203,11 @@ def setUp(self): read_length=25, stdev_fragment_size=25, median_fragment_size=100, - stdev_count_abnormal=2, - min_flanking_pairs_resolution=1, - min_call_complexity=0, + config={ + 'validate.stdev_count_abnormal': 2, + 'validate.min_flanking_pairs_resolution': 1, + 'validate.min_call_complexity': 0, + }, ) def test_call_coverage_too_large(self): @@ -1423,8 +1433,10 @@ def test_close_to_zero(self): read_length=40, stdev_fragment_size=25, median_fragment_size=180, - stdev_count_abnormal=2, - min_flanking_pairs_resolution=1, + config={ + 'validate.stdev_count_abnormal': 2, + 'validate.min_flanking_pairs_resolution': 1, + }, ) ev.flanking_pairs.add( mock_read_pair( @@ -1475,7 +1487,7 @@ def test_call_with_overlapping_coverage_intervals(self): read_length=150, stdev_fragment_size=98, median_fragment_size=433, - min_flanking_pairs_resolution=1, + config={'validate.min_flanking_pairs_resolution': 1}, ) evidence.flanking_pairs.add( mock_read_pair( @@ -1510,11 +1522,13 @@ def build_transcriptome_evidence(self, b1, b2, opposing_strands=False): read_length=50, stdev_fragment_size=100, median_fragment_size=100, - stdev_count_abnormal=3, - min_splits_reads_resolution=1, - min_flanking_pairs_resolution=1, - strand_determining_read=2, - min_call_complexity=0, + config={ + 'validate.stdev_count_abnormal': 3, + 'validate.min_splits_reads_resolution': 1, + 'validate.min_flanking_pairs_resolution': 1, + 
'validate.strand_determining_read': 2, + 'validate.min_call_complexity': 0, + }, ) def test_call_translocation(self): @@ -1585,8 +1599,10 @@ def test_deletion(self): read_length=40, stdev_fragment_size=25, median_fragment_size=180, - min_flanking_pairs_resolution=1, - min_spanning_reads_resolution=1, + config={ + 'validate.min_flanking_pairs_resolution': 1, + 'validate.min_spanning_reads_resolution': 1, + }, ) print(ev.outer_window1, ev.outer_window2) spanning_reads = [ diff --git a/tests/integration/test_validate_evidence.py b/tests/integration/test_validate_evidence.py index 44b3709a..d4b747af 100644 --- a/tests/integration/test_validate_evidence.py +++ b/tests/integration/test_validate_evidence.py @@ -1,19 +1,18 @@ -from functools import partial import unittest +from functools import partial -from mavis.annotate.genomic import Gene, Transcript, PreTranscript +from mavis.annotate.genomic import Gene, PreTranscript, Transcript +from mavis.bam import cigar as _cigar from mavis.bam.cache import BamCache from mavis.bam.read import SamRead -from mavis.bam import cigar as _cigar from mavis.breakpoint import Breakpoint, BreakpointPair from mavis.constants import CIGAR, ORIENT, STRAND from mavis.interval import Interval -from mavis.validate.constants import DEFAULTS +from mavis.schemas import DEFAULTS from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence, TranscriptomeEvidence -from . import mock_read_pair, MockBamFileHandle, MockRead, MockObject - +from . 
import MockBamFileHandle, MockObject, MockRead, mock_read_pair REFERENCE_GENOME = None @@ -173,7 +172,7 @@ def setUp(self): read_length=self.read_length, stdev_fragment_size=100, median_fragment_size=100, - stdev_count_abnormal=1, + config={'validate.stdev_count_abnormal': 1}, ) self.genomic_ev = GenomeEvidence( b1, @@ -184,7 +183,7 @@ def setUp(self): read_length=self.read_length, stdev_fragment_size=100, median_fragment_size=100, - stdev_count_abnormal=1, + config={'validate.stdev_count_abnormal': 1}, ) def test_genomic_vs_trans_no_annotations(self): @@ -366,14 +365,17 @@ def setUp(self): self.pre_transcript.transcripts.append(Transcript(self.pre_transcript, spl)) self.annotations = {gene.chr: [gene]} self.genome_evidence = MockObject( - annotations={}, read_length=100, max_expected_fragment_size=550, call_error=11 + annotations={}, + read_length=100, + max_expected_fragment_size=550, + config={**DEFAULTS, 'validate.call_error': 11}, ) self.trans_evidence = MockObject( annotations={}, read_length=100, max_expected_fragment_size=550, - call_error=11, overlapping_transcripts={self.pre_transcript}, + config={**DEFAULTS, 'validate.call_error': 11}, ) setattr( self.trans_evidence, @@ -506,7 +508,12 @@ class TestGenomeEvidenceWindow(unittest.TestCase): def test_orient_ns(self): bpp = Breakpoint(chr='1', start=1000, end=1000, orient=ORIENT.NS) window = GenomeEvidence.generate_window( - MockObject(read_length=100, max_expected_fragment_size=550, call_error=11), bpp + MockObject( + read_length=100, + max_expected_fragment_size=550, + config={**DEFAULTS, 'validate.call_error': 11}, + ), + bpp, ) self.assertEqual(440, window.start) self.assertEqual(1560, window.end) @@ -515,7 +522,12 @@ def test_orient_ns(self): def test_orient_left(self): bpp = Breakpoint(chr='1', start=1000, end=1000, orient=ORIENT.LEFT) window = GenomeEvidence.generate_window( - MockObject(read_length=100, call_error=11, max_expected_fragment_size=550), bpp + MockObject( + read_length=100, + 
max_expected_fragment_size=550, + config={**DEFAULTS, 'validate.call_error': 11}, + ), + bpp, ) self.assertEqual(440, window.start) self.assertEqual(1110, window.end) @@ -524,7 +536,12 @@ def test_orient_left(self): def test_orient_right(self): bpp = Breakpoint(chr='1', start=1000, end=1000, orient=ORIENT.RIGHT) window = GenomeEvidence.generate_window( - MockObject(read_length=100, call_error=11, max_expected_fragment_size=550), bpp + MockObject( + read_length=100, + max_expected_fragment_size=550, + config={**DEFAULTS, 'validate.call_error': 11}, + ), + bpp, ) self.assertEqual(890, window.start) self.assertEqual(1560, window.end) @@ -540,8 +557,7 @@ def test_window_accessors(self): read_length=150, stdev_fragment_size=500, median_fragment_size=100, - call_error=0, - stdev_count_abnormal=1, + config={'validate.stdev_count_abnormal': 1, 'validate.call_error': 0}, ) self.assertEqual(901, ge.outer_window1.start) self.assertEqual(1649, ge.outer_window1.end) @@ -565,8 +581,7 @@ def setUp(self): read_length=150, stdev_fragment_size=500, median_fragment_size=100, - call_error=0, - stdev_count_abnormal=1, + config={'validate.stdev_count_abnormal': 1, 'validate.call_error': 0}, ) # outer windows (901, 1649) (5852, 6600) # inner windows (1351, 1649) (5852, 6150) diff --git a/tests/setup_subprocess_cov.py b/tests/setup_subprocess_cov.py new file mode 100644 index 00000000..c14c3359 --- /dev/null +++ b/tests/setup_subprocess_cov.py @@ -0,0 +1,10 @@ +import os +import sys + +for p in sys.path: + if p.endswith('site-packages'): + pth_file = os.path.join(p, 'subprocess-coverage.pth') + print('writing path file:', pth_file) + with open(pth_file, 'w') as fh: + fh.write('import coverage\n\ncoverage.process_startup()\n') + break diff --git a/tests/unit/test_breakpoint.py b/tests/unit/test_breakpoint.py index a76770c0..56bdb9da 100644 --- a/tests/unit/test_breakpoint.py +++ b/tests/unit/test_breakpoint.py @@ -199,27 +199,6 @@ def test___init__invalid_inter_lr_opp(self): 
opposing_strands=True, ) - def test_accessing_data_attributes(self): - bp1 = Breakpoint(1, 1, 2, ORIENT.LEFT) - bp2 = Breakpoint(2, 1, 2, ORIENT.LEFT) - bpp = BreakpointPair(bp1, bp2, opposing_strands=True) - bpp.data['a'] = 1 - print(bpp.data) - self.assertEqual(1, bpp.a) - with self.assertRaises(AttributeError): - bpp.random_attr - - with self.assertRaises(AttributeError): - bpp.call_method - - bpp.data[COLUMNS.call_method] = 1 - print(bpp.data) - self.assertEqual(1, bpp.call_method) - - COLUMNS.call_method = 'bbreak2_call_method' - bpp.data[COLUMNS.call_method] = 2 - self.assertEqual(2, bpp.call_method) - class TestClassifyBreakpointPair(unittest.TestCase): def test_inverted_translocation(self): diff --git a/tests/unit/test_summary.py b/tests/unit/test_summary.py index de2760a3..f2a81ef8 100644 --- a/tests/unit/test_summary.py +++ b/tests/unit/test_summary.py @@ -11,27 +11,27 @@ def setUp(self): Breakpoint('1', 1), Breakpoint('1', 10), opposing_strands=True, - data={ + **{ COLUMNS.event_type: SVTYPE.DEL, COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: None, COLUMNS.protocol: PROTOCOL.GENOME, COLUMNS.fusion_cdna_coding_end: None, COLUMNS.fusion_cdna_coding_start: None, - }, + } ) self.gev2 = BreakpointPair( Breakpoint('1', 1), Breakpoint('1', 100), opposing_strands=True, - data={ + **{ COLUMNS.event_type: SVTYPE.DEL, COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: None, COLUMNS.protocol: PROTOCOL.GENOME, COLUMNS.fusion_cdna_coding_start: None, COLUMNS.fusion_cdna_coding_end: None, - }, + } ) self.best_transcripts = {'ABCA': True, 'ABCD': True} diff --git a/tests/unit/test_tool.py b/tests/unit/test_tool.py index 17228569..fffaa36a 100644 --- a/tests/unit/test_tool.py +++ b/tests/unit/test_tool.py @@ -1,12 +1,9 @@ import unittest from mavis.constants import COLUMNS, ORIENT, STRAND, SVTYPE -from mavis.tools import ( - _convert_tool_row, - SUPPORTED_TOOL, - _parse_transabyss, -) -from mavis.tools.vcf import 
parse_bnd_alt as _parse_bnd_alt, convert_record as _parse_vcf_record +from mavis.tools import SUPPORTED_TOOL, _convert_tool_row, _parse_transabyss +from mavis.tools.vcf import convert_record as _parse_vcf_record +from mavis.tools.vcf import parse_bnd_alt as _parse_bnd_alt from .mock import Mock @@ -285,8 +282,8 @@ def test_convert_deletion(self): self.assertEqual(9412400, bpp.break2.start) self.assertEqual(9412404, bpp.break2.end) self.assertEqual('21', bpp.break2.chr) - print(bpp, bpp.tracking_id) - self.assertEqual('manta-MantaDEL:20644:0:2:0:0:0', bpp.tracking_id) + print(bpp, bpp.data['tracking_id']) + self.assertEqual('manta-MantaDEL:20644:0:2:0:0:0', bpp.data['tracking_id']) def test_convert_duplication(self): row = Mock( @@ -302,7 +299,7 @@ def test_convert_duplication(self): bpp = bpp_list[0] self.assertEqual('1', bpp.break1.chr) self.assertEqual('1', bpp.break2.chr) - self.assertEqual('manta-MantaDUP:TANDEM:22477:0:1:0:9:0', bpp.tracking_id) + self.assertEqual('manta-MantaDUP:TANDEM:22477:0:1:0:9:0', bpp.data['tracking_id']) def test_non_trans_bnd(self): row = Mock( @@ -330,7 +327,7 @@ def test_non_trans_bnd(self): self.assertEqual(234912188, bpp.break2.start) self.assertEqual('R', bpp.break1.orient) self.assertEqual('R', bpp.break2.orient) - self.assertEqual('manta-MantaBND:207:0:1:0:0:0:0', bpp.tracking_id) + self.assertEqual('manta-MantaBND:207:0:1:0:0:0:0', bpp.data['tracking_id']) self.assertEqual(1, len(bpp_list)) def test_non_trans_bnd_from_mate(self): @@ -359,7 +356,7 @@ def test_non_trans_bnd_from_mate(self): self.assertEqual(234912188, bpp.break2.start) self.assertEqual('R', bpp.break1.orient) self.assertEqual('R', bpp.break2.orient) - self.assertEqual('manta-MantaBND:207:0:1:0:0:0:1', bpp.tracking_id) + self.assertEqual('manta-MantaBND:207:0:1:0:0:0:1', bpp.data['tracking_id']) self.assertEqual(1, len(bpp_list)) @@ -386,7 +383,7 @@ def test_convert_inverted_translocation(self): self.assertEqual(ORIENT.RIGHT, bpp.break1.orient) 
self.assertEqual(ORIENT.LEFT, bpp.break2.orient) self.assertEqual(False, bpp.stranded) - self.assertEqual('defuse-1', bpp.tracking_id) + self.assertEqual('defuse-1', bpp.data['tracking_id']) def test_convert_translocation(self): row = { @@ -410,7 +407,7 @@ def test_convert_translocation(self): self.assertEqual(ORIENT.LEFT, bpp.break1.orient) self.assertEqual(ORIENT.LEFT, bpp.break2.orient) self.assertEqual(False, bpp.stranded) - self.assertEqual('defuse-1', bpp.tracking_id) + self.assertEqual('defuse-1', bpp.data['tracking_id']) def test_convert_indel(self): row = { @@ -434,7 +431,7 @@ def test_convert_indel(self): self.assertEqual(ORIENT.LEFT, bpp.break1.orient) self.assertEqual(ORIENT.RIGHT, bpp.break2.orient) self.assertEqual(False, bpp.stranded) - self.assertEqual('defuse-1', bpp.tracking_id) + self.assertEqual('defuse-1', bpp.data['tracking_id']) def test_convert_inversion(self): row = { @@ -458,7 +455,7 @@ def test_convert_inversion(self): self.assertEqual(ORIENT.LEFT, bpp.break1.orient) self.assertEqual(ORIENT.LEFT, bpp.break2.orient) self.assertEqual(False, bpp.stranded) - self.assertEqual('defuse-1', bpp.tracking_id) + self.assertEqual('defuse-1', bpp.data['tracking_id']) class TestChimerascan(unittest.TestCase): From 559de2ab17cc8027d5cc2db86fbe3e9530a9e571 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 19 Apr 2021 23:39:05 -0700 Subject: [PATCH 006/137] Add type annotations --- mavis/annotate/base.py | 8 ++++ mavis/annotate/variant.py | 91 +++++++++++++++++++++++--------------- mavis/pairing/main.py | 15 ++++--- mavis/pairing/pairing.py | 26 +++++++---- mavis/summary/main.py | 17 +++---- mavis/summary/summary.py | 11 +++-- mavis/validate/base.py | 41 +++++++++++------ mavis/validate/call.py | 30 +++++++------ mavis/validate/evidence.py | 18 +------- 9 files changed, 150 insertions(+), 107 deletions(-) diff --git a/mavis/annotate/base.py b/mavis/annotate/base.py index e174ad25..bf0dea8f 100644 --- a/mavis/annotate/base.py +++ 
b/mavis/annotate/base.py @@ -1,4 +1,5 @@ import re +from typing import Any, Dict, Optional from ..constants import STRAND from ..interval import Interval @@ -49,6 +50,13 @@ def __le__(self, other): class BioInterval: + position: Interval + name: Optional[str] + data: Dict + seq: Optional[str] + reference_object: Any + strand: Optional[str] + def __init__( self, reference_object, start, end=None, name=None, seq=None, data=None, strand=None ): diff --git a/mavis/annotate/variant.py b/mavis/annotate/variant.py index 64b3732e..0a81f34b 100644 --- a/mavis/annotate/variant.py +++ b/mavis/annotate/variant.py @@ -1,14 +1,16 @@ import itertools import json +from typing import Callable, Dict, List, Optional, Set, Tuple, Union + from shortuuid import uuid -from .fusion import determine_prime, FusionTranscript -from .genomic import IntergenicRegion from ..breakpoint import Breakpoint, BreakpointPair from ..constants import COLUMNS, GENE_PRODUCT_TYPE, PROTOCOL, STOP_AA, STRAND, SVTYPE from ..error import NotSpecifiedError from ..interval import Interval from ..util import DEVNULL +from .fusion import FusionTranscript, determine_prime +from .genomic import Gene, IntergenicRegion, PreTranscript, Transcript class Annotation(BreakpointPair): @@ -17,8 +19,22 @@ class Annotation(BreakpointPair): will also hold the other annotations for overlapping and encompassed and nearest genes """ + encompassed_genes: Set[Gene] + genes_proximal_to_break1: Set[Gene] + genes_proximal_to_break2: Set[Gene] + genes_overlapping_break1: Set[Gene] + genes_overlapping_break2: Set[Gene] + proximity: int + fusion: Optional[FusionTranscript] + transcript1: Optional[Transcript] + transcript2: Optional[Transcript] + + @property + def validation_id(self) -> Optional[str]: + return self.data.get(COLUMNS.validation_id) + def __init__( - self, bpp, transcript1=None, transcript2=None, proximity=5000, data=None, **kwargs + self, bpp: BreakpointPair, transcript1=None, transcript2=None, proximity=5000, **kwargs ): """ 
Holds a breakpoint call and a set of transcripts, other information is gathered relative to these @@ -27,8 +43,6 @@ def __init__( bpp (BreakpointPair): the breakpoint pair call. Will be adjusted and then stored based on the transcripts transcript1 (Transcript): transcript at the first breakpoint transcript2 (Transcript): Transcript at the second breakpoint - data (dict): optional dictionary to hold related attributes - event_type (SVTYPE): the type of event """ # narrow the breakpoint windows by the transcripts being used for annotation temp = bpp.break1 if transcript1 is None else bpp.break1 & transcript1 @@ -528,15 +542,19 @@ def overlapping_transcripts(ref_ann, breakpoint): return putative_annotations -def _gather_breakpoint_annotations(ref_ann, breakpoint): +def _gather_breakpoint_annotations( + ref_ann: Dict[str, List[Gene]], breakpoint: Breakpoint +) -> Tuple[ + List[Union[PreTranscript, IntergenicRegion]], List[Union[PreTranscript, IntergenicRegion]] +]: """ Args: - ref_ann (Dict[str,List[Gene]]): the reference annotations split + ref_ann: the reference annotations split into lists of genes by chromosome - breakpoint (Breakpoint): the breakpoint annotations are to be gathered for + breakpoint: the breakpoint annotations are to be gathered for Returns: - Tuple[List[Union[PreTranscript,IntergenicRegion]],List[Union[PreTranscript,IntergenicRegion]]]: + transcripts: - transcripts or intergenic regions overlapping the breakpoint on the positive strand - transcripts or intergenic regions overlapping the breakpoint on the negative strand @@ -618,16 +636,15 @@ def _gather_breakpoint_annotations(ref_ann, breakpoint): ) -def _gather_annotations(ref, bp, proximity=None): +def _gather_annotations(ref: Dict[str, List[Gene]], bp: BreakpointPair, proximity=None): """ each annotation is defined by the annotations selected at the breakpoints the other annotations are given relative to this the annotation at the breakpoint can be a transcript or an intergenic region Args: - 
ref (Dict[str,List[Gene]]): the list of reference genes hashed - by chromosomes - breakpoint_pairs (List[BreakpointPair]): breakpoint pairs we wish to annotate as events + ref: the list of reference genes hashedby chromosomes + breakpoint_pairs: breakpoint pair we wish to annotate as events Returns: List[Annotation]: The annotations @@ -636,7 +653,9 @@ def _gather_annotations(ref, bp, proximity=None): break1_pos, break1_neg = _gather_breakpoint_annotations(ref, bp.break1) break2_pos, break2_neg = _gather_breakpoint_annotations(ref, bp.break2) - combinations = [] + combinations: List[ + Tuple[Union[PreTranscript, IntergenicRegion], Union[PreTranscript, IntergenicRegion]] + ] = [] if bp.stranded: if bp.break1.strand == STRAND.POS: @@ -653,7 +672,7 @@ def _gather_annotations(ref, bp, proximity=None): # single transcript starts .... for t in (set(break1_pos) | set(break1_neg)) & (set(break2_pos) | set(break2_neg)): try: - t.gene + t.gene # type: ignore except AttributeError: pass else: @@ -682,7 +701,7 @@ def _gather_annotations(ref, bp, proximity=None): if (a1, a2) in annotations: # ignore duplicates continue try: - if a1.gene == a2.gene and a1 != a2: + if a1.gene == a2.gene and a1 != a2: # type: ignore continue except AttributeError: pass @@ -719,7 +738,7 @@ def _gather_annotations(ref, bp, proximity=None): return filtered -def choose_more_annotated(ann_list): +def choose_more_annotated(ann_list: List[Annotation]) -> List[Annotation]: """ for a given set of annotations if there are annotations which contain transcripts and annotations that are simply intergenic regions, discard the intergenic region annotations @@ -729,18 +748,18 @@ def choose_more_annotated(ann_list): that land in the intergenic region Args: - ann_list (List[Annotation]): list of input annotations + ann_list: list of input annotations Warning: input annotations are assumed to be the same event (the same validation_id) the logic used would not apply to different events Returns: - List[Annotation]: 
the filtered list + the filtered list """ - two_transcript = [] - one_transcript = [] - intergenic = [] + two_transcript: List[Annotation] = [] + one_transcript: List[Annotation] = [] + intergenic: List[Annotation] = [] for ann in ann_list: if isinstance(ann.transcript1, IntergenicRegion) and isinstance( @@ -762,7 +781,7 @@ def choose_more_annotated(ann_list): return intergenic -def choose_transcripts_by_priority(ann_list): +def choose_transcripts_by_priority(ann_list: List[Annotation]): """ for each set of annotations with the same combinations of genes, choose the annotation with the most "best_transcripts" or most "alphanumeric" choices @@ -778,8 +797,10 @@ def choose_transcripts_by_priority(ann_list): Returns: List[Annotation]: the filtered list """ - annotations_by_gene_combination = {} - genes = set() + annotations_by_gene_combination: Dict[ + Tuple[Optional[Gene], Optional[Gene]], List[Annotation] + ] = {} + genes: Set[Gene] = set() for ann in ann_list: gene1 = None @@ -822,16 +843,16 @@ def choose_transcripts_by_priority(ann_list): def annotate_events( - bpps, - annotations, - reference_genome, - max_proximity=5000, - min_orf_size=200, - min_domain_mapping_match=0.95, - max_orf_cap=3, - log=DEVNULL, - filters=None, -): + bpps: List[BreakpointPair], + annotations: Dict[str, List[Gene]], + reference_genome: Dict[str, str], + max_proximity: int = 5000, + min_orf_size: int = 200, + min_domain_mapping_match: float = 0.95, + max_orf_cap: int = 3, + log: Callable = DEVNULL, + filters: List[Callable] = None, +) -> List[Annotation]: """ Args: bpps (List[mavis.breakpoint.BreakpointPair]): list of events diff --git a/mavis/pairing/main.py b/mavis/pairing/main.py index 8a6dbe1a..1e332002 100644 --- a/mavis/pairing/main.py +++ b/mavis/pairing/main.py @@ -1,10 +1,11 @@ import itertools import os import time -from typing import Dict, List +from typing import Dict, List, Set, Tuple from ..annotate.constants import SPLICE_TYPE from ..annotate.file_io import ReferenceFile 
+from ..breakpoint import BreakpointPair from ..constants import CALL_METHOD, COLUMNS, PROTOCOL, SVTYPE from ..util import LOG, generate_complete_stamp, output_tabbed_file, read_inputs from .pairing import inferred_equivalent, pair_by_distance, product_key @@ -70,9 +71,9 @@ def main( reference_transcripts[unspliced_t.name] = unspliced_t # map the calls by library and ensure there are no name/key conflicts - calls_by_cat = dict() - calls_by_ann = dict() - bpp_by_product_key = dict() + calls_by_cat: Dict[Tuple[str, str, bool, str], List[BreakpointPair]] = dict() + calls_by_ann: Dict[Tuple[str, str], List[BreakpointPair]] = dict() + bpp_by_product_key: Dict[str, BreakpointPair] = dict() libraries = set() # initialize the pairing mappings @@ -100,8 +101,8 @@ def main( ) bpp_by_product_key[product_key(bpp)] = bpp - distance_pairings = {} - product_pairings = {} + distance_pairings: Dict[str, Set[str]] = {} + product_pairings: Dict[str, Set[str]] = {} LOG('computing distance based pairings') # pairwise comparison of breakpoints between all libraries for set_num, (category, calls) in enumerate( @@ -117,7 +118,7 @@ def main( LOG('computing inferred (by product) pairings') for calls in calls_by_ann.values(): - calls_by_lib = {} + calls_by_lib: Dict[str, List[BreakpointPair]] = {} for call in calls: calls_by_lib.setdefault(call.library, []).append(call) diff --git a/mavis/pairing/pairing.py b/mavis/pairing/pairing.py index 316598c2..f1fb52b5 100644 --- a/mavis/pairing/pairing.py +++ b/mavis/pairing/pairing.py @@ -1,5 +1,7 @@ +from typing import Callable, Dict, List, Optional, Set + from ..annotate.variant import determine_prime -from ..breakpoint import Breakpoint +from ..breakpoint import Breakpoint, BreakpointPair from ..constants import CALL_METHOD, COLUMNS, ORIENT, PRIME, PROTOCOL, STRAND from ..error import NotSpecifiedError from ..interval import Interval @@ -7,7 +9,7 @@ from .constants import PAIRING_DISTANCES -def product_key(bpp): +def product_key(bpp: 
BreakpointPair) -> str: """ unique id for the product row """ @@ -26,7 +28,7 @@ def product_key(bpp): ) -def predict_transcriptome_breakpoint(breakpoint, transcript): +def predict_transcriptome_breakpoint(breakpoint: Breakpoint, transcript): """ for a given genomic breakpoint and the target transcript. Predicts the possible transcriptomic breakpoints that would be expected based on the splicing model for abrogated splice sites @@ -121,7 +123,7 @@ def predict_transcriptome_breakpoint(breakpoint, transcript): return sorted(tbreaks) -def _equivalent_events(event1, event2): +def _equivalent_events(event1: BreakpointPair, event2: BreakpointPair) -> bool: # basic checks if any( [ @@ -138,7 +140,9 @@ def _equivalent_events(event1, event2): return True -def comparison_distance(event1, event2, input_distances=None): +def comparison_distance( + event1: BreakpointPair, event2: BreakpointPair, input_distances: Optional[Dict] = None +) -> int: distances = {} distances.update(PAIRING_DISTANCES.items()) if input_distances is not None: @@ -150,7 +154,7 @@ def comparison_distance(event1, event2, input_distances=None): return max_distance -def equivalent(event1, event2, distances=None): +def equivalent(event1: BreakpointPair, event2: BreakpointPair, distances=None) -> bool: """ compares two events by breakpoint position to see if they are equivalent """ @@ -178,11 +182,13 @@ def equivalent(event1, event2, distances=None): return True -def pair_by_distance(calls, distances, log=DEVNULL, against_self=False): +def pair_by_distance( + calls: List[BreakpointPair], distances, log: Callable = DEVNULL, against_self: bool = False +) -> Dict[str, Set[str]]: """ for a set of input calls, pair by distance """ - distance_pairings = {} + distance_pairings: Dict[str, Set[str]] = {} break1_sorted = sorted(calls, key=lambda b: b.break1.start) break2_sorted = sorted(calls, key=lambda b: b.break2.start) lowest_resolution = max([len(b.break1) for b in calls] + [len(b.break2) for b in calls] + [1]) @@ 
-239,7 +245,9 @@ def pair_by_distance(calls, distances, log=DEVNULL, against_self=False): return distance_pairings -def inferred_equivalent(event1, event2, reference_transcripts, distances=None): +def inferred_equivalent( + event1: BreakpointPair, event2: BreakpointPair, reference_transcripts: Dict, distances=None +) -> bool: """ comparison of events using product prediction and breakpoint prediction """ diff --git a/mavis/summary/main.py b/mavis/summary/main.py index 7e46f22d..c34cb5e8 100644 --- a/mavis/summary/main.py +++ b/mavis/summary/main.py @@ -2,11 +2,12 @@ import re import time from functools import partial -from typing import Dict, List +from typing import Dict, List, Tuple import tab from ..annotate.file_io import ReferenceFile +from ..breakpoint import BreakpointPair from ..constants import CALL_METHOD, COLUMNS, PROTOCOL, SVTYPE from ..util import LOG, generate_complete_stamp, output_tabbed_file, read_inputs, soft_cast from .constants import HOMOPOLYMER_MIN_LENGTH @@ -184,7 +185,7 @@ def main(inputs: List[str], output: str, config: Dict, start_time=int(time.time( pair.data[COLUMNS.filter_comment] = 'low evidence' filtered_pairs.append(pair) - bpps_by_library = {} # split the input pairs by library + bpps_by_library: Dict[str, List[BreakpointPair]] = {} # split the input pairs by library libraries = {} for bpp in bpps: bpps_by_library.setdefault(bpp.library, []).append(bpp) @@ -192,12 +193,12 @@ def main(inputs: List[str], output: str, config: Dict, start_time=int(time.time( # collapse identical calls with different call methods for library in bpps_by_library: - uncollapsed = dict() + uncollapsed: Dict[Tuple, List[BreakpointPair]] = dict() for bpp in bpps_by_library[library]: - group = ( + group: Tuple[BreakpointPair, str, str, str, str, int, int] = ( bpp, - bpp.transcript1, - bpp.transcript2, + bpp.data.get(COLUMNS.transcript1), + bpp.data.get(COLUMNS.transcript2), bpp.fusion_sequence_fasta_id, bpp.fusion_splicing_pattern, bpp.fusion_cdna_coding_start, 
@@ -242,8 +243,8 @@ def main(inputs: List[str], output: str, config: Dict, start_time=int(time.time( bpp.opposing_strands, bpp.break1.strand, bpp.break2.strand, - bpp.transcript1 if bpp.gene1 else None, - bpp.transcript2 if bpp.gene2 else None, + bpp.data.get(COLUMNS.transcript1) if bpp.data.get(COLUMNS.gene1) else None, + bpp.data.get(COLUMNS.transcript2) if bpp.data.get(COLUMNS.gene2) else None, bpp.fusion_sequence_fasta_id, # id is a hash of the sequence bpp.fusion_cdna_coding_start, bpp.fusion_cdna_coding_end, diff --git a/mavis/summary/summary.py b/mavis/summary/summary.py index a3ab1ecc..67ef7420 100644 --- a/mavis/summary/summary.py +++ b/mavis/summary/summary.py @@ -1,16 +1,19 @@ -from .constants import PAIRING_STATE +from typing import Dict, List + +from ..annotate.genomic import Transcript from ..breakpoint import Breakpoint, BreakpointPair from ..constants import CALL_METHOD, COLUMNS, DISEASE_STATUS, PROTOCOL, SVTYPE from ..interval import Interval from ..pairing.pairing import pair_by_distance, product_key from ..util import get_connected_components +from .constants import PAIRING_STATE -def filter_by_annotations(bpp_list, best_transcripts): +def filter_by_annotations(bpp_list: List[BreakpointPair], best_transcripts: Dict[str, Transcript]): """ Args: - bpp_list (List[BreakpointPair]): list of pairs to filter - best_transcripts (Dict[str,Transcript]): the best transcripts of the annotations + bpp_list: list of pairs to filter + best_transcripts: the best transcripts of the annotations based on their names """ diff --git a/mavis/validate/base.py b/mavis/validate/base.py index 3b2da0aa..e6767d3a 100644 --- a/mavis/validate/base.py +++ b/mavis/validate/base.py @@ -10,8 +10,17 @@ from ..bam import read as _read from ..bam.cache import BamCache from ..breakpoint import Breakpoint, BreakpointPair -from ..constants import (CIGAR, COLUMNS, NA_MAPPING_QUALITY, ORIENT, PROTOCOL, - PYSAM_READ_FLAGS, STRAND, SVTYPE, reverse_complement) +from ..constants import ( + 
CIGAR, + COLUMNS, + NA_MAPPING_QUALITY, + ORIENT, + PROTOCOL, + PYSAM_READ_FLAGS, + STRAND, + SVTYPE, + reverse_complement, +) from ..error import NotSpecifiedError from ..interval import Interval from ..schemas import DEFAULTS @@ -19,23 +28,29 @@ class Evidence(BreakpointPair): + assembly_max_kmer_size: int bam_cache: BamCache classification: Optional[str] - reference_genome: Dict - read_length: int - stdev_fragment_size: int - median_fragment_size: int - split_reads: Tuple[Set, Set] - flanking_pairs: Set compatible_flanking_pairs: Set - spanning_reads: Set - counts: List[int] - contigs: List - half_mapped: Tuple[Set, Set] compatible_window1: Optional[Interval] compatible_window2: Optional[Interval] config: Dict - assenmbly_max_kmer_size: int + contigs: List + counts: List[int] + flanking_pairs: Set + half_mapped: Tuple[Set, Set] + median_fragment_size: int + read_length: int + reference_genome: Dict + spanning_reads: Set + split_reads: Tuple[Set, Set] + stdev_fragment_size: int + strand_determining_read: int + # abstract properties + inner_window1: Interval + inner_window2: Interval + outer_window1: Interval + outer_window2: Interval @property def min_expected_fragment_size(self): diff --git a/mavis/validate/call.py b/mavis/validate/call.py index c5a8048e..002534a3 100644 --- a/mavis/validate/call.py +++ b/mavis/validate/call.py @@ -1,9 +1,10 @@ import itertools import math import statistics -from typing import Optional, Set +from typing import List, Optional, Set from ..align import SplitAlignment, call_paired_read_event, call_read_events, convert_to_duplication +from ..assemble import Contig from ..bam import read as _read from ..breakpoint import Breakpoint, BreakpointPair from ..constants import ( @@ -16,6 +17,7 @@ reverse_complement, ) from ..interval import Interval +from ..validate.base import Evidence class EventCall(BreakpointPair): @@ -30,9 +32,9 @@ class for holding evidence and the related calls since we can't freeze the evide break1_split_reads: Set 
break2_split_reads: Set compatible_flanking_pairs: Set - compatible_type: str - contig: Optional - contig_alignment: Optional + compatible_type: Optional[str] + contig: Optional[Contig] + contig_alignment: Optional[SplitAlignment] @property def has_compatible(self): @@ -40,14 +42,14 @@ def has_compatible(self): def __init__( self, - b1, - b2, - source_evidence, - event_type, - call_method, - contig=None, - contig_alignment=None, - untemplated_seq=None, + b1: Breakpoint, + b2: Breakpoint, + source_evidence: Evidence, + event_type: str, + call_method: str, + contig: Optional[Contig] = None, + contig_alignment: Optional[SplitAlignment] = None, + untemplated_seq: Optional[str] = None, ): """ Args: @@ -624,7 +626,7 @@ def filter_consumed_pairs(pairs, consumed_reads): return temp -def _call_by_spanning_reads(source_evidence, consumed_evidence): +def _call_by_spanning_reads(source_evidence: Evidence, consumed_evidence): spanning_calls = {} available_flanking_pairs = filter_consumed_pairs( source_evidence.flanking_pairs, consumed_evidence @@ -695,7 +697,7 @@ def _call_by_spanning_reads(source_evidence, consumed_evidence): return filtered_events -def call_events(source_evidence): +def call_events(source_evidence) -> List[EventCall]: """ generates a set of event calls based on the evidence associated with the source_evidence object will also narrow down the event type diff --git a/mavis/validate/evidence.py b/mavis/validate/evidence.py index f371a72e..a689170c 100644 --- a/mavis/validate/evidence.py +++ b/mavis/validate/evidence.py @@ -7,21 +7,13 @@ from ..annotate.variant import overlapping_transcripts from ..bam import cigar as _cigar from ..breakpoint import Breakpoint -from ..constants import CIGAR, ORIENT, PROTOCOL, STRAND, SVTYPE +from ..constants import CIGAR, COLUMNS, ORIENT, PROTOCOL, STRAND, SVTYPE from ..interval import Interval from ..schemas import DEFAULTS from .base import Evidence class GenomeEvidence(Evidence): - outer_window1: Interval - outer_window2: 
Interval - inner_window1: Interval - inner_window2: Interval - compatible_window1: Interval - compatible_window2: Interval - protocol: str - @property def min_mapping_quality(self): return self.config['validate.min_mapping_quality'] @@ -92,14 +84,6 @@ def compute_fragment_size( class TranscriptomeEvidence(Evidence): - outer_window1: Interval - outer_window2: Interval - inner_window1: Interval - inner_window2: Interval - compatible_window1: Interval - compatible_window2: Interval - protocol: str - @property def min_mapping_quality(self): return self.config['validate.trans_min_mapping_quality'] From 99a8b10d6afc1d132593889a07b1c4259ef3df2e Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 19 Apr 2021 23:39:53 -0700 Subject: [PATCH 007/137] reference data dict directly --- tools/calculate_ref_alt_counts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/calculate_ref_alt_counts.py b/tools/calculate_ref_alt_counts.py index ee43dcfe..d873daba 100644 --- a/tools/calculate_ref_alt_counts.py +++ b/tools/calculate_ref_alt_counts.py @@ -7,7 +7,6 @@ import statistics as stats import pysam - from mavis.annotate.file_io import load_reference_genome from mavis.constants import SVTYPE from mavis.util import LOG as log @@ -196,7 +195,7 @@ def calculate_all_counts(self, input_files, output_file): for bpp in bpps: # only use precise bpps that are within a certain event size try: - processed_bpps[bpp.product_id] = self.calculate_ref_counts(bpp) + processed_bpps[bpp.data['product_id']] = self.calculate_ref_counts(bpp) except ValueError: # wrong event type to calculate a ref/alt count filtered_events.append(bpp) From 426a22644a5bdbd6aae3a7c41e378e3de75494fb Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 19 Apr 2021 23:41:26 -0700 Subject: [PATCH 008/137] Add codecov .pth file --- .github/workflows/build.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 
ded37cdc..37c862f0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -47,14 +47,13 @@ jobs: export PATH=$PATH:$(pwd):$(pwd)/bwa pytest tests -v \ --junitxml=junit/test-results-${{ matrix.python-version }}.xml \ - --cov mavis \ - --cov-report term-missing \ - --cov-report xml \ - --durations=10 \ - --cov-branch + --durations=10 env: RUN_FULL: 0 if: github.event_name != 'pull_request' + - name: set up .pth file + run: | + python tests/setup_subprocess_cov.py - name: run full tests with pytest run: | export PATH=$PATH:$(pwd):$(pwd)/bwa From e2e81ade4a6bda0883c4d29238402793c786e3cf Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 20 Apr 2021 15:15:15 -0700 Subject: [PATCH 009/137] Use class-style syntax for constants --- Snakefile | 2 +- mavis/align.py | 18 +- mavis/annotate/constants.py | 43 +- mavis/config.py | 9 +- mavis/constants.py | 1001 +++++++++++++--------------------- mavis/main.py | 2 +- mavis/pairing/constants.py | 18 +- mavis/summary/constants.py | 26 +- mavis/tools/constants.py | 55 +- mavis/util.py | 10 +- tests/unit/test_constants.py | 72 +-- tests/unit/test_util.py | 57 +- 12 files changed, 480 insertions(+), 833 deletions(-) diff --git a/Snakefile b/Snakefile index 2e617c92..7710c0d9 100644 --- a/Snakefile +++ b/Snakefile @@ -1,5 +1,5 @@ from snakemake.utils import validate -from snakemake import WorkflowError +from snakemake.exceptions import WorkflowError import os from typing import List, Dict import re diff --git a/mavis/align.py b/mavis/align.py index f5f493a7..be81b28d 100644 --- a/mavis/align.py +++ b/mavis/align.py @@ -27,14 +27,18 @@ from .interval import Interval from .util import DEVNULL -SUPPORTED_ALIGNER = MavisNamespace( - BWA_MEM='bwa mem', BLAT='blat', __name__='mavis.align.SUPPORTED_ALIGNER' -) -"""MavisNamespace: supported aligners -- [blat](/glossary/#blat) -- [bwa mem](/glossary/#bwa-mem) -""" +class SUPPORTED_ALIGNER(MavisNamespace): + """ + supported aligners + + Attributes: + BLAT: 
[blat](/glossary/#blat) + BWA_MEM: [bwa mem](/glossary/#bwa-mem) + """ + + BWA_MEM = 'bwa mem' + BLAT = 'blat' class SplitAlignment(BreakpointPair): diff --git a/mavis/annotate/constants.py b/mavis/annotate/constants.py index d2fbf5c2..1a4e324d 100644 --- a/mavis/annotate/constants.py +++ b/mavis/annotate/constants.py @@ -3,30 +3,35 @@ import tab from ..constants import MavisNamespace, float_fraction -from ..util import WeakMavisNamespace PASS_FILENAME = 'annotations.tab' -SPLICE_TYPE = MavisNamespace( - RETAIN='retained intron', - SKIP='skipped exon', - NORMAL='normal', - MULTI_RETAIN='retained multiple introns', - MULTI_SKIP='skipped multiple exons', - COMPLEX='complex', -) -"""MavisNamespace: holds controlled vocabulary for allowed splice type classification values - -- ``RETAIN``: an intron was retained -- ``SKIP``: an exon was skipped -- ``NORMAL``: no exons were skipped and no introns were retained. the normal/expected splicing pattern was followed -- ``MULTI_RETAIN``: multiple introns were retained -- ``MULTI_SKIP``: multiple exons were skipped -- ``COMPLEX``: some combination of exon skipping and intron retention -""" +class SPLICE_TYPE(MavisNamespace): + """ + holds controlled vocabulary for allowed splice type classification values + + Attributes: + RETAIN: an intron was retained + SKIP: an exon was skipped + NORMAL: no exons were skipped and no introns were retained. 
the normal/expected splicing pattern was followed + MULTI_RETAIN: multiple introns were retained + MULTI_SKIP: multiple exons were skipped + COMPLEX: some combination of exon skipping and intron retention + """ + + RETAIN: str = 'retained intron' + SKIP: str = 'skipped exon' + NORMAL: str = 'normal' + MULTI_RETAIN: str = 'retained multiple introns' + MULTI_SKIP: str = 'skipped multiple exons' + COMPLEX: str = 'complex' + + +class SPLICE_SITE_TYPE(MavisNamespace): + DONOR: int = 3 + ACCEPTOR: int = 5 -SPLICE_SITE_TYPE = MavisNamespace(DONOR=3, ACCEPTOR=5) SPLICE_SITE_RADIUS = 2 """int: number of bases away from an exon boundary considered to be part of the splice site such that if it were altered diff --git a/mavis/config.py b/mavis/config.py index 480eb301..6ff74392 100644 --- a/mavis/config.py +++ b/mavis/config.py @@ -12,14 +12,7 @@ from .bam import stats from .bam.cache import BamCache from .constants import PROTOCOL, SUBCOMMAND, float_fraction -from .util import WeakMavisNamespace, bash_expands, filepath - -CONVERT_OPTIONS = WeakMavisNamespace() -CONVERT_OPTIONS.add( - 'assume_no_untemplated', - True, - defn='assume that if not given there is no untemplated sequence between the breakpoints', -) +from .util import bash_expands, filepath def calculate_bam_stats(config: Dict, library_name: str) -> Dict: diff --git a/mavis/constants.py b/mavis/constants.py index 72fccdf4..da2a8d40 100644 --- a/mavis/constants.py +++ b/mavis/constants.py @@ -4,6 +4,7 @@ import argparse import os import re +from typing import List from Bio.Alphabet import Gapped from Bio.Alphabet.IUPAC import ambiguous_dna @@ -11,234 +12,48 @@ from Bio.Seq import Seq from tab import cast_boolean, cast_null -PROGNAME = 'mavis' -EXIT_OK = 0 -EXIT_ERROR = 1 -EXIT_INCOMPLETE = 2 +PROGNAME: str = 'mavis' +EXIT_OK: int = 0 +EXIT_ERROR: int = 1 +EXIT_INCOMPLETE: int = 2 -class MavisNamespace: - """ - Namespace to hold module constants - - Example: - >>> nspace = MavisNamespace(thing=1, otherthing=2) - >>> 
nspace.thing - 1 - >>> nspace.otherthing - 2 - """ - - DELIM = r'[;,\s]+' - """str: delimiter to use is parsing listable variables from the environment or config file""" - - def __init__(self, *pos, **kwargs): - object.__setattr__(self, '_defns', {}) - object.__setattr__(self, '_types', {}) - object.__setattr__(self, '_members', {}) - object.__setattr__(self, '_nullable', set()) - object.__setattr__(self, '_listable', set()) - object.__setattr__(self, '_env_overwritable', set()) - object.__setattr__(self, '_env_prefix', 'MAVIS') - if '__name__' in kwargs: # for building auto documentation - object.__setattr__(self, '__name__', kwargs.pop('__name__')) - - for k in pos: - if k in self._members: - raise AttributeError('Cannot respecify existing attribute', k, self._members[k]) - self[k] = k - - for attr, val in kwargs.items(): - if attr in self._members: - raise AttributeError( - 'Cannot respecify existing attribute', attr, self._members[attr] - ) - self[attr] = val - - for attr, value in self._members.items(): - self._set_type(attr, type(value)) - - def __repr__(self): - return '{}({})'.format( - self.__class__.__name__, - ', '.join(sorted(['{}={}'.format(k, repr(v)) for k, v in self.items()])), - ) - - def discard(self, attr): - """ - Remove a variable if it exists - """ - self._members.pop(attr, None) - self._listable.discard(attr) - self._nullable.discard(attr) - self._defns.pop(attr, None) - self._types.pop(attr, None) - self._env_overwritable.discard(attr) - - def get_env_name(self, attr): - """ - Get the name of the corresponding environment variable +class EnumType(type): + def __contains__(cls, member): + return member in cls.values() - Example: - >>> nspace = MavisNamespace(a=1) - >>> nspace.get_env_name('a') - 'MAVIS_A' - """ - if self._env_prefix: - return '{}_{}'.format(self._env_prefix, attr).upper() - return attr.upper() + def __getitem__(cls, item): + for k, v in cls.items(): + if k == item: + return v + raise KeyError(item) - def get_env_var(self, 
attr): + def __iter__(cls): """ - retrieve the environment variable definition of a given attribute + Returns members in definition order. """ - env_name = self.get_env_name(attr) - env = os.environ[env_name].strip() - attr_type = self._types.get(attr, str) + return cls.values() - if attr in self._listable: - return self.parse_listable_string(env, attr_type, attr in self._nullable) - if attr in self._nullable and env.lower() == 'none': - return None - return attr_type(env) +class MavisNamespace(metaclass=EnumType): @classmethod - def parse_listable_string(cls, string, cast_type=str, nullable=False): - """ - Given some string, parse it into a list - - Example: - >>> MavisNamespace.parse_listable_string('1,2,3', int) - [1, 2, 3] - >>> MavisNamespace.parse_listable_string('1;2,None', int, True) - [1, 2, None] - """ - result = [] - string = string.strip() - for val in re.split(cls.DELIM, string) if string else []: - if nullable and val.lower() == 'none': - result.append(None) - else: - result.append(cast_type(val)) - return result - - def is_env_overwritable(self, attr): - """ - Returns: - bool: True if the variable is overrided by specifying the environment variable equivalent - """ - return attr in self._env_overwritable - - def is_listable(self, attr): - """ - Returns: - bool: True if the variable should be parsed as a list - """ - return attr in self._listable - - def is_nullable(self, attr): - """ - Returns: - bool: True if the variable can be set to None - """ - return attr in self._nullable - - def __getattribute__(self, attr): - try: - return object.__getattribute__(self, attr) - except AttributeError as err: - variables = object.__getattribute__(self, '_members') - if attr not in variables: - raise err - if self.is_env_overwritable(attr): - try: - return self.get_env_var(attr) - except KeyError: - pass - return variables[attr] - - def items(self): - """ - Example: - >>> MavisNamespace(thing=1, otherthing=2).items() - [('thing', 1), ('otherthing', 2)] - """ - 
return [(k, self[k]) for k in self.keys()] - - def to_dict(self): - return dict(self.items()) - - def __getitem__(self, key): - return getattr(self, key) - - def __setitem__(self, key, val): - self.__setattr__(key, val) - - def __setattr__(self, attr, val): - if attr.startswith('_'): - raise ValueError('cannot set private', attr) - object.__getattribute__(self, '_members')[attr] = val - - def copy_from(self, source, attrs=None): - """ - Copy variables from one namespace onto the current namespace - """ - if attrs is None: - attrs = source.keys() - for attr in attrs: - self.add( - attr, - source[attr], - listable=source.is_listable(attr), - nullable=source.is_nullable(attr), - defn=source.define(attr, None), - cast_type=source.type(attr, None), - ) - - def get(self, key, *pos): - """ - get an attribute, return a default (if given) if the attribute does not exist - - Example: - >>> nspace = MavisNamespace(thing=1, otherthing=2) - >>> nspace.get('thing', 2) - 1 - >>> nspace.get('nonexistant_thing', 2) - 2 - >>> nspace.get('nonexistant_thing') - Traceback (most recent call last): - .... - """ - if len(pos) > 1: - raise TypeError('too many arguments. 
get takes a single \'default\' value argument') - try: - return self[key] - except AttributeError as err: - if pos: - return pos[0] - raise err - - def keys(self): - """ - get the attribute keys as a list + def items(cls): + return [(k, v) for k, v in cls.__dict__.items() if not k.startswith('_')] - Example: - >>> MavisNamespace(thing=1, otherthing=2).keys() - ['thing', 'otherthing'] - """ - return [k for k in self._members] + @classmethod + def to_dict(cls): + return dict(cls.items()) - def values(self): - """ - get the attribute values as a list + @classmethod + def keys(cls): + return [k for k, v in cls.items()] - Example: - >>> MavisNamespace(thing=1, otherthing=2).values() - [1, 2] - """ - return [self[k] for k in self._members] + @classmethod + def values(cls): + return [v for k, v in cls.items()] - def enforce(self, value): + @classmethod + def enforce(cls, value): """ checks that the current namespace has a given value @@ -249,18 +64,18 @@ def enforce(self, value): KeyError: the value did not exist Example: - >>> nspace = MavisNamespace(thing=1, otherthing=2) >>> nspace.enforce(1) 1 >>> nspace.enforce(3) Traceback (most recent call last): .... 
""" - if value not in self.values(): - raise KeyError('value {0} is not a valid member of '.format(repr(value)), self.values()) + if value not in cls.values(): + raise KeyError('value {0} is not a valid member of '.format(repr(value)), cls.values()) return value - def reverse(self, value): + @classmethod + def reverse(cls, value): """ for a given value, return the associated key @@ -272,13 +87,12 @@ def reverse(self, value): KeyError: the value is not assigned Example: - >>> nspace = MavisNamespace(thing=1, otherthing=2) >>> nspace.reverse(1) 'thing' """ result = [] - for key in self.keys(): - if self[key] == value: + for key in cls.keys(): + if cls[key] == value: result.append(key) if len(result) > 1: raise KeyError('could not reverse, the mapping is not unique', value, result) @@ -286,121 +100,6 @@ def reverse(self, value): raise KeyError('input value is not assigned to a key', value) return result[0] - def __iter__(self): - return iter(self.keys()) - - def _set_type(self, attr, cast_type): - if cast_type == bool: - self._types[attr] = cast_boolean - else: - self._types[attr] = cast_type - - def type(self, attr, *pos): - """ - returns the type - - Example: - >>> nspace = MavisNamespace(thing=1, otherthing=2) - >>> nspace.type('thing') - - """ - if len(pos) > 1: - raise TypeError('too many arguments. 
type takes a single \'default\' value argument') - try: - return self._types[attr] - except AttributeError as err: - if pos: - return pos[0] - raise err - - def define(self, attr, *pos): - """ - Get the definition of a given attribute or return a default (when given) if the attribute does not exist - - Returns: - str: definition for the attribute - - Raises: - KeyError: the attribute does not exist and a default was not given - - Example: - >>> nspace = MavisNamespace() - >>> nspace.add('thing', 1, defn='I am a thing') - >>> nspace.add('otherthing', 2) - >>> nspace.define('thing') - 'I am a thing' - >>> nspace.define('otherthing') - Traceback (most recent call last): - .... - >>> nspace.define('otherthing', 'I am some other thing') - 'I am some other thing' - """ - if len(pos) > 1: - raise TypeError('too many arguments. define takes a single \'default\' value argument') - try: - return self._defns[attr] - except KeyError as err: - if pos: - return pos[0] - raise err - - def add( - self, - attr, - value, - defn=None, - cast_type=None, - nullable=False, - env_overwritable=False, - listable=False, - ): - """ - Add an attribute to the name space - - Args: - attr (str): name of the attribute being added - value: the value of the attribute - defn (str): the definition, will be used in generating documentation and help menus - cast_type (Callable): the function to use in casting the value - nullable (bool): True if this attribute can have a None value - env_overwritable (bool): True if this attribute will be overriden by its environment variable equivalent - listable (bool): True if this attribute can have multiple values - - Example: - >>> nspace = MavisNamespace() - >>> nspace.add('thing', 1, int, 'I am a thing') - >>> nspace = MavisNamespace() - >>> nspace.add('thing', 1, int) - >>> nspace = MavisNamespace() - >>> nspace.add('thing', 1) - >>> nspace = MavisNamespace() - >>> nspace.add('thing', value=1, cast_type=int, defn='I am a thing') - """ - if cast_type: - 
self._set_type(attr, cast_type) - else: - self._set_type(attr, type(value)) - if defn: - self._defns[attr] = defn - - if nullable: - self._nullable.add(attr) - if env_overwritable: - self._env_overwritable.add(attr) - if listable: - self._listable.add(attr) - self[attr] = value - - def __call__(self, value): - try: - return self.enforce(value) - except KeyError: - raise TypeError( - 'Invalid value {} for {}. Must be a valid member: {}'.format( - repr(value), self.__class__.__name__, self.values() - ) - ) - def float_fraction(num): """ @@ -424,34 +123,27 @@ def float_fraction(num): return num -COMPLETE_STAMP = 'MAVIS.COMPLETE' -"""str: Filename for all complete stamp files""" - -SUBCOMMAND = MavisNamespace( - ANNOTATE='annotate', - VALIDATE='validate', - CLUSTER='cluster', - PAIR='pairing', - SUMMARY='summary', - CONVERT='convert', - OVERLAY='overlay', - SETUP='setup', -) -"""MavisNamespace: holds controlled vocabulary for allowed pipeline stage values - -- annotate -- cluster -- config -- convert -- pairing -- pipeline -- summary -- validate -""" +COMPLETE_STAMP: str = 'MAVIS.COMPLETE' +"""Filename for all complete stamp files""" + + +class SUBCOMMAND(MavisNamespace): + """ + holds controlled vocabulary for allowed pipeline stage values + """ + + ANNOTATE: str = 'annotate' + VALIDATE: str = 'validate' + CLUSTER: str = 'cluster' + PAIR: str = 'pairing' + SUMMARY: str = 'summary' + CONVERT: str = 'convert' + OVERLAY: str = 'overlay' + SETUP: str = 'setup' -CODON_SIZE = 3 -"""int: the number of bases making up a codon""" +CODON_SIZE: int = 3 +"""the number of bases making up a codon""" def reverse_complement(s): @@ -478,16 +170,16 @@ def reverse_complement(s): return str(input_string.reverse_complement()) -def translate(s, reading_frame=0): +def translate(s: str, reading_frame: int = 0) -> str: """ given a DNA sequence, translates it and returns the protein amino acid sequence Args: - s (str): the input DNA sequence - reading_frame (int): where to start translating 
the sequence + s: the input DNA sequence + reading_frame: where to start translating the sequence Returns: - str: the amino acid sequence + the amino acid sequence """ reading_frame = reading_frame % CODON_SIZE @@ -497,117 +189,167 @@ def translate(s, reading_frame=0): elif len(temp) % 3 == 2: temp = temp[:-2] temp = Seq(temp, DNA_ALPHABET) - return str(temp.translate()) + return str(temp.translate()) # type: ignore -GAP = '-' +GAP: str = '-' -ORIENT = MavisNamespace(LEFT='L', RIGHT='R', NS='?') -"""MavisNamespace: holds controlled vocabulary for allowed orientation values -- ``LEFT``: left wrt to the positive/forward strand -- ``RIGHT``: right wrt to the positive/forward strand -- ``NS``: orientation is not specified -""" -setattr(ORIENT, 'expand', lambda x: [ORIENT.LEFT, ORIENT.RIGHT] if x == ORIENT.NS else [x]) -setattr(ORIENT, 'compare', lambda x, y: True if ORIENT.NS in [x, y] else (x == y)) +class ORIENT(MavisNamespace): + """ + holds controlled vocabulary for allowed orientation values -PROTOCOL = MavisNamespace(GENOME='genome', TRANS='transcriptome') -"""MavisNamespace: holds controlled vocabulary for allowed protocol values + Attributes: + LEFT: left wrt to the positive/forward strand + RIGHT: right wrt to the positive/forward strand + NS: orientation is not specified + """ -- ``GENOME``: genome -- ``TRANS``: transcriptome -""" + LEFT: str = 'L' + RIGHT: str = 'R' + NS: str = '?' 
-DISEASE_STATUS = MavisNamespace(DISEASED='diseased', NORMAL='normal') -"""MavisNamespace: holds controlled vocabulary for allowed disease status + @classmethod + def expand(cls, orientation) -> List[str]: + if orientation == cls.NS: + return [cls.LEFT, cls.RIGHT] + return [orientation] -- ``DISEASED``: diseased -- ``NORMAL``: normal -""" + @classmethod + def compare(cls, first, second) -> bool: + if cls.NS in {first, second}: + return True + return first == second -STRAND = MavisNamespace(POS='+', NEG='-', NS='?') -"""MavisNamespace: holds controlled vocabulary for allowed strand values -- ``POS``: the positive/forward strand -- ``NEG``: the negative/reverse strand -- ``NS``: strand is not specified -""" -setattr(STRAND, 'expand', lambda x: [STRAND.POS, STRAND.NEG] if x == STRAND.NS else [x]) -setattr(STRAND, 'compare', lambda x, y: True if STRAND.NS in [x, y] else (x == y)) - -SVTYPE = MavisNamespace( - DEL='deletion', - TRANS='translocation', - ITRANS='inverted translocation', - INV='inversion', - INS='insertion', - DUP='duplication', -) -"""MavisNamespace: holds controlled vocabulary for acceptable structural variant classifications - -- ``DEL``: deletion -- ``TRANS``: translocation -- ``ITRANS``: inverted translocation -- ``INV``: inversion -- ``INS``: insertion -- ``DUP``: duplication -""" +class PROTOCOL(MavisNamespace): + """ + holds controlled vocabulary for allowed protocol values + """ -CIGAR = MavisNamespace(M=0, I=1, D=2, N=3, S=4, H=5, P=6, X=8, EQ=7) # noqa -"""MavisNamespace: Enum-like. 
For readable cigar values + GENOME: str = 'genome' + TRANS: str = 'transcriptome' -- ``M``: alignment match (can be a sequence match or mismatch) -- ``I``: insertion to the reference -- ``D``: deletion from the reference -- ``N``: skipped region from the reference -- ``S``: soft clipping (clipped sequences present in SEQ) -- ``H``: hard clipping (clipped sequences NOT present in SEQ) -- ``P``: padding (silent deletion from padded reference) -- ``EQ``: sequence match (=) -- ``X``: sequence mismatch -note: descriptions are taken from the `samfile documentation `_ -""" +class DISEASE_STATUS(MavisNamespace): + """ + holds controlled vocabulary for allowed disease status + """ -NA_MAPPING_QUALITY = 255 -"""int: mapping quality value to indicate mapping was not performed/calculated""" - -PYSAM_READ_FLAGS = MavisNamespace( - REVERSE=16, - MATE_REVERSE=32, - UNMAPPED=4, - MATE_UNMAPPED=8, - FIRST_IN_PAIR=64, - LAST_IN_PAIR=128, - SECONDARY=256, - MULTIMAP=1, - SUPPLEMENTARY=2048, - TARGETED_ALIGNMENT='ta', - RECOMPUTED_CIGAR='rc', - BLAT_RANK='br', - BLAT_SCORE='bs', - BLAT_ALIGNMENTS='ba', - BLAT_PERCENT_IDENTITY='bi', - BLAT_PMS='bp', -) - -"""MavisNamespace: Enum-like. 
For readable PYSAM flag constants - -- ``MULTIMAP``: template having multiple segments in sequencing -- ``UNMAPPED``: segment unmapped -- ``MATE_UNMAPPED``: next segment in the template unmapped -- ``REVERSE``: SEQ being reverse complemented -- ``MATE_REVERSE``: SEQ of the next segment in the template being reverse complemented -- ``FIRST_IN_PAIR``: the first segment in the template -- ``LAST_IN_PAIR``: the last segment in the template -- ``SECONDARY``: secondary alignment -- ``SUPPLEMENTARY``: supplementary alignment - -note: descriptions are taken from the `samfile documentation `_ -""" + DISEASED: str = 'diseased' + NORMAL: str = 'normal' + + +class STRAND(MavisNamespace): + """ + holds controlled vocabulary for allowed strand values + + Attributes: + POS: the positive/forward strand + NEG: the negative/reverse strand + NS: strand is not specified + """ + + POS: str = '+' + NEG: str = '-' + NS: str = '?' + + @classmethod + def expand(cls, strand: str) -> List[str]: + if strand == cls.NS: + return [cls.POS, cls.NEG] + return [strand] + + @classmethod + def compare(cls, first, second) -> bool: + if cls.NS in {first, second}: + return True + return first == second + + +class SVTYPE(MavisNamespace): + """ + holds controlled vocabulary for acceptable structural variant classifications + """ -# read paired, read mapped in proper pair, mate reverse strand, first in pair + DEL = 'deletion' + TRANS = 'translocation' + ITRANS: str = 'inverted translocation' + INV: str = 'inversion' + INS: str = 'insertion' + DUP: str = 'duplication' + + +class CIGAR(MavisNamespace): + """ + Enum-like. 
For readable cigar values + + + Attributes: + M: alignment match (can be a sequence match or mismatch) + I: insertion to the reference + D: deletion from the reference + N: skipped region from the reference + S: soft clipping (clipped sequences present in SEQ) + H: hard clipping (clipped sequences NOT present in SEQ) + P: padding (silent deletion from padded reference) + EQ: sequence match (=) + X: sequence mismatch + + Note: + descriptions are taken from the `samfile documentation `_ + """ + + M = 0 + I = 1 + D = 2 + N = 3 + S = 4 + H = 5 + P = 6 + X = 8 + EQ = 7 + + +NA_MAPPING_QUALITY: int = 255 +"""mapping quality value to indicate mapping was not performed/calculated""" + + +class PYSAM_READ_FLAGS(MavisNamespace): + """ + Enum-like. For readable PYSAM flag constants + + Attributes: + MULTIMAP: template having multiple segments in sequencing + UNMAPPED: segment unmapped + MATE_UNMAPPED: next segment in the template unmapped + REVERSE: SEQ being reverse complemented + MATE_REVERSE: SEQ of the next segment in the template being reverse complemented + FIRST_IN_PAIR: the first segment in the template + LAST_IN_PAIR: the last segment in the template + SECONDARY: secondary alignment + SUPPLEMENTARY: supplementary alignment + + Note: + descriptions are taken from the `samfile documentation `_ + """ + + REVERSE: int = 16 + MATE_REVERSE: int = 32 + UNMAPPED: int = 4 + MATE_UNMAPPED: int = 8 + FIRST_IN_PAIR: int = 64 + LAST_IN_PAIR: int = 128 + SECONDARY: int = 256 + MULTIMAP: int = 1 + SUPPLEMENTARY: int = 2048 + TARGETED_ALIGNMENT: str = 'ta' + RECOMPUTED_CIGAR: str = 'rc' + BLAT_RANK: str = 'br' + BLAT_SCORE: str = 'bs' + BLAT_ALIGNMENTS: str = 'ba' + BLAT_PERCENT_IDENTITY: str = 'bi' + BLAT_PMS: str = 'bp' def _match_ambiguous_dna(x, y): @@ -631,177 +373,200 @@ def _match_ambiguous_dna(x, y): DNA_ALPHABET = alphabet = Gapped(ambiguous_dna, '-') DNA_ALPHABET.match = lambda x, y: _match_ambiguous_dna(x, y) -FLAGS = MavisNamespace(LQ='LOWQUAL') -READ_PAIR_TYPE = 
MavisNamespace(RR='RR', LL='LL', RL='RL', LR='LR') +class FLAGS(MavisNamespace): + LQ: str = 'LOWQUAL' -CALL_METHOD = MavisNamespace( - CONTIG='contig', - SPLIT='split reads', - FLANK='flanking reads', - SPAN='spanning reads', - INPUT='input', -) -"""MavisNamespace: holds controlled vocabulary for allowed call methods -- ``CONTIG``: a contig was assembled and aligned across the breakpoints -- ``SPLIT``: the event was called by [split read](/glossary/#split-read) -- ``FLANK``: the event was called by [flanking read pair](/glossary/#flanking-read-pair) -- ``SPAN``: the event was called by [spanning read](/glossary/#spanning-read) -""" +class READ_PAIR_TYPE(MavisNamespace): + RR: str = 'RR' + LL: str = 'LL' + RL: str = 'RL' + LR: str = 'LR' -GENE_PRODUCT_TYPE = MavisNamespace(SENSE='sense', ANTI_SENSE='anti-sense') -"""MavisNamespace: controlled vocabulary for gene products -- ``SENSE``: the gene product is a sense fusion -- ``ANTI_SENSE``: the gene product is anti-sense -""" +class CALL_METHOD(MavisNamespace): + """ + holds controlled vocabulary for allowed call methods -PRIME = MavisNamespace(FIVE=5, THREE=3) -"""MavisNamespace: holds controlled vocabulary + Attributes: + CONTIG: a contig was assembled and aligned across the breakpoints + SPLIT: the event was called by [split read](/glossary/#split-read) + FLANK: the event was called by [flanking read pair](/glossary/#flanking-read-pair) + SPAN: the event was called by [spanning read](/glossary/#spanning-read)""" -- ``FIVE``: five prime -- ``THREE``: three prime -""" + CONTIG: str = 'contig' + SPLIT: str = 'split reads' + FLANK: str = 'flanking reads' + SPAN: str = 'spanning reads' + INPUT: str = 'input' + + +class GENE_PRODUCT_TYPE(MavisNamespace): + """ + controlled vocabulary for gene products + + Attributes: + SENSE: the gene product is a sense fusion + ANTI_SENSE: the gene product is anti-sense + """ + + SENSE: str = 'sense' + ANTI_SENSE: str = 'anti-sense' + + +class PRIME(MavisNamespace): + """ + Attributes: 
+ FIVE: five prime + THREE: three prime + """ + + FIVE: int = 5 + THREE: int = 3 -START_AA = 'M' -"""str: The amino acid expected to start translation + +START_AA: str = 'M' +"""The amino acid expected to start translation """ -STOP_AA = '*' -"""str: The amino acid expected to end translation +STOP_AA: str = '*' +"""The amino acid expected to end translation """ -GIEMSA_STAIN = MavisNamespace( - GNEG='gneg', - GPOS33='gpos33', - GPOS50='gpos50', - GPOS66='gpos66', - GPOS75='gpos75', - GPOS25='gpos25', - GPOS100='gpos100', - ACEN='acen', - GVAR='gvar', - STALK='stalk', -) -"""MavisNamespace: holds controlled vocabulary relating to stains of chromosome bands""" + +class GIEMSA_STAIN(MavisNamespace): + """ + holds controlled vocabulary relating to stains of chromosome bands + """ + + GNEG: str = 'gneg' + GPOS33: str = 'gpos33' + GPOS50: str = 'gpos50' + GPOS66: str = 'gpos66' + GPOS75: str = 'gpos75' + GPOS25: str = 'gpos25' + GPOS100: str = 'gpos100' + ACEN: str = 'acen' + GVAR: str = 'gvar' + STALK: str = 'stalk' + # content related to tabbed files for input/output # ensure that we don't have to change ALL the code when we update column names +class COLUMNS(MavisNamespace): + """ + Column names for i/o files used throughout the pipeline + see [column descriptions](/outputs/columns) + """ -COLUMNS = MavisNamespace( - tracking_id='tracking_id', - library='library', - cluster_id='cluster_id', - cluster_size='cluster_size', - validation_id='validation_id', - annotation_id='annotation_id', - product_id='product_id', - event_type='event_type', - pairing='pairing', - inferred_pairing='inferred_pairing', - gene1='gene1', - gene1_direction='gene1_direction', - gene2='gene2', - gene2_direction='gene2_direction', - gene1_aliases='gene1_aliases', - gene2_aliases='gene2_aliases', - gene_product_type='gene_product_type', - transcript1='transcript1', - transcript2='transcript2', - fusion_splicing_pattern='fusion_splicing_pattern', - 
fusion_cdna_coding_start='fusion_cdna_coding_start', - fusion_cdna_coding_end='fusion_cdna_coding_end', - fusion_mapped_domains='fusion_mapped_domains', - fusion_sequence_fasta_id='fusion_sequence_fasta_id', - fusion_sequence_fasta_file='fusion_sequence_fasta_file', - fusion_protein_hgvs='fusion_protein_hgvs', - annotation_figure='annotation_figure', - annotation_figure_legend='annotation_figure_legend', - genes_encompassed='genes_encompassed', - genes_overlapping_break1='genes_overlapping_break1', - genes_overlapping_break2='genes_overlapping_break2', - genes_proximal_to_break1='genes_proximal_to_break1', - genes_proximal_to_break2='genes_proximal_to_break2', - break1_chromosome='break1_chromosome', - break1_position_start='break1_position_start', - break1_position_end='break1_position_end', - break1_orientation='break1_orientation', - exon_last_5prime='exon_last_5prime', - exon_first_3prime='exon_first_3prime', - break1_strand='break1_strand', - break1_seq='break1_seq', - break2_chromosome='break2_chromosome', - break2_position_start='break2_position_start', - break2_position_end='break2_position_end', - break2_orientation='break2_orientation', - break2_strand='break2_strand', - break2_seq='break2_seq', - opposing_strands='opposing_strands', - stranded='stranded', - protocol='protocol', - disease_status='disease_status', - tools='tools', - call_method='call_method', - break1_ewindow='break1_ewindow', - break1_ewindow_count='break1_ewindow_count', - break1_ewindow_practical_coverage='break1_ewindow_practical_coverage', - break1_homologous_seq='break1_homologous_seq', - break1_split_read_names='break1_split_read_names', - break1_split_reads='break1_split_reads', - break1_split_reads_forced='break1_split_reads_forced', - break2_ewindow='break2_ewindow', - break2_ewindow_count='break2_ewindow_count', - break2_ewindow_practical_coverage='break2_ewindow_practical_coverage', - break2_homologous_seq='break2_homologous_seq', - 
break2_split_read_names='break2_split_read_names', - break2_split_reads='break2_split_reads', - break2_split_reads_forced='break2_split_reads_forced', - contig_alignment_query_consumption='contig_alignment_query_consumption', - contig_alignment_score='contig_alignment_score', - contig_alignment_query_name='contig_alignment_query_name', - contig_read_depth='contig_read_depth', - contig_break1_read_depth='contig_break1_read_depth', - contig_break2_read_depth='contig_break2_read_depth', - contig_alignment_rank='contig_alignment_rank', - contig_build_score='contig_build_score', - contig_remap_score='contig_remap_score', - contig_remap_coverage='contig_remap_coverage', - contig_remapped_read_names='contig_remapped_read_names', - contig_remapped_reads='contig_remapped_reads', - contig_seq='contig_seq', - contig_strand_specific='contig_strand_specific', - contigs_assembled='contigs_assembled', - call_sequence_complexity='call_sequence_complexity', - spanning_reads='spanning_reads', - spanning_read_names='spanning_read_names', - flanking_median_fragment_size='flanking_median_fragment_size', - flanking_pairs='flanking_pairs', - flanking_pairs_compatible='flanking_pairs_compatible', - flanking_pairs_read_names='flanking_pairs_read_names', - flanking_pairs_compatible_read_names='flanking_pairs_compatible_read_names', - flanking_stdev_fragment_size='flanking_stdev_fragment_size', - linking_split_read_names='linking_split_read_names', - linking_split_reads='linking_split_reads', - raw_break1_half_mapped_reads='raw_break1_half_mapped_reads', - raw_break1_split_reads='raw_break1_split_reads', - raw_break2_half_mapped_reads='raw_break2_half_mapped_reads', - raw_break2_split_reads='raw_break2_split_reads', - raw_flanking_pairs='raw_flanking_pairs', - raw_spanning_reads='raw_spanning_reads', - untemplated_seq='untemplated_seq', - filter_comment='filter_comment', - cdna_synon='cdna_synon', - protein_synon='protein_synon', - supplementary_call='supplementary_call', - 
net_size='net_size', - repeat_count='repeat_count', - assumed_untemplated='assumed_untemplated', -) -"""MavisNamespace: Column names for i/o files used throughout the pipeline - -see [column descriptions](/outputs/columns) -""" + tracking_id: str = 'tracking_id' + library: str = 'library' + cluster_id: str = 'cluster_id' + cluster_size: str = 'cluster_size' + validation_id: str = 'validation_id' + annotation_id: str = 'annotation_id' + product_id: str = 'product_id' + event_type: str = 'event_type' + pairing: str = 'pairing' + inferred_pairing: str = 'inferred_pairing' + gene1: str = 'gene1' + gene1_direction: str = 'gene1_direction' + gene2: str = 'gene2' + gene2_direction: str = 'gene2_direction' + gene1_aliases: str = 'gene1_aliases' + gene2_aliases: str = 'gene2_aliases' + gene_product_type: str = 'gene_product_type' + transcript1: str = 'transcript1' + transcript2: str = 'transcript2' + fusion_splicing_pattern: str = 'fusion_splicing_pattern' + fusion_cdna_coding_start: str = 'fusion_cdna_coding_start' + fusion_cdna_coding_end: str = 'fusion_cdna_coding_end' + fusion_mapped_domains: str = 'fusion_mapped_domains' + fusion_sequence_fasta_id: str = 'fusion_sequence_fasta_id' + fusion_sequence_fasta_file: str = 'fusion_sequence_fasta_file' + fusion_protein_hgvs: str = 'fusion_protein_hgvs' + annotation_figure: str = 'annotation_figure' + annotation_figure_legend: str = 'annotation_figure_legend' + genes_encompassed: str = 'genes_encompassed' + genes_overlapping_break1: str = 'genes_overlapping_break1' + genes_overlapping_break2: str = 'genes_overlapping_break2' + genes_proximal_to_break1: str = 'genes_proximal_to_break1' + genes_proximal_to_break2: str = 'genes_proximal_to_break2' + break1_chromosome: str = 'break1_chromosome' + break1_position_start: str = 'break1_position_start' + break1_position_end: str = 'break1_position_end' + break1_orientation: str = 'break1_orientation' + exon_last_5prime: str = 'exon_last_5prime' + exon_first_3prime: str = 
'exon_first_3prime' + break1_strand: str = 'break1_strand' + break1_seq: str = 'break1_seq' + break2_chromosome: str = 'break2_chromosome' + break2_position_start: str = 'break2_position_start' + break2_position_end: str = 'break2_position_end' + break2_orientation: str = 'break2_orientation' + break2_strand: str = 'break2_strand' + break2_seq: str = 'break2_seq' + opposing_strands: str = 'opposing_strands' + stranded: str = 'stranded' + protocol: str = 'protocol' + disease_status: str = 'disease_status' + tools: str = 'tools' + call_method: str = 'call_method' + break1_ewindow: str = 'break1_ewindow' + break1_ewindow_count: str = 'break1_ewindow_count' + break1_ewindow_practical_coverage: str = 'break1_ewindow_practical_coverage' + break1_homologous_seq: str = 'break1_homologous_seq' + break1_split_read_names: str = 'break1_split_read_names' + break1_split_reads: str = 'break1_split_reads' + break1_split_reads_forced: str = 'break1_split_reads_forced' + break2_ewindow: str = 'break2_ewindow' + break2_ewindow_count: str = 'break2_ewindow_count' + break2_ewindow_practical_coverage: str = 'break2_ewindow_practical_coverage' + break2_homologous_seq: str = 'break2_homologous_seq' + break2_split_read_names: str = 'break2_split_read_names' + break2_split_reads: str = 'break2_split_reads' + break2_split_reads_forced: str = 'break2_split_reads_forced' + contig_alignment_query_consumption: str = 'contig_alignment_query_consumption' + contig_alignment_score: str = 'contig_alignment_score' + contig_alignment_query_name: str = 'contig_alignment_query_name' + contig_read_depth: str = 'contig_read_depth' + contig_break1_read_depth: str = 'contig_break1_read_depth' + contig_break2_read_depth: str = 'contig_break2_read_depth' + contig_alignment_rank: str = 'contig_alignment_rank' + contig_build_score: str = 'contig_build_score' + contig_remap_score: str = 'contig_remap_score' + contig_remap_coverage: str = 'contig_remap_coverage' + contig_remapped_read_names: str = 
'contig_remapped_read_names' + contig_remapped_reads: str = 'contig_remapped_reads' + contig_seq: str = 'contig_seq' + contig_strand_specific: str = 'contig_strand_specific' + contigs_assembled: str = 'contigs_assembled' + call_sequence_complexity: str = 'call_sequence_complexity' + spanning_reads: str = 'spanning_reads' + spanning_read_names: str = 'spanning_read_names' + flanking_median_fragment_size: str = 'flanking_median_fragment_size' + flanking_pairs: str = 'flanking_pairs' + flanking_pairs_compatible: str = 'flanking_pairs_compatible' + flanking_pairs_read_names: str = 'flanking_pairs_read_names' + flanking_pairs_compatible_read_names: str = 'flanking_pairs_compatible_read_names' + flanking_stdev_fragment_size: str = 'flanking_stdev_fragment_size' + linking_split_read_names: str = 'linking_split_read_names' + linking_split_reads: str = 'linking_split_reads' + raw_break1_half_mapped_reads: str = 'raw_break1_half_mapped_reads' + raw_break1_split_reads: str = 'raw_break1_split_reads' + raw_break2_half_mapped_reads: str = 'raw_break2_half_mapped_reads' + raw_break2_split_reads: str = 'raw_break2_split_reads' + raw_flanking_pairs: str = 'raw_flanking_pairs' + raw_spanning_reads: str = 'raw_spanning_reads' + untemplated_seq: str = 'untemplated_seq' + filter_comment: str = 'filter_comment' + cdna_synon: str = 'cdna_synon' + protein_synon: str = 'protein_synon' + supplementary_call: str = 'supplementary_call' + net_size: str = 'net_size' + repeat_count: str = 'repeat_count' + assumed_untemplated: str = 'assumed_untemplated' def sort_columns(input_columns): diff --git a/mavis/main.py b/mavis/main.py index dfa9f127..1d86fc17 100644 --- a/mavis/main.py +++ b/mavis/main.py @@ -158,7 +158,7 @@ def create_parser(argv): action=_config.RangeAppendAction, ) - return parser, _util.MavisNamespace(**parser.parse_args(argv).__dict__) + return parser, parser.parse_args(argv) def main(argv=None): diff --git a/mavis/pairing/constants.py b/mavis/pairing/constants.py index 
eebeffec..69077f4d 100644 --- a/mavis/pairing/constants.py +++ b/mavis/pairing/constants.py @@ -1,12 +1,12 @@ +from typing import Dict + from ..constants import CALL_METHOD, MavisNamespace from ..schemas import DEFAULTS -PAIRING_DISTANCES = MavisNamespace( - **{ - CALL_METHOD.FLANK: DEFAULTS['pairing.flanking_call_distance'], - CALL_METHOD.SPAN: DEFAULTS['pairing.spanning_call_distance'], - CALL_METHOD.SPLIT: DEFAULTS['pairing.split_call_distance'], - CALL_METHOD.CONTIG: DEFAULTS['pairing.contig_call_distance'], - CALL_METHOD.INPUT: DEFAULTS['pairing.input_call_distance'], - } -) +PAIRING_DISTANCES: Dict[str, int] = { + CALL_METHOD.FLANK: DEFAULTS['pairing.flanking_call_distance'], + CALL_METHOD.SPAN: DEFAULTS['pairing.spanning_call_distance'], + CALL_METHOD.SPLIT: DEFAULTS['pairing.split_call_distance'], + CALL_METHOD.CONTIG: DEFAULTS['pairing.contig_call_distance'], + CALL_METHOD.INPUT: DEFAULTS['pairing.input_call_distance'], +} diff --git a/mavis/summary/constants.py b/mavis/summary/constants.py index 27de3695..4ac58d00 100644 --- a/mavis/summary/constants.py +++ b/mavis/summary/constants.py @@ -2,16 +2,16 @@ HOMOPOLYMER_MIN_LENGTH = 3 -PAIRING_STATE = MavisNamespace( - EXP='expressed', - NO_EXP='not expressed', - SOMATIC='somatic', - GERMLINE='germline', - CO_EXP='co-expressed', - GERMLINE_EXP='germline expression', - SOMATIC_EXP='somatic expression', - MATCH='matched', - NO_MATCH='not matched', - GENOMIC='genomic support', - NO_GENOMIC='no genomic support', -) + +class PAIRING_STATE(MavisNamespace): + EXP = 'expressed' + NO_EXP = 'not expressed' + SOMATIC = 'somatic' + GERMLINE = 'germline' + CO_EXP = 'co-expressed' + GERMLINE_EXP = 'germline expression' + SOMATIC_EXP = 'somatic expression' + MATCH = 'matched' + NO_MATCH = 'not matched' + GENOMIC = 'genomic support' + NO_GENOMIC = 'no genomic support' diff --git a/mavis/tools/constants.py b/mavis/tools/constants.py index 3df79b7d..d2412dfa 100644 --- a/mavis/tools/constants.py +++ b/mavis/tools/constants.py 
@@ -1,32 +1,35 @@ -from ..constants import MavisNamespace, SVTYPE +from ..constants import SVTYPE, MavisNamespace -SUPPORTED_TOOL = MavisNamespace( - MANTA='manta', - DELLY='delly', - TA='transabyss', - PINDEL='pindel', - CHIMERASCAN='chimerascan', - MAVIS='mavis', - DEFUSE='defuse', - BREAKDANCER='breakdancer', - VCF='vcf', - BREAKSEQ='breakseq', - CNVNATOR='cnvnator', - STRELKA='strelka', - STARFUSION='starfusion', -) -""" -Supported Tools used to call SVs and then used as input into MAVIS -- chimerascan [Iyer-2011]_ -- defuse [McPherson-2011]_ -- delly [Rausch-2012]_ -- manta [Chen-2016]_ -- pindel [Ye-2009]_ -- transabyss [Robertson-2010]_ -""" +class SUPPORTED_TOOL(MavisNamespace): + """ + Supported Tools used to call SVs and then used as input into MAVIS + + Attributes: + CHIMERASCAN: chimerascan [Iyer-2011]_ + DEFUSE: defuse [McPherson-2011]_ + DELLY: delly [Rausch-2012]_ + MANTA: manta [Chen-2016]_ + PINDEL: pindel [Ye-2009]_ + TA: transabyss [Robertson-2010]_ + """ + + MANTA = 'manta' + DELLY = 'delly' + TA = 'transabyss' + PINDEL = 'pindel' + CHIMERASCAN = 'chimerascan' + MAVIS = 'mavis' + DEFUSE = 'defuse' + BREAKDANCER = 'breakdancer' + VCF = 'vcf' + BREAKSEQ = 'breakseq' + CNVNATOR = 'cnvnator' + STRELKA = 'strelka' + STARFUSION = 'starfusion' + -TOOL_SVTYPE_MAPPING = {v: [v] for v in SVTYPE.values()} +TOOL_SVTYPE_MAPPING = {v: [v] for v in SVTYPE.values()} # type: ignore TOOL_SVTYPE_MAPPING.update( { 'DEL': [SVTYPE.DEL], diff --git a/mavis/util.py b/mavis/util.py index db673fa2..8cf5c558 100644 --- a/mavis/util.py +++ b/mavis/util.py @@ -15,7 +15,8 @@ from tab import tab from .breakpoint import Breakpoint, BreakpointPair -from .constants import COLUMNS, ORIENT, PROTOCOL, STRAND, SVTYPE, MavisNamespace, sort_columns +from .constants import (COLUMNS, ORIENT, PROTOCOL, STRAND, SVTYPE, + MavisNamespace, sort_columns) from .error import InvalidRearrangement from .interval import Interval @@ -132,11 +133,6 @@ def get_env_variable(arg, default, 
cast_type=None): return default -class WeakMavisNamespace(MavisNamespace): - def is_env_overwritable(self, attr): - return True - - def bash_expands(*expressions): """ expand a file glob expression, allowing bash-style brackets. @@ -169,7 +165,7 @@ def log_arguments(args): """ LOG('arguments', time_stamp=True) with LOG.indent() as log: - for arg, val in sorted(args.items()): + for arg, val in sorted(args.__dict__.items()): if isinstance(val, list): if len(val) <= 1: log(arg, '= {}'.format(val)) diff --git a/tests/unit/test_constants.py b/tests/unit/test_constants.py index 9da45677..b69571db 100644 --- a/tests/unit/test_constants.py +++ b/tests/unit/test_constants.py @@ -1,11 +1,12 @@ import unittest + from mavis.constants import ( COLUMNS, - MavisNamespace, ORIENT, + STRAND, + MavisNamespace, reverse_complement, sort_columns, - STRAND, translate, ) @@ -51,70 +52,3 @@ def test_column_matches_column_name(self): self.assertEqual(COLUMNS.library, COLUMNS.library) s = set([COLUMNS.library, COLUMNS.library]) self.assertEqual(1, len(s)) - - -class TestMavisNamespace(unittest.TestCase): - def setUp(self): - self.namespace = MavisNamespace(a=1, b=2, c=3) - - def test_get_item(self): - self.assertEqual(1, self.namespace['a']) - self.assertEqual(1, self.namespace.a) - self.assertEqual(1, self.namespace.get('a', None)) - - def test_to_dict(self): - self.assertEqual({'a': 1, 'b': 2, 'c': 3}, self.namespace.to_dict()) - - def test_get_with_default(self): - self.assertEqual(4, self.namespace.get('d', 4)) - - def test_get_without_default_errors(self): - self.assertEqual(None, self.namespace.get('d', None)) - - def test_error_on_undefined(self): - with self.assertRaises(KeyError): - self.namespace.define('a') - - def test_infered_typing(self): - self.assertEqual(int, self.namespace.type('a')) - - def test_keys(self): - self.assertEqual(['a', 'b', 'c'], self.namespace.keys()) - - def test_add(self): - self.namespace.add('d', 4, defn='this is the letter d', cast_type=float) - 
self.assertEqual(float, self.namespace.type('d')) - self.assertEqual('this is the letter d', self.namespace.define('d')) - self.assertEqual(4, self.namespace.d) - - def test_add_infer_type(self): - self.namespace.add('d', 4, defn='this is the letter d') - self.assertEqual(int, self.namespace.type('d')) - self.assertEqual('this is the letter d', self.namespace.define('d')) - self.assertEqual(4, self.namespace.d) - - def test_error_on_enforce_bad_value(self): - with self.assertRaises(KeyError): - self.namespace.enforce(5) - - def test_reverse(self): - self.assertEqual('a', self.namespace.reverse(1)) - - def test_reverse_nonunique_error(self): - self.namespace['d'] = 1 - with self.assertRaises(KeyError): - self.namespace.reverse(1) - - def test_reverse_bad_value_error(self): - with self.assertRaises(KeyError): - self.namespace.reverse(5) - - def test_get_argument_error(self): - with self.assertRaises(TypeError): - self.namespace.get('a', 1, 1) - with self.assertRaises(AttributeError): - self.namespace.get('d') - - def test_iterating(self): - for act, exp in zip(self.namespace, ['a', 'b', 'c']): - self.assertEqual(exp, act) diff --git a/tests/unit/test_util.py b/tests/unit/test_util.py index 00d8c90d..62b6910e 100644 --- a/tests/unit/test_util.py +++ b/tests/unit/test_util.py @@ -4,13 +4,11 @@ from mavis.constants import COLUMNS, ORIENT, STRAND from mavis.error import NotSpecifiedError from mavis.util import ( - cast, ENV_VAR_PREFIX, + cast, + get_connected_components, get_env_variable, - MavisNamespace, - WeakMavisNamespace, read_bpp_from_input_file, - get_connected_components, ) from .mock import Mock @@ -74,57 +72,6 @@ def test_needs_casting(self): self.assertEqual(15, get_env_variable('test_env', 1)) -class TestMavisNamespace(unittest.TestCase): - def setUp(self): - self.namespace = MavisNamespace() - - def test_item_getter(self): - self.namespace.thing = 2 - self.assertEqual(2, self.namespace['thing']) - self.assertEqual(2, self.namespace.thing) - - def 
test_items(self): - print(self.namespace) - self.namespace.thing = 2 - print(self.namespace) - self.namespace.otherthing = 3 - print(self.namespace) - self.assertEqual({'thing': 2, 'otherthing': 3}, self.namespace._members) - self.assertEqual([('otherthing', 3), ('thing', 2)], list(sorted(self.namespace.items()))) - - -class TestWeakMavisNamespace(unittest.TestCase): - def setUp(self): - self.namespace = WeakMavisNamespace(a=1, b=2, c=3) - print(self.namespace._members) - for v in ['a', 'b', 'c']: - v = ENV_VAR_PREFIX + v.upper() - if v in os.environ: - del os.environ[v] - - def test_no_env_set(self): - self.assertEqual(1, self.namespace.a) - self.assertEqual(1, self.namespace['a']) - - def test_env_overrides_default(self): - os.environ['MAVIS_A'] = '5' - env_name = self.namespace.get_env_name('a') - self.assertEqual('MAVIS_A', env_name) - self.assertEqual('5', os.environ[env_name]) - self.assertTrue(self.namespace.is_env_overwritable('a')) - self.assertEqual(5, self.namespace.a) - self.assertEqual(1, self.namespace._members['a']) - self.assertEqual(5, self.namespace['a']) - - def test_error_on_invalid_attr(self): - with self.assertRaises(AttributeError): - self.namespace.other - - def test_iterate_keys(self): - self.assertEqual(['a', 'b', 'c'], list(self.namespace.keys())) - self.assertEqual(['a', 'b', 'c'], [k for k in self.namespace]) - - class TestReadBreakpointPairsFromFile(unittest.TestCase): def build_filehandle(self, row): header = [c for c in row] From f88b6c4cd26e0f91b715507c2fa4c17871d9c84b Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 20 Apr 2021 17:02:52 -0700 Subject: [PATCH 010/137] Update documentation --- docs/configuration/general.md | 68 +- docs/configuration/pipeline.md | 122 +-- docs/configuration/settings.md | 1829 +++++++++++++++++++++----------- docs/hooks.py | 109 +- docs/outputs/columns.md | 24 +- docs/tutorials/full.md | 289 +++-- 6 files changed, 1422 insertions(+), 1019 deletions(-) diff --git a/docs/configuration/general.md 
b/docs/configuration/general.md index 443cedee..e22f5339 100644 --- a/docs/configuration/general.md +++ b/docs/configuration/general.md @@ -4,54 +4,31 @@ An exhaustive list of the various configurable settings can be found [here](../s ## Pipeline Configuration File -The pipeline can be run in steps or it can be configured using a +The pipeline can be run in steps or it can be configured using a JSON configuration file and setup in a single step. Scripts will be generated -to run all steps following clustering. The configuration file can be -built from scratch or a template can be output as shown below +to run all steps following clustering. -```bash -mavis config --write template.cfg -``` +The config schema is found in the mavis package under `mavis/schemas/config.json` -This will create a template config file called template.cfg which can -then be edited by the user. However this will be a simple config with no -library information. To generate a configuration file with the library -information as well as estimates for the fragment size parameters more -inputs are required (see -[generating the config file](../../tutorials/full/#generating-the-config-file) for more information). - -## Environment Variables - -Most of the default settings can be changed by using environment -variables. The value given by the environment variables will be used as -the new default. Config or command-line parameters will still override -these settings. - -All environment variables are prefixed with MAVIS and an underscore. -Otherwise the variable name is the same as that used for the command -line parameter or config setting (uppercased). For example to change the -default minimum mapping quality used during the validate stage - -```bash -export MAVIS_MIN_MAPPING_QUALITY=10 -``` +Top level settings follow the pattern `<section>.<setting>`. The convert and library +sections are nested objects. ## Adjusting the Resource Requirements ### Choosing the Number of Validation/Annotation Jobs MAVIS chooses the number of jobs to split validate/annotate stages into -based on two settings: [max_files](../../configuration/settings/#max_files) and -[min_clusters_per_file](../../configuration/settings/#min-clusters-per-file). +based on two settings: [cluster.max_files](../../configuration/settings/#clustermax_files) and +[cluster.min_clusters_per_file](../../configuration/settings/#clustermin-clusters-per-file). For example, in the following situation say you have: 1000 clusters, -`max_files=10`, and `min_clusters_per_file=10`. Then MAVIS will set up +`cluster.max_files=10`, and `cluster.min_clusters_per_file=10`. Then MAVIS will set up 10 validation jobs each with 100 events. -However, if `min_clusters_per_file=500`, then MAVIS would only set up 2 +However, if `cluster.min_clusters_per_file=500`, then MAVIS would only set up 2 jobs each with 500 events. This is because -[min_clusters_per_file](../../configuration/settings/#min-clusters-per-file) takes precedence -over [max_files](../../configuration/settings/#max_files). +[cluster.min_clusters_per_file](../../configuration/settings/#clustermin-clusters-per-file) takes precedence +over [cluster.max_files](../../configuration/settings/#clustermax_files). Splitting into more jobs will lower the resource requirements per job (see [resource requirements](../performance/)). The memory and time requirements for validation are linear @@ -60,27 +37,8 @@ with respect to the number of events to be validated. ### Uninformative Filter For example, if the user is only interested in events in genes, then the -[uninformative_filter](../../configuration/settings/#uninformative_filter) can be used. This +[cluster.uninformative_filter](../../configuration/settings/#clusteruninformative_filter) can be used.
This will drop all events that are not within a certain distance -([max_proximity](../../configuration/settings/#max_proximity)) to any annotation in +([cluster.max_proximity](../../configuration/settings/#clustermax_proximity)) to any annotation in the annotations reference file. These events will be dropped prior to the validation stage which results in significant speed up. - -This can be set using the environment variable - -```bash -export MAVIS_UNINFORMATIVE_FILTER=True -``` - -or in the pipeline config file - -```text -[cluster] -uninformative_filter = True -``` - -or as a command line argument to the cluster stage - -```bash -mavis cluster --uninformative_filter True .... -``` diff --git a/docs/configuration/pipeline.md b/docs/configuration/pipeline.md index 76ffdbbf..d8073a1c 100644 --- a/docs/configuration/pipeline.md +++ b/docs/configuration/pipeline.md @@ -5,135 +5,21 @@ MAVIS v3 uses [snakemake](https://snakemake.readthedocs.io/en/stable/) to handle job scheduling and setup -The setup step of MAVIS is set up to use a job scheduler on a -compute cluster. will generate submission scripts and a wrapper bash -script for the user to execute on their cluster head node. - -![](../images/pipeline_options.svg) - The MAVIS pipeline is highly configurable. Some pipeline steps (cluster, validate) are optional and can be automatically skipped. The standard pipeline is far-left. - -The most common use case is -[auto-generating a configuration file](../../tutorials/full/#generating-the-config-file) and then running the pipeline setup step. The pipeline setup -step will run clustering and create scripts for running the other steps. +The most common use case is running the pipeline through snakemake ```bash -mavis config .... -w config.cfg -mavis setup config.cfg -o /path/to/top/output_dir +snakemake -j <MAX JOBS> --configfile <YOUR CONFIG> ``` -This will create the build.cfg configuration file, which is used by the -scheduler to submit jobs.
To use a particular scheduler you will need to -set the `MAVIS_SCHEDULER` environment variable. After the -build configuration file has been created you can run the mavis schedule -option to submit your jobs +If you are submitting to a cluster, use the [snakemake profiles](https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles) ```bash -ssh cluster_head_node -mavis schedule -o /path/to/output_dir --submit +snakemake -j <MAX JOBS> --configfile <YOUR CONFIG> --profile <PROFILE NAME> ``` This will submit a series of jobs with dependencies. - -![](../images/pipeline_dependency_graph.svg) - -Dependency graph of MAVIS jobs for the standard pipeline setup. The -notation on the arrows indicates the SLURM setting on the job to add the -dependency on the previous -job. - - -### Configuring Scheduler Settings - -There are multiple ways to configure the scheduler settings. Some of the -configurable options are listed below - -- [MAVIS_QUEUE](../../configuration/settings/#queue) -- [MAVIS_MEMORY_LIMIT](../../configuration/settings/#memory_limit) -- [MAVIS_TIME_LIMIT](../../configuration/settings/#time_limit) -- [MAVIS_IMPORT_ENV](../../configuration/settings/#import_env) -- [MAVIS_SCHEDULER](../../configuration/settings/#scheduler) - -For example to set the job queue default using an -[environment variable](../../configuration/general/#environment-variables) - -```bash -export MAVIS_QUEUE=QUEUENAME -``` - -Or it can also be added to the config file manually - - [schedule] - queue = QUEUENAME - -### Troubleshooting Dependency Failures - -The most common error to occur when running MAVIS on the cluster is a -memory or time limit exception. These can be detected by running the -schedule step or looking for dependency failures reported on the -cluster. The suffix of the job name will be a number and will correspond -to the suffix of the job directory. - -```bash -mavis schedule -o /path/to/output/dir -``` - -This will report any failed jobs.
For example if this were a crash -report for one of the validation jobs we might expect to see something -like below in the schedule output - - [2018-05-31 13:02:06] validate - MV__- () is FAILED - CRASH: - -Any jobs in an error, failed, etc. state can be resubmitted by running -mavis schedule with the resubmit flag - -```bash -mavis schedule -o /path/to/output/dir --resubmit -``` - -If a job has failed due to memory or time limits, editing the -`/path/to/output/dir/build.cfg` file can allow the user to change a job -without resetting up and rerunning the other jobs. For example, below is -the configuration for a validation job - - [MV_mock-A47933_batch-D2nTiy9AhGye4UZNapAik6] - stage = validate - job_ident = 1691742 - name = MV_mock-A47933_batch-D2nTiy9AhGye4UZNapAik6 - dependencies = - script = /path/to/output/dir/mock-A47933_diseased_transcriptome/validate/submit.sh - status = FAILED - output_dir = /path/to/output/dir/mock-A47933_diseased_transcriptome/validate/batch-D2nTiy9AhGye4UZNapAik6-{task_ident} - stdout = /path/to/output/dir/mock-A47933_diseased_transcriptome/validate/batch-D2nTiy9AhGye4UZNapAik6-{task_ident}/job-{name}-{job_ident}-{task_ident}.log - created_at = 1527641526 - status_comment = - memory_limit = 18000 - queue = short - time_limit = 57600 - import_env = True - mail_user = - mail_type = NONE - concurrency_limit = None - task_list = 1 - 2 - 3 - -The memory\_limit is in Mb and the time\_limit is in seconds. Editing -the values here will cause the job to be resubmitted with the new -values. - -!!! warning - Incorrectly editing the build.cfg file may have unanticipated results - and require re-setting up MAVIS to fix. Generally the user should ONLY - edit `memory_limit` and `time_limit` values. 
- - If memory errors are frequent then it would be better to adjust the - default values ([trans_validation_memory](../../configuration/settings/#trans_validation_memory), - [validation_memory](../../configuration/settings/#validation_memory), - [time_limit](../../configuration/settings/#time_limit)) diff --git a/docs/configuration/settings.md b/docs/configuration/settings.md index 1921e93b..9b388424 100644 --- a/docs/configuration/settings.md +++ b/docs/configuration/settings.md @@ -1,1211 +1,1798 @@ # Configurable Settings -## aligner +## annotate.annotation_filters -**type**: `#!python mavis.align.SUPPORTED_ALIGNER` +**type**: `#!python List[str]` -**environment variable**: `MAVIS_ALIGNER` +**default**: `#!python ['choose_more_annotated', 'choose_transcripts_by_priority']` -**default**: `#!python 'blat'` +A comma separated list of filters to apply to putative annotations -**accepted values**: `'bwa mem'`, `'blat'` +**schema definition**: +```json +{ + "items": { + "enum": [ + "choose_more_annotated", + "choose_transcripts_by_priority" + ], + "type": "string" + }, + "type": "array" +} +``` -The aligner to use to map the contigs/reads back to the reference e.g blat or bwa - +## annotate.draw_fusions_only -## aligner_reference +**type**: `#!python boolean` -**type**: `#!python filepath` +**default**: `#!python True` -**environment variable**: `MAVIS_ALIGNER_REFERENCE` +Flag to indicate if events which do not produce a fusion transcript should produce illustrations -**default**: `#!python None` +**schema definition**: +```json +{ + "type": "boolean" +} +``` -Path to the aligner reference file used for aligning the contig sequences - -## annotation_filters +## annotate.draw_non_synonymous_cdna_only -**type**: `#!python str` +**type**: `#!python boolean` + +**default**: `#!python True` -**environment variable**: `MAVIS_ANNOTATION_FILTERS` +Flag to indicate if events which are synonymous at the cdna level should produce illustrations -**default**: `#!python 
'choose_more_annotated,choose_transcripts_by_priority'` +**schema definition**: +```json +{ + "type": "boolean" +} +``` -A comma separated list of filters to apply to putative annotations - -## annotation_memory +## annotate.max_orf_cap **type**: `#!python int` -**environment variable**: `MAVIS_ANNOTATION_MEMORY` +**default**: `#!python 3` -**default**: `#!python 12000` +The maximum number of orfs to return (best putative orfs will be retained) -Default memory limit (mb) for the annotation stage - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## annotations -**type**: `#!python filepath` +## annotate.min_domain_mapping_match -**environment variable**: `MAVIS_ANNOTATIONS` +**type**: `#!python number` -**default**: `#!python []` +**default**: `#!python 0.9` -Path to the reference annotations of genes, transcript, exons, domains, etc - +A number between 0 and 1 representing the minimum percent match a domain must map to the fusion transcript to be displayed -## assembly_kmer_size +**schema definition**: +```json +{ + "maximum": 1, + "minimum": 0, + "type": "number" +} +``` -**type**: `#!python float_fraction` -**environment variable**: `MAVIS_ASSEMBLY_KMER_SIZE` +## annotate.min_orf_size -**default**: `#!python 0.74` +**type**: `#!python int` -The percent of the read length to make kmers for assembly - +**default**: `#!python 300` + +The minimum length (in base pairs) to retain a putative open reading frame (orf) + +**schema definition**: +```json +{ + "type": "integer" +} +``` + + +## bam_stats.distribution_fraction + +**type**: `#!python number` + +**default**: `#!python 0.97` -## assembly_max_paths +the proportion of the distribution to use in computing stdev + +**schema definition**: +```json +{ + "maximum": 1, + "minimum": 0.01, + "type": "number" +} +``` + + +## bam_stats.sample_bin_size **type**: `#!python int` -**environment variable**: `MAVIS_ASSEMBLY_MAX_PATHS` +**default**: `#!python 1000` -**default**: `#!python 8` +how large to make the 
sample bin (in bp) + +**schema definition**: +```json +{ + "type": "integer" +} +``` -The maximum number of paths to resolve. this is used to limit when there is a messy assembly graph to resolve. the assembly will pre-calculate the number of paths (or putative assemblies) and stop if it is greater than the given setting - -## assembly_min_edge_trim_weight +## bam_stats.sample_cap **type**: `#!python int` -**environment variable**: `MAVIS_ASSEMBLY_MIN_EDGE_TRIM_WEIGHT` +**default**: `#!python 1000` -**default**: `#!python 3` +maximum number of reads to collect for any given sample region + +**schema definition**: +```json +{ + "type": "integer" +} +``` -This is used to simplify the debruijn graph before path finding. edges with less than this frequency will be discarded if they are non-cutting, at a fork, or the end of a path - -## assembly_min_exact_match_to_remap +## bam_stats.sample_size **type**: `#!python int` -**environment variable**: `MAVIS_ASSEMBLY_MIN_EXACT_MATCH_TO_REMAP` +**default**: `#!python 500` -**default**: `#!python 15` +the number of genes/bins to compute stats over -The minimum length of exact matches to initiate remapping a read to a contig - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## assembly_min_remap_coverage -**type**: `#!python float_fraction` +## cluster.cluster_initial_size_limit -**environment variable**: `MAVIS_ASSEMBLY_MIN_REMAP_COVERAGE` +**type**: `#!python int` -**default**: `#!python 0.9` +**default**: `#!python 25` + +The maximum cumulative size of both breakpoints for breakpoint pairs to be used in the initial clustering phase (combining based on overlap) + +**schema definition**: +```json +{ + "type": "integer" +} +``` -Minimum fraction of the contig sequence which the remapped sequences must align over - -## assembly_min_remapped_seq +## cluster.cluster_radius **type**: `#!python int` -**environment variable**: `MAVIS_ASSEMBLY_MIN_REMAPPED_SEQ` +**default**: `#!python 100` -**default**: `#!python 3` 
+Maximum distance allowed between paired breakpoint pairs -The minimum input sequences that must remap for an assembled contig to be used - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## assembly_min_uniq -**type**: `#!python float_fraction` +## cluster.limit_to_chr -**environment variable**: `MAVIS_ASSEMBLY_MIN_UNIQ` +**type**: `#!python Union[List, null]` -**default**: `#!python 0.1` +**default**: `#!python ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y']` -Minimum percent uniq required to keep separate assembled contigs. if contigs are more similar then the lower scoring, then shorter, contig is dropped - +A list of chromosome names to use. breakpoint pairs on other chromosomes will be filtered out. for example '1 2 3 4' would filter out events/breakpoint pairs on any chromosomes but 1, 2, 3, and 4 -## assembly_strand_concordance +**schema definition**: +```json +{ + "items": { + "type": "string" + }, + "type": [ + "array", + "null" + ] +} +``` -**type**: `#!python float_fraction` -**environment variable**: `MAVIS_ASSEMBLY_STRAND_CONCORDANCE` +## cluster.max_files -**default**: `#!python 0.51` +**type**: `#!python int` -When the number of remapped reads from each strand are compared, the ratio must be above this number to decide on the strand - +**default**: `#!python 200` + +The maximum number of files to output from clustering/splitting + +**schema definition**: +```json +{ + "minimum": 1, + "type": "integer" +} +``` -## blat_limit_top_aln + +## cluster.max_proximity **type**: `#!python int` -**environment variable**: `MAVIS_BLAT_LIMIT_TOP_ALN` +**default**: `#!python 5000` -**default**: `#!python 10` +The maximum distance away from an annotation before the region is considered to be uninformative -Number of results to return from blat (ranking based on score) - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## blat_min_identity -**type**:
`#!python float_fraction` +## cluster.min_clusters_per_file -**environment variable**: `MAVIS_BLAT_MIN_IDENTITY` +**type**: `#!python int` -**default**: `#!python 0.9` +**default**: `#!python 50` -The minimum percent identity match required for blat results when aligning contigs - +The minimum number of breakpoint pairs to output to a file -## breakpoint_color +**schema definition**: +```json +{ + "minimum": 1, + "type": "integer" +} +``` + + +## cluster.split_only + +**type**: `#!python boolean` + +**default**: `#!python False` + +just split the input files, do not merge input breakpoints into clusters + +**schema definition**: +```json +{ + "type": "boolean" +} +``` + + +## cluster.uninformative_filter + +**type**: `#!python boolean` + +**default**: `#!python False` + +Flag that determines if breakpoint pairs which are not within max_proximity to any annotations are filtered out prior to clustering + +**schema definition**: +```json +{ + "type": "boolean" +} +``` -**type**: `#!python str` -**environment variable**: `MAVIS_BREAKPOINT_COLOR` +## illustrate.breakpoint_color + +**type**: `#!python str` **default**: `#!python '#000000'` Breakpoint outline color - -## call_error +**schema definition**: +```json +{ + "type": "string" +} +``` -**type**: `#!python int` -**environment variable**: `MAVIS_CALL_ERROR` +## illustrate.domain_color -**default**: `#!python 10` +**type**: `#!python str` -Buffer zone for the evidence window - +**default**: `#!python '#ccccb3'` -## clean_aligner_files +Domain fill color -**type**: `#!python cast_boolean` +**schema definition**: +```json +{ + "type": "string" +} +``` -**environment variable**: `MAVIS_CLEAN_ALIGNER_FILES` -**default**: `#!python False` +## illustrate.domain_mismatch_color -Remove the aligner output files after the validation stage is complete. 
not required for subsequent steps but can be useful in debugging and deep investigation of events - +**type**: `#!python str` -## cluster_initial_size_limit +**default**: `#!python '#b2182b'` -**type**: `#!python int` +Domain fill color on 0%% match -**environment variable**: `MAVIS_CLUSTER_INITIAL_SIZE_LIMIT` +**schema definition**: +```json +{ + "type": "string" +} +``` -**default**: `#!python 25` -The maximum cumulative size of both breakpoints for breakpoint pairs to be used in the initial clustering phase (combining based on overlap) - +## illustrate.domain_name_regex_filter + +**type**: `#!python str` -## cluster_radius +**default**: `#!python '^PF\\d+$'` -**type**: `#!python int` +The regular expression used to select domains to be displayed (filtered by name) -**environment variable**: `MAVIS_CLUSTER_RADIUS` +**schema definition**: +```json +{ + "type": "string" +} +``` -**default**: `#!python 100` -Maximum distance allowed between paired breakpoint pairs - +## illustrate.domain_scaffold_color -## concurrency_limit +**type**: `#!python str` -**type**: `#!python int` +**default**: `#!python '#000000'` -**environment variable**: `MAVIS_CONCURRENCY_LIMIT` +The color of the domain scaffold -**default**: `#!python None` +**schema definition**: +```json +{ + "type": "string" +} +``` -The concurrency limit for tasks in any given job array or the number of concurrent processes allowed for a local run - -## contig_aln_max_event_size +## illustrate.drawing_width_iter_increase **type**: `#!python int` -**environment variable**: `MAVIS_CONTIG_ALN_MAX_EVENT_SIZE` +**default**: `#!python 500` -**default**: `#!python 50` +The amount (in pixels) by which to increase the drawing width upon failure to fit + +**schema definition**: +```json +{ + "type": "integer" +} +``` -Relates to determining breakpoints when pairing contig alignments. for any given read in a putative pair the soft clipping is extended to include any events of greater than this size. 
the softclipping is added to the side of the alignment as indicated by the breakpoint we are assigning pairs to - -## contig_aln_merge_inner_anchor +## illustrate.exon_min_focus_size **type**: `#!python int` -**environment variable**: `MAVIS_CONTIG_ALN_MERGE_INNER_ANCHOR` +**default**: `#!python 10` -**default**: `#!python 20` +Minimum size of an exon for it to be granted a label or min exon width -The minimum number of consecutive exact match base pairs to not merge events within a contig alignment - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## contig_aln_merge_outer_anchor -**type**: `#!python int` +## illustrate.gene1_color -**environment variable**: `MAVIS_CONTIG_ALN_MERGE_OUTER_ANCHOR` +**type**: `#!python str` -**default**: `#!python 15` +**default**: `#!python '#657e91'` -Minimum consecutively aligned exact matches to anchor an end for merging internal events - +The color of genes near the first gene -## contig_aln_min_anchor_size +**schema definition**: +```json +{ + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" +} +``` -**type**: `#!python int` -**environment variable**: `MAVIS_CONTIG_ALN_MIN_ANCHOR_SIZE` +## illustrate.gene1_color_selected -**default**: `#!python 50` +**type**: `#!python str` -The minimum number of aligned bases for a contig (m or =) in order to simplify. 
do not have to be consecutive - +**default**: `#!python '#518dc5'` -## contig_aln_min_extend_overlap +The color of the first gene -**type**: `#!python int` +**schema definition**: +```json +{ + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" +} +``` -**environment variable**: `MAVIS_CONTIG_ALN_MIN_EXTEND_OVERLAP` -**default**: `#!python 10` +## illustrate.gene2_color -Minimum number of bases the query coverage interval must be extended by in order to pair alignments as a single split alignment - +**type**: `#!python str` -## contig_aln_min_query_consumption +**default**: `#!python '#325556'` -**type**: `#!python float_fraction` +The color of genes near the second gene -**environment variable**: `MAVIS_CONTIG_ALN_MIN_QUERY_CONSUMPTION` +**schema definition**: +```json +{ + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" +} +``` -**default**: `#!python 0.9` -Minimum fraction of the original query sequence that must be used by the read(s) of the alignment - +## illustrate.gene2_color_selected -## contig_aln_min_score +**type**: `#!python str` -**type**: `#!python float_fraction` +**default**: `#!python '#4c9677'` -**environment variable**: `MAVIS_CONTIG_ALN_MIN_SCORE` +The color of the second gene -**default**: `#!python 0.9` +**schema definition**: +```json +{ + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" +} +``` -Minimum score for a contig to be used as evidence in a call by contig - -## contig_call_distance +## illustrate.label_color -**type**: `#!python int` +**type**: `#!python str` -**environment variable**: `MAVIS_CONTIG_CALL_DISTANCE` +**default**: `#!python '#000000'` -**default**: `#!python 10` +The label color -The maximum distance allowed between breakpoint pairs (called by contig) in order for them to pair - +**schema definition**: +```json +{ + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" +} +``` + + +## illustrate.mask_fill -## dgv_annotation +**type**: `#!python str` + +**default**: `#!python '#ffffff'` + +Color of mask (for deleted 
region etc.) + +**schema definition**: +```json +{ + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" +} +``` + + +## illustrate.mask_opacity + +**type**: `#!python number` -**type**: `#!python filepath` +**default**: `#!python 0.7` + +Opacity of the mask layer -**environment variable**: `MAVIS_DGV_ANNOTATION` +**schema definition**: +```json +{ + "maximum": 1, + "minimum": 0, + "type": "number" +} +``` -**default**: `#!python []` -Path to the dgv reference processed to look like the cytoband file - +## illustrate.max_drawing_retries + +**type**: `#!python int` + +**default**: `#!python 5` + +The maximum number of retries for attempting a drawing. each iteration the width is extended. if it is still insufficient after this number a gene-level only drawing will be output -## domain_color +**schema definition**: +```json +{ + "type": "integer" +} +``` + + +## illustrate.novel_exon_color **type**: `#!python str` -**environment variable**: `MAVIS_DOMAIN_COLOR` +**default**: `#!python '#5D3F6A'` -**default**: `#!python '#ccccb3'` +Novel exon fill color + +**schema definition**: +```json +{ + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" +} +``` -Domain fill color - -## domain_mismatch_color +## illustrate.scaffold_color **type**: `#!python str` -**environment variable**: `MAVIS_DOMAIN_MISMATCH_COLOR` +**default**: `#!python '#000000'` -**default**: `#!python '#b2182b'` +The color used for the gene/transcripts scaffolds + +**schema definition**: +```json +{ + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" +} +``` -Domain fill color on 0%% match - -## domain_name_regex_filter +## illustrate.splice_color **type**: `#!python str` -**environment variable**: `MAVIS_DOMAIN_NAME_REGEX_FILTER` +**default**: `#!python '#000000'` -**default**: `#!python '^PF\\d+$'` +Splicing lines color -The regular expression used to select domains to be displayed (filtered by name) - +**schema definition**: +```json +{ + "pattern": "^#[a-zA-Z0-9]{6}", + "type": "string" +} +``` -## 
domain_scaffold_color + +## illustrate.width + +**type**: `#!python int` + +**default**: `#!python 1000` + +The drawing width in pixels + +**schema definition**: +```json +{ + "type": "integer" +} +``` + + +## log **type**: `#!python str` -**environment variable**: `MAVIS_DOMAIN_SCAFFOLD_COLOR` +**default**: `#!python None` -**default**: `#!python '#000000'` -The color of the domain scaffold - -## draw_fusions_only +**schema definition**: +```json +{ + "type": "string" +} +``` -**type**: `#!python cast_boolean` -**environment variable**: `MAVIS_DRAW_FUSIONS_ONLY` +## log_level -**default**: `#!python True` +**type**: `#!python str` -Flag to indicate if events which do not produce a fusion transcript should produce illustrations - +**default**: `#!python 'INFO'` -## draw_non_synonymous_cdna_only -**type**: `#!python cast_boolean` -**environment variable**: `MAVIS_DRAW_NON_SYNONYMOUS_CDNA_ONLY` +**schema definition**: +```json +{ + "enum": [ + "INFO", + "DEBUG" + ], + "type": "string" +} +``` -**default**: `#!python True` -Flag to indicate if events which are synonymous at the cdna level should produce illustrations - +## output_dir -## drawing_width_iter_increase +**type**: `#!python str` + +**default**: `#!python None` + + + +**schema definition**: +```json +{ + "type": "string" +} +``` + + +## pairing.contig_call_distance **type**: `#!python int` -**environment variable**: `MAVIS_DRAWING_WIDTH_ITER_INCREASE` +**default**: `#!python 10` -**default**: `#!python 500` +The maximum distance allowed between breakpoint pairs (called by contig) in order for them to pair + +**schema definition**: +```json +{ + "type": "integer" +} +``` -The amount (in pixels) by which to increase the drawing width upon failure to fit - -## exon_min_focus_size +## pairing.flanking_call_distance **type**: `#!python int` -**environment variable**: `MAVIS_EXON_MIN_FOCUS_SIZE` +**default**: `#!python 50` -**default**: `#!python 10` +The maximum distance allowed between breakpoint pairs (called 
by flanking pairs) in order for them to pair -Minimum size of an exon for it to be granted a label or min exon width - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## fetch_min_bin_size + +## pairing.input_call_distance **type**: `#!python int` -**environment variable**: `MAVIS_FETCH_MIN_BIN_SIZE` +**default**: `#!python 20` -**default**: `#!python 50` +The maximum distance allowed between breakpoint pairs (called by input tools, not validated) in order for them to pair -The minimum size of any bin for reading from a bam file. increasing this number will result in smaller bins being merged or less bins being created (depending on the fetch method) - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## fetch_reads_bins + +## pairing.spanning_call_distance **type**: `#!python int` -**environment variable**: `MAVIS_FETCH_READS_BINS` +**default**: `#!python 20` -**default**: `#!python 5` +The maximum distance allowed between breakpoint pairs (called by spanning reads) in order for them to pair + +**schema definition**: +```json +{ + "type": "integer" +} +``` -Number of bins to split an evidence window into to ensure more even sampling of high coverage regions - -## fetch_reads_limit +## pairing.split_call_distance **type**: `#!python int` -**environment variable**: `MAVIS_FETCH_READS_LIMIT` +**default**: `#!python 20` -**default**: `#!python 3000` +The maximum distance allowed between breakpoint pairs (called by split reads) in order for them to pair -Maximum number of reads, cap, to loop over for any given evidence window - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## filter_cdna_synon -**type**: `#!python cast_boolean` +## reference.aligner_reference -**environment variable**: `MAVIS_FILTER_CDNA_SYNON` +**type**: `#!python List[str]` -**default**: `#!python True` +**default**: `#!python None` -Filter all annotations synonymous at the cdna level - -## filter_min_complexity -**type**: `#!python float_fraction` 
+**schema definition**: +```json +{ + "examples": [ + "tests/data/mock_reference_genome.2bit" + ], + "items": { + "type": "string" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" +} +``` -**environment variable**: `MAVIS_FILTER_MIN_COMPLEXITY` -**default**: `#!python 0.2` +## reference.annotations -Filter event calls based on call sequence complexity - +**type**: `#!python List[str]` -## filter_min_flanking_reads +**default**: `#!python None` -**type**: `#!python int` -**environment variable**: `MAVIS_FILTER_MIN_FLANKING_READS` -**default**: `#!python 10` +**schema definition**: +```json +{ + "examples": [ + "tests/data/mock_annotations.json" + ], + "items": { + "type": "string" + }, + "minItems": 1, + "type": "array" +} +``` -Minimum number of flanking pairs for a call by flanking pairs - -## filter_min_linking_split_reads +## reference.dgv_annotation -**type**: `#!python int` +**type**: `#!python List[str]` -**environment variable**: `MAVIS_FILTER_MIN_LINKING_SPLIT_READS` +**default**: `#!python None` -**default**: `#!python 1` -Minimum number of linking split reads for a call by split reads - -## filter_min_remapped_reads +**schema definition**: +```json +{ + "examples": [ + [ + "tests/data/mock_dgv_annotation.txt" + ] + ], + "items": { + "type": "string" + }, + "minItems": 1, + "type": "array" +} +``` -**type**: `#!python int` -**environment variable**: `MAVIS_FILTER_MIN_REMAPPED_READS` +## reference.masking -**default**: `#!python 5` +**type**: `#!python List[str]` -Minimum number of remapped reads for a call by contig - +**default**: `#!python None` -## filter_min_spanning_reads -**type**: `#!python int` -**environment variable**: `MAVIS_FILTER_MIN_SPANNING_READS` +**schema definition**: +```json +{ + "examples": [ + [ + "tests/data/mock_masking.tab" + ] + ], + "items": { + "type": "string" + }, + "minItems": 1, + "type": "array" +} +``` -**default**: `#!python 5` -Minimum number of spanning reads for a call by spanning reads - +## 
reference.reference_genome -## filter_min_split_reads +**type**: `#!python List[str]` -**type**: `#!python int` +**default**: `#!python None` -**environment variable**: `MAVIS_FILTER_MIN_SPLIT_READS` -**default**: `#!python 5` -Minimum number of split reads for a call by split reads - +**schema definition**: +```json +{ + "examples": [ + [ + "tests/data/mock_reference_genome.fa" + ] + ], + "items": { + "type": "string" + }, + "minItems": 1, + "type": "array" +} +``` + + +## reference.template_metadata + +**type**: `#!python List[str]` + +**default**: `#!python None` + + -## filter_protein_synon +**schema definition**: +```json +{ + "examples": [ + [ + "tests/data/cytoBand.txt" + ] + ], + "items": { + "type": "string" + }, + "minItems": 1, + "type": "array" +} +``` -**type**: `#!python cast_boolean` -**environment variable**: `MAVIS_FILTER_PROTEIN_SYNON` +## skip_stage.validate + +**type**: `#!python boolean` **default**: `#!python False` -Filter all annotations synonymous at the protein level - +skip the validation stage of the MAVIS pipeline + +**schema definition**: +```json +{ + "type": "boolean" +} +``` + -## filter_secondary_alignments +## summary.filter_cdna_synon -**type**: `#!python cast_boolean` +**type**: `#!python boolean` + +**default**: `#!python True` + +Filter all annotations synonymous at the cdna level -**environment variable**: `MAVIS_FILTER_SECONDARY_ALIGNMENTS` +**schema definition**: +```json +{ + "type": "boolean" +} +``` -**default**: `#!python True` -Filter secondary alignments when gathering read evidence - +## summary.filter_min_complexity -## filter_trans_homopolymers +**type**: `#!python number` -**type**: `#!python cast_boolean` +**default**: `#!python 0.2` -**environment variable**: `MAVIS_FILTER_TRANS_HOMOPOLYMERS` +Filter event calls based on call sequence complexity -**default**: `#!python True` +**schema definition**: +```json +{ + "maximum": 1, + "minimum": 0, + "type": "number" +} +``` -Filter all single bp ins/del/dup events 
that are in a homopolymer region of at least 3 bps and are not paired to a genomic event - -## flanking_call_distance +## summary.filter_min_flanking_reads **type**: `#!python int` -**environment variable**: `MAVIS_FLANKING_CALL_DISTANCE` +**default**: `#!python 10` -**default**: `#!python 50` +Minimum number of flanking pairs for a call by flanking pairs -The maximum distance allowed between breakpoint pairs (called by flanking pairs) in order for them to pair - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## fuzzy_mismatch_number -**type**: `#!python int` +## summary.filter_min_linking_split_reads -**environment variable**: `MAVIS_FUZZY_MISMATCH_NUMBER` +**type**: `#!python int` **default**: `#!python 1` -The number of events/mismatches allowed to be considered a fuzzy match - +Minimum number of linking split reads for a call by split reads -## gene1_color +**schema definition**: +```json +{ + "type": "integer" +} +``` -**type**: `#!python str` -**environment variable**: `MAVIS_GENE1_COLOR` +## summary.filter_min_remapped_reads -**default**: `#!python '#657e91'` +**type**: `#!python int` -The color of genes near the first gene - +**default**: `#!python 5` -## gene1_color_selected +Minimum number of remapped reads for a call by contig -**type**: `#!python str` +**schema definition**: +```json +{ + "type": "integer" +} +``` -**environment variable**: `MAVIS_GENE1_COLOR_SELECTED` -**default**: `#!python '#518dc5'` +## summary.filter_min_spanning_reads -The color of the first gene - +**type**: `#!python int` -## gene2_color +**default**: `#!python 5` -**type**: `#!python str` +Minimum number of spanning reads for a call by spanning reads -**environment variable**: `MAVIS_GENE2_COLOR` +**schema definition**: +```json +{ + "type": "integer" +} +``` -**default**: `#!python '#325556'` -The color of genes near the second gene - +## summary.filter_min_split_reads -## gene2_color_selected +**type**: `#!python int` -**type**: `#!python str` +**default**: 
`#!python 5` -**environment variable**: `MAVIS_GENE2_COLOR_SELECTED` +Minimum number of split reads for a call by split reads -**default**: `#!python '#4c9677'` +**schema definition**: +```json +{ + "type": "integer" +} +``` -The color of the second gene - -## import_env +## summary.filter_protein_synon -**type**: `#!python cast_boolean` +**type**: `#!python boolean` -**environment variable**: `MAVIS_IMPORT_ENV` +**default**: `#!python False` -**default**: `#!python True` +Filter all annotations synonymous at the protein level -Flag to import environment variables - +**schema definition**: +```json +{ + "type": "boolean" +} +``` -## input_call_distance -**type**: `#!python int` +## summary.filter_trans_homopolymers -**environment variable**: `MAVIS_INPUT_CALL_DISTANCE` +**type**: `#!python boolean` -**default**: `#!python 20` +**default**: `#!python True` -The maximum distance allowed between breakpoint pairs (called by input tools, not validated) in order for them to pair - +Filter all single bp ins/del/dup events that are in a homopolymer region of at least 3 bps and are not paired to a genomic event -## label_color +**schema definition**: +```json +{ + "type": "boolean" +} +``` -**type**: `#!python str` -**environment variable**: `MAVIS_LABEL_COLOR` +## validate.aligner -**default**: `#!python '#000000'` +**type**: `#!python str` -The label color - +**default**: `#!python 'blat'` -## limit_to_chr +The aligner to use to map the contigs/reads back to the reference e.g. blat or bwa -**type**: `#!python str` +**schema definition**: +```json +{ + "enum": [ + "bwa mem", + "blat" + ], + "type": "string" +} +``` -**environment variable**: `MAVIS_LIMIT_TO_CHR` -**default**: `#!python ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y']` +## validate.assembly_kmer_size -A list of chromosome names to use. breakpointpairs on other chromosomes will be filteredout.
for example '1 2 3 4' would filter out events/breakpoint pairs on any chromosomes but 1, 2, 3, and 4 - +**type**: `#!python number` -## mail_type +**default**: `#!python 0.74` -**type**: `#!python mavis.schedule.constants.MAIL_TYPE` +The percent of the read length to make kmers for assembly -**environment variable**: `MAVIS_MAIL_TYPE` +**schema definition**: +```json +{ + "maximum": 1, + "minimum": 0, + "type": "number" +} +``` -**default**: `#!python 'NONE'` -**accepted values**: `'BEGIN'`, `'END'`, `'FAIL'`, `'ALL'`, `'NONE'` +## validate.assembly_max_paths +**type**: `#!python int` -When to notify the mail_user (if given) - +**default**: `#!python 8` -## mail_user +The maximum number of paths to resolve. this is used to limit when there is a messy assembly graph to resolve. the assembly will pre-calculate the number of paths (or putative assemblies) and stop if it is greater than the given setting -**type**: `#!python str` +**schema definition**: +```json +{ + "type": "integer" +} +``` -**environment variable**: `MAVIS_MAIL_USER` -**default**: `#!python ''` +## validate.assembly_min_edge_trim_weight -User(s) to send notifications to - +**type**: `#!python int` -## mask_fill +**default**: `#!python 3` -**type**: `#!python str` +This is used to simplify the debruijn graph before path finding. edges with less than this frequency will be discarded if they are non-cutting, at a fork, or the end of a path -**environment variable**: `MAVIS_MASK_FILL` +**schema definition**: +```json +{ + "type": "integer" +} +``` -**default**: `#!python '#ffffff'` -Color of mask (for deleted region etc.) 
- +## validate.assembly_min_exact_match_to_remap -## mask_opacity +**type**: `#!python int` -**type**: `#!python float_fraction` +**default**: `#!python 15` -**environment variable**: `MAVIS_MASK_OPACITY` +The minimum length of exact matches to initiate remapping a read to a contig -**default**: `#!python 0.7` +**schema definition**: +```json +{ + "type": "integer" +} +``` -Opacity of the mask layer - -## masking +## validate.assembly_min_remap_coverage + +**type**: `#!python number` -**type**: `#!python filepath` +**default**: `#!python 0.9` -**environment variable**: `MAVIS_MASKING` +Minimum fraction of the contig sequence which the remapped sequences must align over -**default**: `#!python []` +**schema definition**: +```json +{ + "maximum": 1, + "minimum": 0, + "type": "number" +} +``` -File containing regions for which input events overlapping them are dropped prior to validation - -## max_drawing_retries +## validate.assembly_min_remapped_seq **type**: `#!python int` -**environment variable**: `MAVIS_MAX_DRAWING_RETRIES` +**default**: `#!python 3` -**default**: `#!python 5` +The minimum input sequences that must remap for an assembled contig to be used -The maximum number of retries for attempting a drawing. each iteration the width is extended. if it is still insufficient after this number a gene-level only drawing will be output - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## max_files -**type**: `#!python int` +## validate.assembly_min_uniq -**environment variable**: `MAVIS_MAX_FILES` +**type**: `#!python number` -**default**: `#!python 200` +**default**: `#!python 0.1` -The maximum number of files to output from clustering/splitting - +Minimum percent uniq required to keep separate assembled contigs. 
if contigs are more similar then the lower scoring, then shorter, contig is dropped -## max_orf_cap +**schema definition**: +```json +{ + "maximum": 1, + "minimum": 0, + "type": "number" +} +``` -**type**: `#!python int` -**environment variable**: `MAVIS_MAX_ORF_CAP` +## validate.assembly_strand_concordance -**default**: `#!python 3` +**type**: `#!python number` -The maximum number of orfs to return (best putative orfs will be retained) - +**default**: `#!python 0.51` -## max_proximity +When the number of remapped reads from each strand are compared, the ratio must be above this number to decide on the strand -**type**: `#!python int` +**schema definition**: +```json +{ + "maximum": 1, + "minimum": 0, + "type": "number" +} +``` -**environment variable**: `MAVIS_MAX_PROXIMITY` -**default**: `#!python 5000` +## validate.blat_limit_top_aln -The maximum distance away from an annotation before the region in considered to be uninformative - +**type**: `#!python int` -## max_sc_preceeding_anchor +**default**: `#!python 10` -**type**: `#!python int` +Number of results to return from blat (ranking based on score) -**environment variable**: `MAVIS_MAX_SC_PRECEEDING_ANCHOR` +**schema definition**: +```json +{ + "type": "integer" +} +``` -**default**: `#!python 6` -When remapping a softclipped read this determines the amount of softclipping allowed on the side opposite of where we expect it. for example for a softclipped read on a breakpoint with a left orientation this limits the amount of softclipping that is allowed on the right. 
if this is set to none then there is no limit on softclipping - +## validate.blat_min_identity -## memory_limit +**type**: `#!python number` -**type**: `#!python int` +**default**: `#!python 0.9` -**environment variable**: `MAVIS_MEMORY_LIMIT` +The minimum percent identity match required for blat results when aligning contigs -**default**: `#!python 16000` +**schema definition**: +```json +{ + "maximum": 1, + "minimum": 0, + "type": "number" +} +``` -The maximum number of megabytes (mb) any given job is allowed - -## min_anchor_exact +## validate.call_error **type**: `#!python int` -**environment variable**: `MAVIS_MIN_ANCHOR_EXACT` +**default**: `#!python 10` -**default**: `#!python 6` +Buffer zone for the evidence window -Applies to re-aligning softclipped reads to the opposing breakpoint. the minimum number of consecutive exact matches to anchor a read to initiate targeted realignment - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## min_anchor_fuzzy -**type**: `#!python int` +## validate.clean_aligner_files -**environment variable**: `MAVIS_MIN_ANCHOR_FUZZY` +**type**: `#!python boolean` -**default**: `#!python 10` +**default**: `#!python False` -Applies to re-aligning softclipped reads to the opposing breakpoint. the minimum length of a fuzzy match to anchor a read to initiate targeted realignment - +Remove the aligner output files after the validation stage is complete. not required for subsequent steps but can be useful in debugging and deep investigation of events -## min_anchor_match +**schema definition**: +```json +{ + "type": "boolean" +} +``` -**type**: `#!python float_fraction` -**environment variable**: `MAVIS_MIN_ANCHOR_MATCH` +## validate.contig_aln_max_event_size -**default**: `#!python 0.9` +**type**: `#!python int` -Minimum percent match for a read to be kept as evidence - +**default**: `#!python 50` -## min_call_complexity +Relates to determining breakpoints when pairing contig alignments. 
for any given read in a putative pair the soft clipping is extended to include any events of greater than this size. the softclipping is added to the side of the alignment as indicated by the breakpoint we are assigning pairs to -**type**: `#!python float_fraction` +**schema definition**: +```json +{ + "type": "integer" +} +``` -**environment variable**: `MAVIS_MIN_CALL_COMPLEXITY` -**default**: `#!python 0.1` +## validate.contig_aln_merge_inner_anchor -The minimum complexity score for a call sequence. is an average for non-contig calls. filters low complexity contigs before alignment. see [contig_complexity](#contig_complexity) - +**type**: `#!python int` -## min_clusters_per_file +**default**: `#!python 20` -**type**: `#!python int` +The minimum number of consecutive exact match base pairs to not merge events within a contig alignment -**environment variable**: `MAVIS_MIN_CLUSTERS_PER_FILE` +**schema definition**: +```json +{ + "type": "integer" +} +``` -**default**: `#!python 50` -The minimum number of breakpoint pairs to output to a file - +## validate.contig_aln_merge_outer_anchor -## min_domain_mapping_match +**type**: `#!python int` -**type**: `#!python float_fraction` +**default**: `#!python 15` -**environment variable**: `MAVIS_MIN_DOMAIN_MAPPING_MATCH` +Minimum consecutively aligned exact matches to anchor an end for merging internal events -**default**: `#!python 0.9` +**schema definition**: +```json +{ + "type": "integer" +} +``` -A number between 0 and 1 representing the minimum percent match a domain must map to the fusion transcript to be displayed - -## min_double_aligned_to_estimate_insertion_size +## validate.contig_aln_min_anchor_size **type**: `#!python int` -**environment variable**: `MAVIS_MIN_DOUBLE_ALIGNED_TO_ESTIMATE_INSERTION_SIZE` +**default**: `#!python 50` -**default**: `#!python 2` +The minimum number of aligned bases for a contig (m or =) in order to simplify. 
do not have to be consecutive -The minimum number of reads which map soft-clipped to both breakpoints to assume the size of the untemplated sequence between the breakpoints is at most the read length - 2 * min_softclipping - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## min_flanking_pairs_resolution -**type**: `#!python int` +## validate.contig_aln_min_extend_overlap -**environment variable**: `MAVIS_MIN_FLANKING_PAIRS_RESOLUTION` +**type**: `#!python int` **default**: `#!python 10` -The minimum number of flanking reads required to call a breakpoint by flanking evidence - - -## min_linking_split_reads +Minimum number of bases the query coverage interval must be extended by in order to pair alignments as a single split alignment -**type**: `#!python int` +**schema definition**: +```json +{ + "type": "integer" +} +``` -**environment variable**: `MAVIS_MIN_LINKING_SPLIT_READS` -**default**: `#!python 2` +## validate.contig_aln_min_query_consumption -The minimum number of split reads which aligned to both breakpoints - +**type**: `#!python number` -## min_mapping_quality +**default**: `#!python 0.9` -**type**: `#!python int` +Minimum fraction of the original query sequence that must be used by the read(s) of the alignment -**environment variable**: `MAVIS_MIN_MAPPING_QUALITY` +**schema definition**: +```json +{ + "maximum": 1, + "minimum": 0, + "type": "number" +} +``` -**default**: `#!python 5` -The minimum mapping quality of reads to be used as evidence - +## validate.contig_aln_min_score -## min_non_target_aligned_split_reads +**type**: `#!python number` -**type**: `#!python int` +**default**: `#!python 0.9` -**environment variable**: `MAVIS_MIN_NON_TARGET_ALIGNED_SPLIT_READS` +Minimum score for a contig to be used as evidence in a call by contig -**default**: `#!python 1` +**schema definition**: +```json +{ + "maximum": 1, + "minimum": 0, + "type": "number" +} +``` -The minimum number of split reads aligned to a breakpoint by the input bam and 
no forced by local alignment to the target region to call a breakpoint by split read evidence - -## min_orf_size +## validate.fetch_min_bin_size **type**: `#!python int` -**environment variable**: `MAVIS_MIN_ORF_SIZE` +**default**: `#!python 50` + +The minimum size of any bin for reading from a bam file. increasing this number will result in smaller bins being merged or less bins being created (depending on the fetch method) -**default**: `#!python 300` +**schema definition**: +```json +{ + "type": "integer" +} +``` -The minimum length (in base pairs) to retain a putative open reading frame (orf) - -## min_sample_size_to_apply_percentage +## validate.fetch_reads_bins **type**: `#!python int` -**environment variable**: `MAVIS_MIN_SAMPLE_SIZE_TO_APPLY_PERCENTAGE` +**default**: `#!python 5` -**default**: `#!python 10` +Number of bins to split an evidence window into to ensure more even sampling of high coverage regions -Minimum number of aligned bases to compute a match percent. if there are less than this number of aligned bases (match or mismatch) the percent comparator is not used - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## min_softclipping + +## validate.fetch_reads_limit **type**: `#!python int` -**environment variable**: `MAVIS_MIN_SOFTCLIPPING` +**default**: `#!python 3000` -**default**: `#!python 6` +Maximum number of reads, cap, to loop over for any given evidence window -Minimum number of soft-clipped bases required for a read to be used as soft-clipped evidence - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## min_spanning_reads_resolution -**type**: `#!python int` +## validate.filter_secondary_alignments -**environment variable**: `MAVIS_MIN_SPANNING_READS_RESOLUTION` +**type**: `#!python boolean` -**default**: `#!python 5` +**default**: `#!python True` + +Filter secondary alignments when gathering read evidence + +**schema definition**: +```json +{ + "type": "boolean" +} +``` -Minimum number of spanning reads 
required to call an event by spanning evidence - -## min_splits_reads_resolution +## validate.fuzzy_mismatch_number **type**: `#!python int` -**environment variable**: `MAVIS_MIN_SPLITS_READS_RESOLUTION` +**default**: `#!python 1` -**default**: `#!python 3` +The number of events/mismatches allowed to be considered a fuzzy match -Minimum number of split reads required to call a breakpoint by split reads - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## novel_exon_color -**type**: `#!python str` +## validate.max_sc_preceeding_anchor -**environment variable**: `MAVIS_NOVEL_EXON_COLOR` +**type**: `#!python int` -**default**: `#!python '#5D3F6A'` +**default**: `#!python 6` -Novel exon fill color - +When remapping a softclipped read this determines the amount of softclipping allowed on the side opposite of where we expect it. for example for a softclipped read on a breakpoint with a left orientation this limits the amount of softclipping that is allowed on the right. if this is set to none then there is no limit on softclipping -## outer_window_min_event_size +**schema definition**: +```json +{ + "type": "integer" +} +``` -**type**: `#!python int` -**environment variable**: `MAVIS_OUTER_WINDOW_MIN_EVENT_SIZE` +## validate.min_anchor_exact -**default**: `#!python 125` +**type**: `#!python int` -The minimum size of an event in order for flanking read evidence to be collected - +**default**: `#!python 6` -## queue +Applies to re-aligning softclipped reads to the opposing breakpoint. 
the minimum number of consecutive exact matches to anchor a read to initiate targeted realignment -**type**: `#!python str` +**schema definition**: +```json +{ + "type": "integer" +} +``` -**environment variable**: `MAVIS_QUEUE` -**default**: `#!python ''` +## validate.min_anchor_fuzzy -The queue jobs are to be submitted to - +**type**: `#!python int` -## reference_genome +**default**: `#!python 10` -**type**: `#!python filepath` +Applies to re-aligning softclipped reads to the opposing breakpoint. the minimum length of a fuzzy match to anchor a read to initiate targeted realignment -**environment variable**: `MAVIS_REFERENCE_GENOME` +**schema definition**: +```json +{ + "type": "integer" +} +``` -**default**: `#!python []` -Path to the human reference genome fasta file - +## validate.min_anchor_match -## remote_head_ssh +**type**: `#!python number` -**type**: `#!python str` +**default**: `#!python 0.9` -**environment variable**: `MAVIS_REMOTE_HEAD_SSH` +Minimum percent match for a read to be kept as evidence -**default**: `#!python ''` +**schema definition**: +```json +{ + "maximum": 1, + "minimum": 0, + "type": "number" +} +``` -Ssh target for remote scheduler commands - -## scaffold_color +## validate.min_call_complexity -**type**: `#!python str` +**type**: `#!python number` -**environment variable**: `MAVIS_SCAFFOLD_COLOR` +**default**: `#!python 0.1` -**default**: `#!python '#000000'` +The minimum complexity score for a call sequence. is an average for non-contig calls. filters low complexity contigs before alignment. 
see [contig_complexity](#contig_complexity) -The color used for the gene/transcripts scaffolds - +**schema definition**: +```json +{ + "maximum": 1, + "minimum": 0, + "type": "number" +} +``` -## scheduler -**type**: `#!python mavis.schedule.constants.SCHEDULER` +## validate.min_double_aligned_to_estimate_insertion_size -**environment variable**: `MAVIS_SCHEDULER` +**type**: `#!python int` -**default**: `#!python 'SLURM'` +**default**: `#!python 2` -**accepted values**: `'SGE'`, `'SLURM'`, `'TORQUE'`, `'LOCAL'` +The minimum number of reads which map soft-clipped to both breakpoints to assume the size of the untemplated sequence between the breakpoints is at most the read length - 2 * min_softclipping +**schema definition**: +```json +{ + "type": "integer" +} +``` -The scheduler being used - -## spanning_call_distance +## validate.min_flanking_pairs_resolution **type**: `#!python int` -**environment variable**: `MAVIS_SPANNING_CALL_DISTANCE` +**default**: `#!python 10` -**default**: `#!python 20` +The minimum number of flanking reads required to call a breakpoint by flanking evidence -The maximum distance allowed between breakpoint pairs (called by spanning reads) in order for them to pair - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## splice_color -**type**: `#!python str` +## validate.min_linking_split_reads -**environment variable**: `MAVIS_SPLICE_COLOR` +**type**: `#!python int` -**default**: `#!python '#000000'` +**default**: `#!python 2` -Splicing lines color - +The minimum number of split reads which aligned to both breakpoints -## split_call_distance +**schema definition**: +```json +{ + "type": "integer" +} +``` -**type**: `#!python int` -**environment variable**: `MAVIS_SPLIT_CALL_DISTANCE` +## validate.min_mapping_quality -**default**: `#!python 20` +**type**: `#!python int` -The maximum distance allowed between breakpoint pairs (called by split reads) in order for them to pair - +**default**: `#!python 5` -## stdev_count_abnormal 
+The minimum mapping quality of reads to be used as evidence -**type**: `#!python float` +**schema definition**: +```json +{ + "type": "integer" +} +``` -**environment variable**: `MAVIS_STDEV_COUNT_ABNORMAL` -**default**: `#!python 3.0` +## validate.min_non_target_aligned_split_reads -The number of standard deviations away from the normal considered expected and therefore not qualifying as flanking reads - +**type**: `#!python int` -## strand_determining_read +**default**: `#!python 1` -**type**: `#!python int` +The minimum number of split reads aligned to a breakpoint by the input bam and not forced by local alignment to the target region to call a breakpoint by split read evidence -**environment variable**: `MAVIS_STRAND_DETERMINING_READ` +**schema definition**: +```json +{ + "type": "integer" +} +``` -**default**: `#!python 2` -1 or 2. the read in the pair which determines if (assuming a stranded protocol) the first or second read in the pair matches the strand sequenced - +## validate.min_sample_size_to_apply_percentage -## template_metadata +**type**: `#!python int` -**type**: `#!python filepath` +**default**: `#!python 10` -**environment variable**: `MAVIS_TEMPLATE_METADATA` +Minimum number of aligned bases to compute a match percent. if there are less than this number of aligned bases (match or mismatch) the percent comparator is not used -**default**: `#!python []` +**schema definition**: +```json +{ + "type": "integer" +} +``` -File containing the cytoband template information.
used for illustrations only - -## time_limit +## validate.min_softclipping **type**: `#!python int` -**environment variable**: `MAVIS_TIME_LIMIT` +**default**: `#!python 6` + +Minimum number of soft-clipped bases required for a read to be used as soft-clipped evidence -**default**: `#!python 57600` +**schema definition**: +```json +{ + "type": "integer" +} +``` -The time in seconds any given jobs is allowed - -## trans_fetch_reads_limit +## validate.min_spanning_reads_resolution **type**: `#!python int` -**environment variable**: `MAVIS_TRANS_FETCH_READS_LIMIT` +**default**: `#!python 5` + +Minimum number of spanning reads required to call an event by spanning evidence -**default**: `#!python 12000` +**schema definition**: +```json +{ + "type": "integer" +} +``` -Related to [fetch_reads_limit](#fetch_reads_limit). overrides fetch_reads_limit for transcriptome libraries when set. if this has a value of none then fetch_reads_limit will be used for transcriptome libraries instead - -## trans_min_mapping_quality +## validate.min_splits_reads_resolution **type**: `#!python int` -**environment variable**: `MAVIS_TRANS_MIN_MAPPING_QUALITY` +**default**: `#!python 3` -**default**: `#!python 0` +Minimum number of split reads required to call a breakpoint by split reads -Related to [min_mapping_quality](#min_mapping_quality). overrides the min_mapping_quality if the library is a transcriptome and this is set to any number not none. 
if this value is none, min_mapping_quality is used for transcriptomes aswell as genomes - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## trans_validation_memory + +## validate.outer_window_min_event_size **type**: `#!python int` -**environment variable**: `MAVIS_TRANS_VALIDATION_MEMORY` +**default**: `#!python 125` -**default**: `#!python 18000` +The minimum size of an event in order for flanking read evidence to be collected -Default memory limit (mb) for the validation stage (for transcriptomes) - +**schema definition**: +```json +{ + "type": "integer" +} +``` -## uninformative_filter -**type**: `#!python cast_boolean` +## validate.stdev_count_abnormal -**environment variable**: `MAVIS_UNINFORMATIVE_FILTER` +**type**: `#!python number` -**default**: `#!python False` +**default**: `#!python 3` -Flag that determines if breakpoint pairs which are not within max_proximity to any annotations are filtered out prior to clustering - +The number of standard deviations away from the normal considered expected and therefore not qualifying as flanking reads -## validation_memory +**schema definition**: +```json +{ + "type": "number" +} +``` -**type**: `#!python int` -**environment variable**: `MAVIS_VALIDATION_MEMORY` +## validate.trans_fetch_reads_limit -**default**: `#!python 16000` +**type**: `#!python Union[int, null]` -Default memory limit (mb) for the validation stage - +**default**: `#!python 12000` -## width +Related to [fetch_reads_limit](#fetch_reads_limit). overrides fetch_reads_limit for transcriptome libraries when set. 
if this has a value of none then fetch_reads_limit will be used for transcriptome libraries instead -**type**: `#!python int` +**schema definition**: +```json +{ + "type": [ + "integer", + "null" + ] +} +``` -**environment variable**: `MAVIS_WIDTH` -**default**: `#!python 1000` +## validate.trans_min_mapping_quality -The drawing width in pixels - +**type**: `#!python Union[int, null]` + +**default**: `#!python 0` + +Related to [min_mapping_quality](#min_mapping_quality). overrides the min_mapping_quality if the library is a transcriptome and this is set to any number not none. if this value is none, min_mapping_quality is used for transcriptomes as well as genomes + +**schema definition**: +```json +{ + "type": [ + "integer", + "null" + ] +} +``` -## write_evidence_files -**type**: `#!python cast_boolean` +## validate.write_evidence_files -**environment variable**: `MAVIS_WRITE_EVIDENCE_FILES` +**type**: `#!python boolean` **default**: `#!python True` Write the intermediate bam and bed files containing the raw evidence collected and contigs aligned.
not required for subsequent steps but can be useful in debugging and deep investigation of events - + +**schema definition**: +```json +{ + "type": "boolean" +} +``` + diff --git a/docs/hooks.py b/docs/hooks.py index 30314742..dcb00448 100644 --- a/docs/hooks.py +++ b/docs/hooks.py @@ -1,71 +1,78 @@ +import json import os import re +from textwrap import dedent from markdown_refdocs.main import extract_to_markdown -from mavis.annotate.constants import DEFAULTS as ANNOTATION_DEFAULTS -from mavis.cluster.constants import DEFAULTS as CLUSTER_DEFAULTS -from mavis.config import REFERENCE_DEFAULTS -from mavis.illustrate.constants import DEFAULTS as ILLUSTRATION_DEFAULTS -from mavis.pairing.constants import DEFAULTS as PAIRING_DEFAULTS -from mavis.summary.constants import DEFAULTS as SUMMARY_DEFAULTS +from mavis.schemas import DEFAULTS from mavis.util import ENV_VAR_PREFIX -from mavis.validate.constants import DEFAULTS as VALIDATION_DEFAULTS -def generate_settings_doc(): - dirname = os.path.dirname(os.path.abspath(__file__)) +def json_to_pytype(record): + input_type = record + try: + input_type = record['type'] + except TypeError: + pass + types = {'string': 'str', 'integer': 'int', 'float': 'float'} + + if input_type == 'array': + try: + sub_type = json_to_pytype(record['items']['type']) + return f'List[{sub_type}]' + except TypeError: + return 'List' - for (filepath, title, namespaces) in [ - ( - 'configuration/settings.md', - 'Configurable Settings', - [ - REFERENCE_DEFAULTS, - SUMMARY_DEFAULTS, - PAIRING_DEFAULTS, - ANNOTATION_DEFAULTS, - VALIDATION_DEFAULTS, - CLUSTER_DEFAULTS, - ILLUSTRATION_DEFAULTS, - ], - ), - ]: - fname = os.path.join(dirname, filepath) - print('writing:', fname) - with open(fname, 'w') as fh: - fh.write(f'\n\n# {title}\n') - glossary = {} - for namespace in namespaces: - for term, value in namespace.items(): - typ = namespace.type(term).__name__ - # typ = CUSTOM_TYPES.get(typ, typ) - desc = re.sub(r"\.?$", "", namespace.define(term, 
"")).capitalize() - accepted = '' - try: - accepted = '\n\n**accepted values**: {}\n'.format( - ', '.join(['`{}`'.format(repr(v)) for v in namespace.type(term).values()]) - ) - except AttributeError: - pass - defn = f'''## {term} + if isinstance(input_type, list): + # Union + types = ', '.join([json_to_pytype(t) for t in input_type]) + return f'Union[{types}]' + return types.get(input_type, input_type) -**type**: `#!python {typ}` -**environment variable**: `{ENV_VAR_PREFIX}{term.upper()}` +def generate_settings_doc(schema_file): + with open(schema_file, 'r') as fh: + schema = json.load(fh) + dirname = os.path.dirname(os.path.abspath(__file__)) + filepath = 'configuration/settings.md' + title = 'Configurable Settings' -**default**: `#!python {repr(value)}`{accepted} + fname = os.path.join(dirname, filepath) + print('writing:', fname) + with open(fname, 'w') as fh: + fh.write(f'\n\n# {title}\n') + glossary = {} + for term, defn in schema['properties'].items(): + if term in ['libraries', 'convert']: + continue + typ = json_to_pytype(defn) + desc = defn.get('description', '') + default_value = defn.get('default') + schema_defn = json.dumps( + {k: v for k, v in defn.items() if k not in ['description', 'default']}, + sort_keys=True, + indent=' ', + ) + schema_defn = f'**schema definition**:\n```json\n{schema_defn}\n```\n' -{desc} - ''' - glossary[term] = defn - for term, defn in sorted(glossary.items()): - fh.write(f'{defn}\n\n') + lines = [ + f'## {term}', + f'**type**: `#!python {typ}`', + f'**default**: `#!python {repr(default_value)}`', + desc, + schema_defn, + ] + glossary[term] = '\n\n'.join(lines) + for term, defn in sorted(glossary.items()): + fh.write(f'{defn}\n\n') def build_package_docs(config): - generate_settings_doc() + schema_file = os.path.join(os.path.dirname(__file__), '../mavis/schemas/config.json') + generate_settings_doc(schema_file) package_dir = os.path.join(os.path.dirname(__file__), '../mavis') output_dir = os.path.join(os.path.dirname(__file__), 
'package') + extract_to_markdown( [package_dir], output_dir, diff --git a/docs/outputs/columns.md b/docs/outputs/columns.md index f2c8ba19..3dfd797f 100644 --- a/docs/outputs/columns.md +++ b/docs/outputs/columns.md @@ -34,7 +34,7 @@ decision from the annotation step ## event\_type -**type**: `mavis.constants.SVTYPE` +**type**: [`mavis.constants.SVTYPE`](/package/mavis/constants/#class-mavisconstantssvtype) The classification of the event @@ -57,7 +57,7 @@ Gene for the current annotation at the first breakpoint ## gene1\_direction -**type**: `mavis.constants.PRIME` +**type**: [`mavis.constants.PRIME`](/package/mavis/constants/#class-mavisconstantsprime) The direction/prime of the gene @@ -68,7 +68,7 @@ Gene for the current annotation at the second breakpoint ## gene2\_direction -**type**: `mavis.constants.PRIME` +**type**: [`mavis.constants.PRIME`](/package/mavis/constants/#class-mavisconstantsprime) The direction/prime of the gene. Has the following possible values @@ -85,7 +85,7 @@ second breakpoint ## gene\_product\_type -**type**: `mavis.constants.GENE_PRODUCT_TYPE` +**type**: [`mavis.constants.GENE_PRODUCT_TYPE`](/package/mavis/constants/#class-mavisconstantsgene_product_type) Describes if the putative fusion product will be sense or anti-sense @@ -105,7 +105,8 @@ Transcript for the current annotation at the second breakpoint ## fusion\_splicing\_pattern -`mavis.constants.SPLICE_TYPE` - +**type**: [`mavis.constants.SPLICE_TYPE`](/package/mavis/constants/#class-mavisconstantssplice_type) + Type of splicing pattern used to create the fusion cDNA. ## fusion\_cdna\_coding\_start @@ -205,14 +206,14 @@ End integer inclusive ## break1\_orientation -**type**: `mavis.constants.ORIENT` +**type**: [`mavis.constants.ORIENT`](/package/mavis/constants/#class-mavisconstantsorient) The side of the breakpoint wrt the positive/forward strand that is retained.
## break1\_strand -**type**: `mavis.constants.STRAND` +**type**: [`mavis.constants.STRAND`](/package/mavis/constants/#class-mavisconstantsstrand) The strand wrt to the reference positive/forward strand at this @@ -246,14 +247,14 @@ End integer inclusive ## break2\_orientation -**type**: `mavis.constants.ORIENT` +**type**: [`mavis.constants.ORIENT`](/package/mavis/constants/#class-mavisconstantsorient) The side of the breakpoint wrt the positive/forward strand that is retained. ## break2\_strand -**type**: `mavis.constants.STRAND` +**type**: [`mavis.constants.STRAND`](/package/mavis/constants/#class-mavisconstantsstrand) The strand wrt to the reference positive/forward strand at this @@ -283,7 +284,8 @@ protocol was strand specific or not. Expects a boolean ## protocol -`mavis.constants.PROTOCOL` - +**type**: [`mavis.constants.PROTOCOL`](/package/mavis/constants/#class-mavisconstantsprotocol) + Specifies the type of library ## tools @@ -404,7 +406,7 @@ event ## call\_method -**type**: `mavis.constants.CALL_METHOD` +**type**: [`mavis.constants.CALL_METHOD`](/package/mavis/constants/#class-mavisconstantscall_method) The method used to call the breakpoints diff --git a/docs/tutorials/full.md b/docs/tutorials/full.md index b42ae624..44b187bc 100644 --- a/docs/tutorials/full.md +++ b/docs/tutorials/full.md @@ -18,16 +18,16 @@ tar -xvzf tutorial_data.tar.gz The expected contents are -| Path | Description | -| ---------------------------------- | ------------------------------------------------------------------------------------------------------------------- | -| README | Information regarding the other files in the directory | -| L1522785992\_expected\_events.tab | The events that we expect to find, either experimentally validated or 'spiked' in | -| L1522785992\_normal.sorted.bam | Paired normal library BAM file | -| L1522785992\_normal.sorted.bam.bai | BAM index | -| L1522785992\_trans.sorted.bam | Tumour transcriptome BAM file | -| L1522785992\_trans.sorted.bam.bai | 
BAM index file | -| L1522785992\_tumour.sorted.bam | Tumour genome BAM file | -| L1522785992\_tumour.sorted.bam.bai | BAM index file | +| Path | Description | +| ---------------------------------- | ------------------------------------------------------------------------------------------------------------------------ | +| README | Information regarding the other files in the directory | +| L1522785992\_expected\_events.tab | The events that we expect to find, either experimentally validated or 'spiked' in | +| L1522785992\_normal.sorted.bam | Paired normal library BAM file | +| L1522785992\_normal.sorted.bam.bai | BAM index | +| L1522785992\_trans.sorted.bam | Tumour transcriptome BAM file | +| L1522785992\_trans.sorted.bam.bai | BAM index file | +| L1522785992\_tumour.sorted.bam | Tumour genome BAM file | +| L1522785992\_tumour.sorted.bam.bai | BAM index file | | breakdancer-1.4.5/ | Contains the [BreakDancer](../../glossary/#breakdancer) output which was run on the tumour genome BAM file | | breakseq-2.2/ | Contains the [BreakSeq](../../glossary/#breakseq) output which was run on the tumour genome BAM file | | chimerascan-0.4.5/ | Contains the [ChimeraScan](../../glossary/#chimerascan) output which was run on the tumour transcriptome BAM file | @@ -45,40 +45,12 @@ bash get_hg19_reference_files.sh source reference_inputs/hg19_env.sh ``` -## Generating the Config File +## Creating the Config File -The [config](../../background/citations/#pipeline-config) command -does most of the work of creating the config for you but there are a few -things you need to tell it +Most settings can be left as defaults, however you will need to fill out the `libraries` and +`convert` sections to tell MAVIS how to convert your inputs and what libraries to expect. -1. 
**Where your bams are and what library they belong to** - -```text ---library L1522785992-normal genome normal False tutorial_data/L1522785992_normal.sorted.bam ---library L1522785992-tumour genome diseased False tutorial_data/L1522785992_tumour.sorted.bam ---library L1522785992-trans transcriptome diseased True tutorial_data/L1522785992_trans.sorted.bam -``` - -1. **Where your SV caller output files (events) are** - -If they are raw tool output as in the current example you will need to -use the convert argument to tell MAVIS the file type - -```text ---convert breakdancer tutorial_data/breakdancer-1.4.5/*txt breakdancer ---convert breakseq tutorial_data/breakseq-2.2/breakseq.vcf.gz breakseq ---convert chimerascan tutorial_data/chimerascan-0.4.5/chimeras.bedpe chimerascan ---convert defuse tutorial_data/defuse-0.6.2/results.classify.tsv defuse ---convert manta tutorial_data/manta-1.0.0/diploidSV.vcf.gz tutorial_data/manta-1.0.0/somaticSV.vcf manta -``` - -!!! note - For older versions of MAVIS the convert command may require the path to - the file(s) be quoted and the strandedness be specified (default is - False) - - -3. **Which events you should validate in which libraries** +### Libraries Settings For this example, because we want to determine which events are germline/somatic we are going to pass all genome calls to both genomes. 
@@ -86,142 +58,133 @@ We can use either full file paths (if the input is already in the standard format) or the alias from a conversion (the first argument given to the convert option) -```text ---assign L1522785992-trans chimerascan defuse ---assign L1522785992-tumour breakdancer breakseq manta ---assign L1522785992-normal breakdancer breakseq manta +```json +{ + "libraries": { + "L1522785992-normal": { // keyed by library name + "assign": [ // these are the names of the input files (or conversion aliases) to check for this library + "breakdancer", + "breakseq", + "manta" + ], + "bam_file": "tutorial_data/L1522785992_normal.sorted.bam", + "disease_status": "normal", + "protocol": "genome" + }, + "L1522785992-trans": { + "assign": [ + "chimerascan", + "defuse" + ], + "bam_file": "tutorial_data/L1522785992_trans.sorted.bam", + "disease_status": "diseased", + "protocol": "transcriptome", + "strand_specific": true + }, + "L1522785992-tumour": { + "assign": [ + "breakdancer", + "breakseq", + "manta" + ], + "bam_file": "tutorial_data/L1522785992_tumour.sorted.bam", + "disease_status": "diseased", + "protocol": "genome" + } + } +} ``` -Putting this altogether with a name to call the config, we have the -command to generate the pipeline config. You should expect this step -with these inputs to take about \~5GB memory. 
+### Convert Settings -```bash -mavis config \ - --library L1522785992-normal genome normal False tutorial_data/L1522785992_normal.sorted.bam \ - --library L1522785992-tumour genome diseased False tutorial_data/L1522785992_tumour.sorted.bam \ - --library L1522785992-trans transcriptome diseased True tutorial_data/L1522785992_trans.sorted.bam \ - --convert breakdancer tutorial_data/breakdancer-1.4.5/*txt breakdancer \ - --convert breakseq tutorial_data/breakseq-2.2/breakseq.vcf.gz breakseq \ - --convert chimerascan tutorial_data/chimerascan-0.4.5/chimeras.bedpe chimerascan \ - --convert defuse tutorial_data/defuse-0.6.2/results.classify.tsv defuse \ - --convert manta tutorial_data/manta-1.0.0/diploidSV.vcf.gz tutorial_data/manta-1.0.0/somaticSV.vcf manta \ - --assign L1522785992-trans chimerascan defuse \ - --assign L1522785992-tumour breakdancer breakseq manta \ - --assign L1522785992-normal breakdancer breakseq manta \ - -w mavis.cfg -``` - -## Setting Up the Pipeline - -The next step is running the setup stage. This will perform conversion, clustering, and creating the -submission scripts for the other stages. 
+If your inputs are raw tool output, as in the current example, you will need to +use the `convert` section to tell MAVIS the file type -```bash -mavis setup mavis.cfg -o output_dir/ +```json +{ + "convert": { + "breakdancer": { // conversion alias/key + "assume_no_untemplated": true, + "file_type": "breakdancer", // input/file type + "inputs": [ + "tutorial_data/breakdancer-1.4.5/*txt" + ] + }, + "breakseq": { + "assume_no_untemplated": true, + "file_type": "breakseq", + "inputs": [ + "tutorial_data/breakseq-2.2/breakseq.vcf.gz" + ] + }, + "chimerascan": { + "assume_no_untemplated": true, + "file_type": "chimerascan", + "inputs": [ + "tutorial_data/chimerascan-0.4.5/chimeras.bedpe" + ] + }, + "defuse": { + "assume_no_untemplated": true, + "file_type": "defuse", + "inputs": [ + "tutorial_data/defuse-0.6.2/results.classify.tsv" + ] + }, + "manta": { + "assume_no_untemplated": true, + "file_type": "manta", + "inputs": [ + "tutorial_data/manta-1.0.0/diploidSV.vcf.gz", + "tutorial_data/manta-1.0.0/somaticSV.vcf" + ] + } + } +} ``` -At this stage you should have something that looks like this. For -simplicity not all files/directories have been shown.
- - output_dir/ - |-- build.cfg - |-- converted_inputs - | |-- breakdancer.tab - | |-- breakseq.tab - | |-- chimerascan.tab - | |-- defuse.tab - | `-- manta.tab - |-- L1522785992-normal_normal_genome - | |-- annotate - | | |-- batch-aUmErftiY7eEWvENfSeJwc-1/ - | | `-- submit.sh - | |-- cluster - | | |-- batch-aUmErftiY7eEWvENfSeJwc-1.tab - | | |-- cluster_assignment.tab - | | |-- clusters.bed - | | |-- filtered_pairs.tab - | | `-- MAVIS-batch-aUmErftiY7eEWvENfSeJwc.COMPLETE - | `-- validate - | |-- batch-aUmErftiY7eEWvENfSeJwc-1/ - | `-- submit.sh - |-- pairing - | `-- submit.sh - `-- summary - `-- submit.sh - -## Submitting Jobs to the Cluster - -The last step is simple, ssh to your head node of your -[SLURM](../../glossary/#slurm) cluster (or run locally if you -have configured [remote_head_ssh](../../configuration/settings/#remote_head_ssh) and -run the schedule step. This will submit the jobs and create the -dependency chain - -```bash -ssh head_node -mavis schedule -o output_dir --submit +### Top-level Settings + +Finally you will need to set output directory and the reference files + +```json +{ + "output_dir": "output_dir_full", // where to output files + "reference.aligner_reference": [ + "reference_inputs/hg19.2bit" + ], + "reference.annotations": [ + "reference_inputs/ensembl69_hg19_annotations.json" + ], + "reference.dgv_annotation": [ + "reference_inputs/dgv_hg19_variants.tab" + ], + "reference.masking": [ + "reference_inputs/hg19_masking.tab" + ], + "reference.reference_genome": [ + "reference_inputs/hg19.fa" + ], + "reference.template_metadata": [ + "reference_inputs/cytoBand.txt" + ] +} ``` -The schedule step also acts as a built-in checker and can be run to -check for errors or if the pipeline has completed. +## Running the Workflow -```bash -mavis schedule -o output_dir -``` - -This should give you output something like below (times may vary) after -your run completed correctly. 
+You are now ready to run the workflow -```text - MAVIS: 2.0.0 - hostname: gphost08.bcgsc.ca -[2018-06-02 19:47:56] arguments - command = 'schedule' - log = None - log_level = 'INFO' - output = 'output_dir/' - resubmit = False - submit = False -[2018-06-02 19:48:01] validate - MV_L1522785992-normal_batch-aUmErftiY7eEWvENfSeJwc (1701000) is COMPLETED - 200 tasks are COMPLETED - run time: 609 - MV_L1522785992-tumour_batch-aUmErftiY7eEWvENfSeJwc (1701001) is COMPLETED - 200 tasks are COMPLETED - run time: 669 - MV_L1522785992-trans_batch-aUmErftiY7eEWvENfSeJwc (1701002) is COMPLETED - 23 tasks are COMPLETED - run time: 1307 -[2018-06-02 19:48:02] annotate - MA_L1522785992-normal_batch-aUmErftiY7eEWvENfSeJwc (1701003) is COMPLETED - 200 tasks are COMPLETED - run time: 622 - MA_L1522785992-tumour_batch-aUmErftiY7eEWvENfSeJwc (1701004) is COMPLETED - 200 tasks are COMPLETED - run time: 573 - MA_L1522785992-trans_batch-aUmErftiY7eEWvENfSeJwc (1701005) is COMPLETED - 23 tasks are COMPLETED - run time: 537 -[2018-06-02 19:48:07] pairing - MP_batch-aUmErftiY7eEWvENfSeJwc (1701006) is COMPLETED - run time: 466 -[2018-06-02 19:48:07] summary - MS_batch-aUmErftiY7eEWvENfSeJwc (1701007) is COMPLETED - run time: 465 - parallel run time: 3545 - rewriting: output_dir/build.cfg - run time (hh/mm/ss): 0:00:11 - run time (s): 11 +```bash +snakemake --jobs 100 --configfile=tests/full-tutorial.config.json ``` -The parallel run time reported corresponds to the sum of the slowest job -for each stage and does not include any queue time etc. - ## Analyzing the Output The best place to start with looking at the MAVIS output is the summary folder which contains the final results. For column name definitions see the [glossary](../../outputs/columns). 
- output_dir/summary/mavis_summary_all_L1522785992-normal_L1522785992-trans_L1522785992-tumour.tab +```text +output_dir/summary/mavis_summary_all_L1522785992-normal_L1522785992-trans_L1522785992-tumour.tab +``` From dff468c4b98929349079e4667b751733157b5875 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 20 Apr 2021 17:15:28 -0700 Subject: [PATCH 011/137] Fix linting --- mavis/util.py | 3 +-- tests/end_to_end/test_help.py | 1 - tests/snakemake/test_mini_workflow.py | 16 ++++++++++++---- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/mavis/util.py b/mavis/util.py index 8cf5c558..1b145d29 100644 --- a/mavis/util.py +++ b/mavis/util.py @@ -15,8 +15,7 @@ from tab import tab from .breakpoint import Breakpoint, BreakpointPair -from .constants import (COLUMNS, ORIENT, PROTOCOL, STRAND, SVTYPE, - MavisNamespace, sort_columns) +from .constants import COLUMNS, ORIENT, PROTOCOL, STRAND, SVTYPE, MavisNamespace, sort_columns from .error import InvalidRearrangement from .interval import Interval diff --git a/tests/end_to_end/test_help.py b/tests/end_to_end/test_help.py index 76823d7f..4ff3172a 100644 --- a/tests/end_to_end/test_help.py +++ b/tests/end_to_end/test_help.py @@ -28,7 +28,6 @@ def test_pipeline(self): else: self.assertEqual(0, returncode) - def test_cluster(self): with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.CLUSTER, '-h']): try: diff --git a/tests/snakemake/test_mini_workflow.py b/tests/snakemake/test_mini_workflow.py index 37e81b56..1125cf3e 100644 --- a/tests/snakemake/test_mini_workflow.py +++ b/tests/snakemake/test_mini_workflow.py @@ -45,11 +45,19 @@ def test_workflow(output_dir): assert glob_exists(os.path.join(output_dir, 'summary', 'MAVIS.COMPLETE')) assert glob_exists(os.path.join(output_dir, 'pairing', 'MAVIS.COMPLETE')) assert glob_exists(os.path.join(output_dir, 'mock-A47933', 'cluster', 'MAVIS.COMPLETE')) - assert glob_exists(os.path.join(output_dir, 'mock-A47933', 'validate', '*', 'MAVIS.COMPLETE')) - assert 
glob_exists(os.path.join(output_dir, 'mock-A47933', 'annotate', '*', 'MAVIS.COMPLETE')) + assert glob_exists( + os.path.join(output_dir, 'mock-A47933', 'validate', '*', 'MAVIS.COMPLETE') + ) + assert glob_exists( + os.path.join(output_dir, 'mock-A47933', 'annotate', '*', 'MAVIS.COMPLETE') + ) assert glob_exists(os.path.join(output_dir, 'mock-A36971', 'cluster', 'MAVIS.COMPLETE')) - assert glob_exists(os.path.join(output_dir, 'mock-A36971', 'validate', '*', 'MAVIS.COMPLETE')) - assert glob_exists(os.path.join(output_dir, 'mock-A36971', 'annotate', '*', 'MAVIS.COMPLETE')) + assert glob_exists( + os.path.join(output_dir, 'mock-A36971', 'validate', '*', 'MAVIS.COMPLETE') + ) + assert glob_exists( + os.path.join(output_dir, 'mock-A36971', 'annotate', '*', 'MAVIS.COMPLETE') + ) except SystemExit as err: if err.code != 0: raise err From 1ae83ac57ab281ae0e6dca90200bfedbe884953a Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 20 Apr 2021 17:45:58 -0700 Subject: [PATCH 012/137] Add library/convert sections to config list --- .gitignore | 1 + docs/configuration/settings.md | 1798 -------------------------------- docs/hooks.py | 114 +- 3 files changed, 89 insertions(+), 1824 deletions(-) delete mode 100644 docs/configuration/settings.md diff --git a/.gitignore b/.gitignore index 26638751..0745a3b2 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ junit /docs/package/mavis/*/*.md # don't ignore subpackage summary files !/docs/package/mavis/*/index.md +docs/configuration/settings.md .snakemake output_dir* diff --git a/docs/configuration/settings.md b/docs/configuration/settings.md deleted file mode 100644 index 9b388424..00000000 --- a/docs/configuration/settings.md +++ /dev/null @@ -1,1798 +0,0 @@ - - -# Configurable Settings -## annotate.annotation_filters - -**type**: `#!python List[str]` - -**default**: `#!python ['choose_more_annotated', 'choose_transcripts_by_priority']` - -A comma separated list of filters to apply to putative annotations - -**schema 
definition**: -```json -{ - "items": { - "enum": [ - "choose_more_annotated", - "choose_transcripts_by_priority" - ], - "type": "string" - }, - "type": "array" -} -``` - - -## annotate.draw_fusions_only - -**type**: `#!python boolean` - -**default**: `#!python True` - -Flag to indicate if events which do not produce a fusion transcript should produce illustrations - -**schema definition**: -```json -{ - "type": "boolean" -} -``` - - -## annotate.draw_non_synonymous_cdna_only - -**type**: `#!python boolean` - -**default**: `#!python True` - -Flag to indicate if events which are synonymous at the cdna level should produce illustrations - -**schema definition**: -```json -{ - "type": "boolean" -} -``` - - -## annotate.max_orf_cap - -**type**: `#!python int` - -**default**: `#!python 3` - -The maximum number of orfs to return (best putative orfs will be retained) - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## annotate.min_domain_mapping_match - -**type**: `#!python number` - -**default**: `#!python 0.9` - -A number between 0 and 1 representing the minimum percent match a domain must map to the fusion transcript to be displayed - -**schema definition**: -```json -{ - "maximum": 1, - "minimum": 0, - "type": "number" -} -``` - - -## annotate.min_orf_size - -**type**: `#!python int` - -**default**: `#!python 300` - -The minimum length (in base pairs) to retain a putative open reading frame (orf) - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## bam_stats.distribution_fraction - -**type**: `#!python number` - -**default**: `#!python 0.97` - -the proportion of the distribution to use in computing stdev - -**schema definition**: -```json -{ - "maximum": 1, - "minimum": 0.01, - "type": "number" -} -``` - - -## bam_stats.sample_bin_size - -**type**: `#!python int` - -**default**: `#!python 1000` - -how large to make the sample bin (in bp) - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## 
bam_stats.sample_cap - -**type**: `#!python int` - -**default**: `#!python 1000` - -maximum number of reads to collect for any given sample region - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## bam_stats.sample_size - -**type**: `#!python int` - -**default**: `#!python 500` - -the number of genes/bins to compute stats over - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## cluster.cluster_initial_size_limit - -**type**: `#!python int` - -**default**: `#!python 25` - -The maximum cumulative size of both breakpoints for breakpoint pairs to be used in the initial clustering phase (combining based on overlap) - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## cluster.cluster_radius - -**type**: `#!python int` - -**default**: `#!python 100` - -Maximum distance allowed between paired breakpoint pairs - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## cluster.limit_to_chr - -**type**: `#!python Union[List, null]` - -**default**: `#!python ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y']` - -A list of chromosome names to use. breakpointpairs on other chromosomes will be filteredout. 
for example '1 2 3 4' would filter out events/breakpoint pairs on any chromosomes but 1, 2, 3, and 4 - -**schema definition**: -```json -{ - "items": { - "type": "string" - }, - "type": [ - "array", - "null" - ] -} -``` - - -## cluster.max_files - -**type**: `#!python int` - -**default**: `#!python 200` - -The maximum number of files to output from clustering/splitting - -**schema definition**: -```json -{ - "minimum": 1, - "type": "integer" -} -``` - - -## cluster.max_proximity - -**type**: `#!python int` - -**default**: `#!python 5000` - -The maximum distance away from an annotation before the region in considered to be uninformative - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## cluster.min_clusters_per_file - -**type**: `#!python int` - -**default**: `#!python 50` - -The minimum number of breakpoint pairs to output to a file - -**schema definition**: -```json -{ - "minimum": 1, - "type": "integer" -} -``` - - -## cluster.split_only - -**type**: `#!python boolean` - -**default**: `#!python False` - -just split the input files, do not merge input breakpoints into clusters - -**schema definition**: -```json -{ - "type": "boolean" -} -``` - - -## cluster.uninformative_filter - -**type**: `#!python boolean` - -**default**: `#!python False` - -Flag that determines if breakpoint pairs which are not within max_proximity to any annotations are filtered out prior to clustering - -**schema definition**: -```json -{ - "type": "boolean" -} -``` - - -## illustrate.breakpoint_color - -**type**: `#!python str` - -**default**: `#!python '#000000'` - -Breakpoint outline color - -**schema definition**: -```json -{ - "type": "string" -} -``` - - -## illustrate.domain_color - -**type**: `#!python str` - -**default**: `#!python '#ccccb3'` - -Domain fill color - -**schema definition**: -```json -{ - "type": "string" -} -``` - - -## illustrate.domain_mismatch_color - -**type**: `#!python str` - -**default**: `#!python '#b2182b'` - -Domain fill color on 0%% 
match - -**schema definition**: -```json -{ - "type": "string" -} -``` - - -## illustrate.domain_name_regex_filter - -**type**: `#!python str` - -**default**: `#!python '^PF\\d+$'` - -The regular expression used to select domains to be displayed (filtered by name) - -**schema definition**: -```json -{ - "type": "string" -} -``` - - -## illustrate.domain_scaffold_color - -**type**: `#!python str` - -**default**: `#!python '#000000'` - -The color of the domain scaffold - -**schema definition**: -```json -{ - "type": "string" -} -``` - - -## illustrate.drawing_width_iter_increase - -**type**: `#!python int` - -**default**: `#!python 500` - -The amount (in pixels) by which to increase the drawing width upon failure to fit - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## illustrate.exon_min_focus_size - -**type**: `#!python int` - -**default**: `#!python 10` - -Minimum size of an exon for it to be granted a label or min exon width - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## illustrate.gene1_color - -**type**: `#!python str` - -**default**: `#!python '#657e91'` - -The color of genes near the first gene - -**schema definition**: -```json -{ - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" -} -``` - - -## illustrate.gene1_color_selected - -**type**: `#!python str` - -**default**: `#!python '#518dc5'` - -The color of the first gene - -**schema definition**: -```json -{ - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" -} -``` - - -## illustrate.gene2_color - -**type**: `#!python str` - -**default**: `#!python '#325556'` - -The color of genes near the second gene - -**schema definition**: -```json -{ - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" -} -``` - - -## illustrate.gene2_color_selected - -**type**: `#!python str` - -**default**: `#!python '#4c9677'` - -The color of the second gene - -**schema definition**: -```json -{ - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" -} -``` - - -## 
illustrate.label_color - -**type**: `#!python str` - -**default**: `#!python '#000000'` - -The label color - -**schema definition**: -```json -{ - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" -} -``` - - -## illustrate.mask_fill - -**type**: `#!python str` - -**default**: `#!python '#ffffff'` - -Color of mask (for deleted region etc.) - -**schema definition**: -```json -{ - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" -} -``` - - -## illustrate.mask_opacity - -**type**: `#!python number` - -**default**: `#!python 0.7` - -Opacity of the mask layer - -**schema definition**: -```json -{ - "maximum": 1, - "minimum": 0, - "type": "number" -} -``` - - -## illustrate.max_drawing_retries - -**type**: `#!python int` - -**default**: `#!python 5` - -The maximum number of retries for attempting a drawing. each iteration the width is extended. if it is still insufficient after this number a gene-level only drawing will be output - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## illustrate.novel_exon_color - -**type**: `#!python str` - -**default**: `#!python '#5D3F6A'` - -Novel exon fill color - -**schema definition**: -```json -{ - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" -} -``` - - -## illustrate.scaffold_color - -**type**: `#!python str` - -**default**: `#!python '#000000'` - -The color used for the gene/transcripts scaffolds - -**schema definition**: -```json -{ - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" -} -``` - - -## illustrate.splice_color - -**type**: `#!python str` - -**default**: `#!python '#000000'` - -Splicing lines color - -**schema definition**: -```json -{ - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" -} -``` - - -## illustrate.width - -**type**: `#!python int` - -**default**: `#!python 1000` - -The drawing width in pixels - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## log - -**type**: `#!python str` - -**default**: `#!python None` - - - -**schema definition**: -```json -{ - 
"type": "string" -} -``` - - -## log_level - -**type**: `#!python str` - -**default**: `#!python 'INFO'` - - - -**schema definition**: -```json -{ - "enum": [ - "INFO", - "DEBUG" - ], - "type": "string" -} -``` - - -## output_dir - -**type**: `#!python str` - -**default**: `#!python None` - - - -**schema definition**: -```json -{ - "type": "string" -} -``` - - -## pairing.contig_call_distance - -**type**: `#!python int` - -**default**: `#!python 10` - -The maximum distance allowed between breakpoint pairs (called by contig) in order for them to pair - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## pairing.flanking_call_distance - -**type**: `#!python int` - -**default**: `#!python 50` - -The maximum distance allowed between breakpoint pairs (called by flanking pairs) in order for them to pair - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## pairing.input_call_distance - -**type**: `#!python int` - -**default**: `#!python 20` - -The maximum distance allowed between breakpoint pairs (called by input tools, not validated) in order for them to pair - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## pairing.spanning_call_distance - -**type**: `#!python int` - -**default**: `#!python 20` - -The maximum distance allowed between breakpoint pairs (called by spanning reads) in order for them to pair - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## pairing.split_call_distance - -**type**: `#!python int` - -**default**: `#!python 20` - -The maximum distance allowed between breakpoint pairs (called by split reads) in order for them to pair - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## reference.aligner_reference - -**type**: `#!python List[str]` - -**default**: `#!python None` - - - -**schema definition**: -```json -{ - "examples": [ - "tests/data/mock_reference_genome.2bit" - ], - "items": { - "type": "string" - }, - "maxItems": 1, - "minItems": 1, - 
"type": "array" -} -``` - - -## reference.annotations - -**type**: `#!python List[str]` - -**default**: `#!python None` - - - -**schema definition**: -```json -{ - "examples": [ - "tests/data/mock_annotations.json" - ], - "items": { - "type": "string" - }, - "minItems": 1, - "type": "array" -} -``` - - -## reference.dgv_annotation - -**type**: `#!python List[str]` - -**default**: `#!python None` - - - -**schema definition**: -```json -{ - "examples": [ - [ - "tests/data/mock_dgv_annotation.txt" - ] - ], - "items": { - "type": "string" - }, - "minItems": 1, - "type": "array" -} -``` - - -## reference.masking - -**type**: `#!python List[str]` - -**default**: `#!python None` - - - -**schema definition**: -```json -{ - "examples": [ - [ - "tests/data/mock_masking.tab" - ] - ], - "items": { - "type": "string" - }, - "minItems": 1, - "type": "array" -} -``` - - -## reference.reference_genome - -**type**: `#!python List[str]` - -**default**: `#!python None` - - - -**schema definition**: -```json -{ - "examples": [ - [ - "tests/data/mock_reference_genome.fa" - ] - ], - "items": { - "type": "string" - }, - "minItems": 1, - "type": "array" -} -``` - - -## reference.template_metadata - -**type**: `#!python List[str]` - -**default**: `#!python None` - - - -**schema definition**: -```json -{ - "examples": [ - [ - "tests/data/cytoBand.txt" - ] - ], - "items": { - "type": "string" - }, - "minItems": 1, - "type": "array" -} -``` - - -## skip_stage.validate - -**type**: `#!python boolean` - -**default**: `#!python False` - -skip the validation stage of the MAVIS pipeline - -**schema definition**: -```json -{ - "type": "boolean" -} -``` - - -## summary.filter_cdna_synon - -**type**: `#!python boolean` - -**default**: `#!python True` - -Filter all annotations synonymous at the cdna level - -**schema definition**: -```json -{ - "type": "boolean" -} -``` - - -## summary.filter_min_complexity - -**type**: `#!python number` - -**default**: `#!python 0.2` - -Filter event calls based on 
call sequence complexity - -**schema definition**: -```json -{ - "maximum": 1, - "minimum": 0, - "type": "number" -} -``` - - -## summary.filter_min_flanking_reads - -**type**: `#!python int` - -**default**: `#!python 10` - -Minimum number of flanking pairs for a call by flanking pairs - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## summary.filter_min_linking_split_reads - -**type**: `#!python int` - -**default**: `#!python 1` - -Minimum number of linking split reads for a call by split reads - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## summary.filter_min_remapped_reads - -**type**: `#!python int` - -**default**: `#!python 5` - -Minimum number of remapped reads for a call by contig - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## summary.filter_min_spanning_reads - -**type**: `#!python int` - -**default**: `#!python 5` - -Minimum number of spanning reads for a call by spanning reads - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## summary.filter_min_split_reads - -**type**: `#!python int` - -**default**: `#!python 5` - -Minimum number of split reads for a call by split reads - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## summary.filter_protein_synon - -**type**: `#!python boolean` - -**default**: `#!python False` - -Filter all annotations synonymous at the protein level - -**schema definition**: -```json -{ - "type": "boolean" -} -``` - - -## summary.filter_trans_homopolymers - -**type**: `#!python boolean` - -**default**: `#!python True` - -Filter all single bp ins/del/dup events that are in a homopolymer region of at least 3 bps and are not paired to a genomic event - -**schema definition**: -```json -{ - "type": "boolean" -} -``` - - -## validate.aligner - -**type**: `#!python str` - -**default**: `#!python 'blat'` - -The aligner to use to map the contigs/reads back to the reference e.g blat or bwa - -**schema definition**: -```json 
-{ - "enum": [ - "bwa mem", - "blat" - ], - "type": "string" -} -``` - - -## validate.assembly_kmer_size - -**type**: `#!python number` - -**default**: `#!python 0.74` - -The percent of the read length to make kmers for assembly - -**schema definition**: -```json -{ - "maximum": 1, - "minimum": 0, - "type": "number" -} -``` - - -## validate.assembly_max_paths - -**type**: `#!python int` - -**default**: `#!python 8` - -The maximum number of paths to resolve. this is used to limit when there is a messy assembly graph to resolve. the assembly will pre-calculate the number of paths (or putative assemblies) and stop if it is greater than the given setting - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.assembly_min_edge_trim_weight - -**type**: `#!python int` - -**default**: `#!python 3` - -This is used to simplify the debruijn graph before path finding. edges with less than this frequency will be discarded if they are non-cutting, at a fork, or the end of a path - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.assembly_min_exact_match_to_remap - -**type**: `#!python int` - -**default**: `#!python 15` - -The minimum length of exact matches to initiate remapping a read to a contig - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.assembly_min_remap_coverage - -**type**: `#!python number` - -**default**: `#!python 0.9` - -Minimum fraction of the contig sequence which the remapped sequences must align over - -**schema definition**: -```json -{ - "maximum": 1, - "minimum": 0, - "type": "number" -} -``` - - -## validate.assembly_min_remapped_seq - -**type**: `#!python int` - -**default**: `#!python 3` - -The minimum input sequences that must remap for an assembled contig to be used - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.assembly_min_uniq - -**type**: `#!python number` - -**default**: `#!python 0.1` - -Minimum percent uniq required 
to keep separate assembled contigs. if contigs are more similar then the lower scoring, then shorter, contig is dropped - -**schema definition**: -```json -{ - "maximum": 1, - "minimum": 0, - "type": "number" -} -``` - - -## validate.assembly_strand_concordance - -**type**: `#!python number` - -**default**: `#!python 0.51` - -When the number of remapped reads from each strand are compared, the ratio must be above this number to decide on the strand - -**schema definition**: -```json -{ - "maximum": 1, - "minimum": 0, - "type": "number" -} -``` - - -## validate.blat_limit_top_aln - -**type**: `#!python int` - -**default**: `#!python 10` - -Number of results to return from blat (ranking based on score) - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.blat_min_identity - -**type**: `#!python number` - -**default**: `#!python 0.9` - -The minimum percent identity match required for blat results when aligning contigs - -**schema definition**: -```json -{ - "maximum": 1, - "minimum": 0, - "type": "number" -} -``` - - -## validate.call_error - -**type**: `#!python int` - -**default**: `#!python 10` - -Buffer zone for the evidence window - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.clean_aligner_files - -**type**: `#!python boolean` - -**default**: `#!python False` - -Remove the aligner output files after the validation stage is complete. not required for subsequent steps but can be useful in debugging and deep investigation of events - -**schema definition**: -```json -{ - "type": "boolean" -} -``` - - -## validate.contig_aln_max_event_size - -**type**: `#!python int` - -**default**: `#!python 50` - -Relates to determining breakpoints when pairing contig alignments. for any given read in a putative pair the soft clipping is extended to include any events of greater than this size. 
the softclipping is added to the side of the alignment as indicated by the breakpoint we are assigning pairs to - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.contig_aln_merge_inner_anchor - -**type**: `#!python int` - -**default**: `#!python 20` - -The minimum number of consecutive exact match base pairs to not merge events within a contig alignment - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.contig_aln_merge_outer_anchor - -**type**: `#!python int` - -**default**: `#!python 15` - -Minimum consecutively aligned exact matches to anchor an end for merging internal events - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.contig_aln_min_anchor_size - -**type**: `#!python int` - -**default**: `#!python 50` - -The minimum number of aligned bases for a contig (m or =) in order to simplify. do not have to be consecutive - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.contig_aln_min_extend_overlap - -**type**: `#!python int` - -**default**: `#!python 10` - -Minimum number of bases the query coverage interval must be extended by in order to pair alignments as a single split alignment - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.contig_aln_min_query_consumption - -**type**: `#!python number` - -**default**: `#!python 0.9` - -Minimum fraction of the original query sequence that must be used by the read(s) of the alignment - -**schema definition**: -```json -{ - "maximum": 1, - "minimum": 0, - "type": "number" -} -``` - - -## validate.contig_aln_min_score - -**type**: `#!python number` - -**default**: `#!python 0.9` - -Minimum score for a contig to be used as evidence in a call by contig - -**schema definition**: -```json -{ - "maximum": 1, - "minimum": 0, - "type": "number" -} -``` - - -## validate.fetch_min_bin_size - -**type**: `#!python int` - -**default**: `#!python 50` - -The minimum size of 
any bin for reading from a bam file. increasing this number will result in smaller bins being merged or less bins being created (depending on the fetch method) - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.fetch_reads_bins - -**type**: `#!python int` - -**default**: `#!python 5` - -Number of bins to split an evidence window into to ensure more even sampling of high coverage regions - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.fetch_reads_limit - -**type**: `#!python int` - -**default**: `#!python 3000` - -Maximum number of reads, cap, to loop over for any given evidence window - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.filter_secondary_alignments - -**type**: `#!python boolean` - -**default**: `#!python True` - -Filter secondary alignments when gathering read evidence - -**schema definition**: -```json -{ - "type": "boolean" -} -``` - - -## validate.fuzzy_mismatch_number - -**type**: `#!python int` - -**default**: `#!python 1` - -The number of events/mismatches allowed to be considered a fuzzy match - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.max_sc_preceeding_anchor - -**type**: `#!python int` - -**default**: `#!python 6` - -When remapping a softclipped read this determines the amount of softclipping allowed on the side opposite of where we expect it. for example for a softclipped read on a breakpoint with a left orientation this limits the amount of softclipping that is allowed on the right. if this is set to none then there is no limit on softclipping - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.min_anchor_exact - -**type**: `#!python int` - -**default**: `#!python 6` - -Applies to re-aligning softclipped reads to the opposing breakpoint. 
the minimum number of consecutive exact matches to anchor a read to initiate targeted realignment - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.min_anchor_fuzzy - -**type**: `#!python int` - -**default**: `#!python 10` - -Applies to re-aligning softclipped reads to the opposing breakpoint. the minimum length of a fuzzy match to anchor a read to initiate targeted realignment - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.min_anchor_match - -**type**: `#!python number` - -**default**: `#!python 0.9` - -Minimum percent match for a read to be kept as evidence - -**schema definition**: -```json -{ - "maximum": 1, - "minimum": 0, - "type": "number" -} -``` - - -## validate.min_call_complexity - -**type**: `#!python number` - -**default**: `#!python 0.1` - -The minimum complexity score for a call sequence. is an average for non-contig calls. filters low complexity contigs before alignment. see [contig_complexity](#contig_complexity) - -**schema definition**: -```json -{ - "maximum": 1, - "minimum": 0, - "type": "number" -} -``` - - -## validate.min_double_aligned_to_estimate_insertion_size - -**type**: `#!python int` - -**default**: `#!python 2` - -The minimum number of reads which map soft-clipped to both breakpoints to assume the size of the untemplated sequence between the breakpoints is at most the read length - 2 * min_softclipping - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.min_flanking_pairs_resolution - -**type**: `#!python int` - -**default**: `#!python 10` - -The minimum number of flanking reads required to call a breakpoint by flanking evidence - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.min_linking_split_reads - -**type**: `#!python int` - -**default**: `#!python 2` - -The minimum number of split reads which aligned to both breakpoints - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## 
validate.min_mapping_quality - -**type**: `#!python int` - -**default**: `#!python 5` - -The minimum mapping quality of reads to be used as evidence - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.min_non_target_aligned_split_reads - -**type**: `#!python int` - -**default**: `#!python 1` - -The minimum number of split reads aligned to a breakpoint by the input bam and no forced by local alignment to the target region to call a breakpoint by split read evidence - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.min_sample_size_to_apply_percentage - -**type**: `#!python int` - -**default**: `#!python 10` - -Minimum number of aligned bases to compute a match percent. if there are less than this number of aligned bases (match or mismatch) the percent comparator is not used - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.min_softclipping - -**type**: `#!python int` - -**default**: `#!python 6` - -Minimum number of soft-clipped bases required for a read to be used as soft-clipped evidence - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.min_spanning_reads_resolution - -**type**: `#!python int` - -**default**: `#!python 5` - -Minimum number of spanning reads required to call an event by spanning evidence - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.min_splits_reads_resolution - -**type**: `#!python int` - -**default**: `#!python 3` - -Minimum number of split reads required to call a breakpoint by split reads - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.outer_window_min_event_size - -**type**: `#!python int` - -**default**: `#!python 125` - -The minimum size of an event in order for flanking read evidence to be collected - -**schema definition**: -```json -{ - "type": "integer" -} -``` - - -## validate.stdev_count_abnormal - -**type**: `#!python number` - 
-**default**: `#!python 3` - -The number of standard deviations away from the normal considered expected and therefore not qualifying as flanking reads - -**schema definition**: -```json -{ - "type": "number" -} -``` - - -## validate.trans_fetch_reads_limit - -**type**: `#!python Union[int, null]` - -**default**: `#!python 12000` - -Related to [fetch_reads_limit](#fetch_reads_limit). overrides fetch_reads_limit for transcriptome libraries when set. if this has a value of none then fetch_reads_limit will be used for transcriptome libraries instead - -**schema definition**: -```json -{ - "type": [ - "integer", - "null" - ] -} -``` - - -## validate.trans_min_mapping_quality - -**type**: `#!python Union[int, null]` - -**default**: `#!python 0` - -Related to [min_mapping_quality](#min_mapping_quality). overrides the min_mapping_quality if the library is a transcriptome and this is set to any number not none. if this value is none, min_mapping_quality is used for transcriptomes aswell as genomes - -**schema definition**: -```json -{ - "type": [ - "integer", - "null" - ] -} -``` - - -## validate.write_evidence_files - -**type**: `#!python boolean` - -**default**: `#!python True` - -Write the intermediate bam and bed files containing the raw evidence collected and contigs aligned. 
not required for subsequent steps but can be useful in debugging and deep investigation of events - -**schema definition**: -```json -{ - "type": "boolean" -} -``` - - diff --git a/docs/hooks.py b/docs/hooks.py index dcb00448..baa09506 100644 --- a/docs/hooks.py +++ b/docs/hooks.py @@ -14,7 +14,13 @@ def json_to_pytype(record): input_type = record['type'] except TypeError: pass - types = {'string': 'str', 'integer': 'int', 'float': 'float'} + types = { + 'string': 'str', + 'integer': 'int', + 'float': 'float', + 'boolean': 'bool', + 'number': 'float', + } if input_type == 'array': try: @@ -30,6 +36,37 @@ def json_to_pytype(record): return types.get(input_type, input_type) +def list_properties(schema, skip_terms=tuple()): + glossary = {} + for term, defn in schema['properties'].items(): + if term in skip_terms: + continue + typ = json_to_pytype(defn) + desc = defn.get('description', '') + default_value = defn.get('default') + schema_fields = {k: v for k, v in defn.items() if k not in ['description', 'default']} + + if len(schema_fields) > 1: + schema_defn = json.dumps( + schema_fields, + sort_keys=True, + indent=' ', + ) + schema_defn = f'**schema definition**:\n```json\n{schema_defn}\n```\n' + else: + schema_defn = '' + + lines = [ + f'### {term}', + f'**type**: `#!python {typ}`', + f'**default**: `#!python {repr(default_value)}`' if default_value is not None else '', + desc, + schema_defn, + ] + glossary[term] = '\n\n'.join(lines) + return [v for k, v in sorted(glossary.items())] + + def generate_settings_doc(schema_file): with open(schema_file, 'r') as fh: schema = json.load(fh) @@ -38,33 +75,58 @@ def generate_settings_doc(schema_file): title = 'Configurable Settings' fname = os.path.join(dirname, filepath) + + result = [f'\n\n# {title}\n'] + result.append( + dedent( + '''\ + ## Defining Samples/Libraries + + The `libraries` property of the mavis config is required to run the snakemake + workflow. 
This is the section that defines what inputs to use, and what types of + samples are available. + + ```json + { + "libraries": { + "": { } // mapping of library name to library settings + } + } + ``` + + The library specific settings are listed below + ''' + ) + ) + result.extend(list_properties(schema['properties']['libraries']['additionalProperties'])) + result.append( + dedent( + '''\ + ## Defining Conversions + + If the input to MAVIS is raw tool output and has not been pre-converted to the + standard tab delimited format expected by MAVIS then you will need to add + a section to the config to tell mavis how to perform the required conversions + + ```json + { + "convert": { + "": { } // mapping of alias to conversion settings + } + } + ``` + + The conversion specific settings are listed below + ''' + ) + ) + result.extend(list_properties(schema['properties']['convert']['additionalProperties'])) + result.append('\n## General Settings\n') + result.extend(list_properties(schema, ('libraries', 'convert'))) + print('writing:', fname) with open(fname, 'w') as fh: - fh.write(f'\n\n# {title}\n') - glossary = {} - for term, defn in schema['properties'].items(): - if term in ['libraries', 'convert']: - continue - typ = json_to_pytype(defn) - desc = defn.get('description', '') - default_value = defn.get('default') - schema_defn = json.dumps( - {k: v for k, v in defn.items() if k not in ['description', 'default']}, - sort_keys=True, - indent=' ', - ) - schema_defn = f'**schema definition**:\n```json\n{schema_defn}\n```\n' - - lines = [ - f'## {term}', - f'**type**: `#!python {typ}`', - f'**default**: `#!python {repr(default_value)}`', - desc, - schema_defn, - ] - glossary[term] = '\n\n'.join(lines) - for term, defn in sorted(glossary.items()): - fh.write(f'{defn}\n\n') + fh.write('\n\n'.join(result) + '\n') def build_package_docs(config): From 325b188f357328e6f2d1012e6f2840074d45da63 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 20 Apr 2021 17:52:57 -0700 
Subject: [PATCH 013/137] Add more descriptions --- docs/tutorials/mini.md | 2 +- mavis/schemas/config.json | 44 ++++++++++++++++++++++++++------------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/docs/tutorials/mini.md b/docs/tutorials/mini.md index bb7f00f2..657dac26 100644 --- a/docs/tutorials/mini.md +++ b/docs/tutorials/mini.md @@ -30,7 +30,7 @@ mavis/schemas/config.json Now you are ready to run MAVIS. This can be done in a single command using snakemake. ```bash -snakemake -j 1 --configfig tests/mini-tutorial.config.json +snakemake -j 1 --configfile=tests/mini-tutorial.config.json ``` Which will run the mini tutorial version and output files into a folder called `output_dir` in the diff --git a/mavis/schemas/config.json b/mavis/schemas/config.json index 9769754c..a948cd77 100644 --- a/mavis/schemas/config.json +++ b/mavis/schemas/config.json @@ -171,7 +171,8 @@ "type": "string" }, "minItems": 1, - "type": "array" + "type": "array", + "description": "List of input files" }, "strand_specific": { "default": false, @@ -189,12 +190,14 @@ "illustrate.domain_color": { "default": "#ccccb3", "description": "Domain fill color", - "type": "string" + "type": "string", + "pattern": "^#[a-zA-Z0-9]{6}" }, "illustrate.domain_mismatch_color": { "default": "#b2182b", "description": "Domain fill color on 0%% match", - "type": "string" + "type": "string", + "pattern": "^#[a-zA-Z0-9]{6}" }, "illustrate.domain_name_regex_filter": { "default": "^PF\\d+$", @@ -204,7 +207,8 @@ "illustrate.domain_scaffold_color": { "default": "#000000", "description": "The color of the domain scaffold", - "type": "string" + "type": "string", + "pattern": "^#[a-zA-Z0-9]{6}" }, "illustrate.drawing_width_iter_increase": { "default": 500, @@ -290,7 +294,8 @@ "illustrate.breakpoint_color": { "default": "#000000", "description": "Breakpoint outline color", - "type": "string" + "type": "string", + "pattern": "^#[a-zA-Z0-9]{6}" }, "libraries": { "additionalProperties": { @@ -301,15 +306,17 
@@ "type": "string" }, "minItems": 1, - "type": "array" + "type": "array", + "description": "List of input files or conversion aliases that should be processed for this library" }, "total_batches": { "type": "integer", "min": 1, - "description": "The number of jobs to slit a library into for cluster/validate/annotate" + "description": "The number of jobs to split a library into for cluster/validate/annotate. This will be set during initialization of the config if not given" }, "bam_file": { - "type": "string" + "type": "string", + "description": "Path to the bam file containing the sequencing reads for this library" }, "disease_status": { "enum": [ @@ -319,7 +326,8 @@ "type": "string" }, "median_fragment_size": { - "type": "integer" + "type": "integer", + "description": "The median fragment size in the paired-end read library. This will be computed from the bam during initialization of the config if not given" }, "protocol": { "enum": [ @@ -329,10 +337,12 @@ "type": "string" }, "read_length": { - "type": "integer" + "type": "integer", + "description": "The read length in the paired-end read library. This will be computed from the bam during initialization of the config if not given" }, "stdev_fragment_size": { - "type": "integer" + "type": "integer", + "description": "The standard deviation of fragment size in the paired-end read library. 
This will be computed from the bam during initialization of the config if not given" }, "strand_determining_read": { "default": 2, @@ -366,7 +376,8 @@ "type": "string" }, "output_dir": { - "type": "string" + "type": "string", + "description": "path to the directory to output the MAVIS files to" }, "pairing.contig_call_distance": { "default": 10, @@ -402,7 +413,8 @@ }, "maxItems": 1, "minItems": 1, - "type": "array" + "type": "array", + "description": "The reference genome file used by the aligner" }, "reference.annotations": { "examples": [ @@ -412,7 +424,8 @@ "type": "string" }, "minItems": 1, - "type": "array" + "type": "array", + "description": "The reference file containing gene/transcript position information" }, "reference.dgv_annotation": { "examples": [ @@ -436,7 +449,8 @@ "type": "string" }, "minItems": 1, - "type": "array" + "type": "array", + "description": "A list of regions to ignore in validation. Generally these are centromeres and telomeres or known poor mapping areas" }, "reference.reference_genome": { "examples": [ From edb5e8a1564f65145e7584da313c6e638e65a2c8 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 20 Apr 2021 17:58:14 -0700 Subject: [PATCH 014/137] Remove commented out code --- mavis/breakpoint.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mavis/breakpoint.py b/mavis/breakpoint.py index dfdda6a5..10218017 100644 --- a/mavis/breakpoint.py +++ b/mavis/breakpoint.py @@ -87,17 +87,6 @@ class BreakpointPair: untemplated_seq: Optional[str] data: Dict - # def __getattr__(self, attr): - # data = object.__getattribute__(self, 'data') - # try: - # return data[COLUMNS[attr]] - # except (KeyError, AttributeError): - # try: - # return data[attr] - # except KeyError: - # pass - # raise AttributeError(attr) - def __getitem__(self, index): try: index = int(index) From 5c318f767916449eb53fa656d78e2d83c9b8d686 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 20 Apr 2021 18:16:26 -0700 Subject: [PATCH 015/137] Try 
splitting workflows --- .github/workflows/build.yml | 31 ++++++------------ .github/workflows/quick-tests.yml | 53 +++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/quick-tests.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 37c862f0..411b1e07 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -3,7 +3,12 @@ name: build -on: [push, pull_request] +on: + push: + branches: + - master + - develop + pull_request: jobs: build: @@ -11,6 +16,7 @@ jobs: strategy: matrix: python-version: [3.6, 3.7, 3.8] + name: python-${{ matrix.python-version }} steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} @@ -21,16 +27,6 @@ jobs: run: | python -m pip install --upgrade pip setuptools pip install .[test] - - name: Lint with flake8 - run: | - pip install flake8 - # stop the build if there are Python syntax errors or undefined names - flake8 mavis --count --select=E9,F63,F7,F82 --show-source --statistics - - name: Lint with black - run: | - pip install black - # stop the build if black needs to be run - black mavis -S -l 100 --check - name: install bwa run: | git clone https://github.com/lh3/bwa.git @@ -42,21 +38,14 @@ jobs: run: | wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat chmod a+x blat - - name: run short tests with pytest - run: | - export PATH=$PATH:$(pwd):$(pwd)/bwa - pytest tests -v \ - --junitxml=junit/test-results-${{ matrix.python-version }}.xml \ - --durations=10 - env: - RUN_FULL: 0 - if: github.event_name != 'pull_request' - name: set up .pth file run: | python tests/setup_subprocess_cov.py - name: run full tests with pytest run: | export PATH=$PATH:$(pwd):$(pwd)/bwa + export COVERAGE_PROCESS_START=$(pwd)/.coveragerc + pytest tests -v \ --junitxml=junit/test-results-${{ matrix.python-version }}.xml \ --cov mavis \ @@ -66,7 +55,6 @@ jobs: --cov-branch env: RUN_FULL: 1 - if: 
github.event_name == 'pull_request' - name: Upload pytest test results uses: actions/upload-artifact@master with: @@ -83,4 +71,3 @@ jobs: env_vars: OS,PYTHON name: codecov-umbrella fail_ci_if_error: true - if: matrix.python-version == 3.7 && github.event_name == 'pull_request' diff --git a/.github/workflows/quick-tests.yml b/.github/workflows/quick-tests.yml new file mode 100644 index 00000000..ffadaebc --- /dev/null +++ b/.github/workflows/quick-tests.yml @@ -0,0 +1,53 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: quick-tests + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + name: python-${{ matrix.python-version }} quick + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip setuptools + pip install .[test] + - name: Lint with flake8 + run: | + pip install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 mavis --count --select=E9,F63,F7,F82 --show-source --statistics + - name: Lint with black + run: | + pip install black + # stop the build if black needs to be run + black mavis -S -l 100 --check + - name: install bwa + run: | + git clone https://github.com/lh3/bwa.git + cd bwa + git checkout v0.7.17 + make + cd .. 
+ - name: install blat + run: | + wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat + chmod a+x blat + - name: run short tests with pytest + run: | + export PATH=$PATH:$(pwd):$(pwd)/bwa + pytest tests -v \ + --junitxml=junit/test-results-${{ matrix.python-version }}.xml \ + --durations=10 + env: + RUN_FULL: 0 From a181d6a5b4adb1307be40c07137e9db8b36ad6ea Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 20 Apr 2021 18:29:11 -0700 Subject: [PATCH 016/137] Only report coverage once --- .github/workflows/build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 411b1e07..855df03b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -71,3 +71,4 @@ jobs: env_vars: OS,PYTHON name: codecov-umbrella fail_ci_if_error: true + if: matrix.python-version == 3.8 From be3772e572b169d60ec836e7843758a5896e113c Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 20 Apr 2021 21:42:05 -0700 Subject: [PATCH 017/137] Fix coverage issues --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 855df03b..e8107623 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -26,7 +26,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip setuptools - pip install .[test] + pip install -e .[test] # need editable to make sure the coverage reports correctly - name: install bwa run: | git clone https://github.com/lh3/bwa.git From 890cf05ca5d6b0646ea55a7b80174e445c5a184e Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 21 Apr 2021 12:06:28 -0700 Subject: [PATCH 018/137] Use src folder for code --- .github/workflows/quick-tests.yml | 4 ++-- MANIFEST.in | 8 +------- Snakefile | 2 +- docs/hooks.py | 4 ++-- setup.py | 3 ++- {mavis => src/mavis}/__init__.py | 0 {mavis => src/mavis}/align.py | 0 {mavis => 
src/mavis}/annotate/__init__.py | 0 {mavis => src/mavis}/annotate/base.py | 0 {mavis => src/mavis}/annotate/constants.py | 0 {mavis => src/mavis}/annotate/file_io.py | 0 {mavis => src/mavis}/annotate/fusion.py | 0 {mavis => src/mavis}/annotate/genomic.py | 0 {mavis => src/mavis}/annotate/main.py | 0 {mavis => src/mavis}/annotate/protein.py | 0 {mavis => src/mavis}/annotate/splicing.py | 0 {mavis => src/mavis}/annotate/variant.py | 0 {mavis => src/mavis}/assemble.py | 0 {mavis => src/mavis}/bam/__init__.py | 0 {mavis => src/mavis}/bam/cache.py | 0 {mavis => src/mavis}/bam/cigar.py | 0 {mavis => src/mavis}/bam/read.py | 0 {mavis => src/mavis}/bam/stats.py | 0 {mavis => src/mavis}/blat.py | 0 {mavis => src/mavis}/breakpoint.py | 0 {mavis => src/mavis}/cluster/__init__.py | 0 {mavis => src/mavis}/cluster/cluster.py | 0 {mavis => src/mavis}/cluster/main.py | 0 {mavis => src/mavis}/config.py | 0 {mavis => src/mavis}/constants.py | 0 {mavis => src/mavis}/error.py | 0 {mavis => src/mavis}/illustrate/__init__.py | 0 {mavis => src/mavis}/illustrate/constants.py | 0 {mavis => src/mavis}/illustrate/diagram.py | 0 {mavis => src/mavis}/illustrate/elements.py | 0 {mavis => src/mavis}/illustrate/scatter.py | 0 {mavis => src/mavis}/illustrate/util.py | 0 {mavis => src/mavis}/interval.py | 0 {mavis => src/mavis}/main.py | 0 {mavis => src/mavis}/overlay.py | 0 {mavis => src/mavis}/pairing/__init__.py | 0 {mavis => src/mavis}/pairing/constants.py | 0 {mavis => src/mavis}/pairing/main.py | 0 {mavis => src/mavis}/pairing/pairing.py | 0 {mavis => src/mavis}/schemas/__init__.py | 0 {mavis => src/mavis}/schemas/config.json | 0 {mavis => src/mavis}/schemas/overlay.json | 0 {mavis => src/mavis}/summary/__init__.py | 0 {mavis => src/mavis}/summary/constants.py | 0 {mavis => src/mavis}/summary/main.py | 0 {mavis => src/mavis}/summary/summary.py | 0 {mavis => src/mavis}/tools/__init__.py | 0 {mavis => src/mavis}/tools/breakdancer.py | 0 {mavis => src/mavis}/tools/chimerascan.py | 0 {mavis => 
src/mavis}/tools/cnvnator.py | 0 {mavis => src/mavis}/tools/constants.py | 0 {mavis => src/mavis}/tools/starfusion.py | 0 {mavis => src/mavis}/tools/transabyss.py | 0 {mavis => src/mavis}/tools/vcf.py | 0 {mavis => src/mavis}/util.py | 0 {mavis => src/mavis}/validate/__init__.py | 0 {mavis => src/mavis}/validate/base.py | 0 {mavis => src/mavis}/validate/call.py | 0 {mavis => src/mavis}/validate/constants.py | 0 {mavis => src/mavis}/validate/evidence.py | 0 {mavis => src/mavis}/validate/main.py | 0 {tab => src/tab}/__init__.py | 0 {tab => src/tab}/tab.py | 0 {tools => src/tools}/TSV.pm | 0 src/tools/__init__.py | 0 {tools => src/tools}/calculate_ref_alt_counts.py | 0 {tools => src/tools}/find_repeats.py | 0 {tools => src/tools}/generate_ensembl_json.py | 0 {tools => src/tools}/get_hg19_reference_files.sh | 0 74 files changed, 8 insertions(+), 13 deletions(-) rename {mavis => src/mavis}/__init__.py (100%) rename {mavis => src/mavis}/align.py (100%) rename {mavis => src/mavis}/annotate/__init__.py (100%) rename {mavis => src/mavis}/annotate/base.py (100%) rename {mavis => src/mavis}/annotate/constants.py (100%) rename {mavis => src/mavis}/annotate/file_io.py (100%) rename {mavis => src/mavis}/annotate/fusion.py (100%) rename {mavis => src/mavis}/annotate/genomic.py (100%) rename {mavis => src/mavis}/annotate/main.py (100%) rename {mavis => src/mavis}/annotate/protein.py (100%) rename {mavis => src/mavis}/annotate/splicing.py (100%) rename {mavis => src/mavis}/annotate/variant.py (100%) rename {mavis => src/mavis}/assemble.py (100%) rename {mavis => src/mavis}/bam/__init__.py (100%) rename {mavis => src/mavis}/bam/cache.py (100%) rename {mavis => src/mavis}/bam/cigar.py (100%) rename {mavis => src/mavis}/bam/read.py (100%) rename {mavis => src/mavis}/bam/stats.py (100%) rename {mavis => src/mavis}/blat.py (100%) rename {mavis => src/mavis}/breakpoint.py (100%) rename {mavis => src/mavis}/cluster/__init__.py (100%) rename {mavis => src/mavis}/cluster/cluster.py (100%) 
rename {mavis => src/mavis}/cluster/main.py (100%) rename {mavis => src/mavis}/config.py (100%) rename {mavis => src/mavis}/constants.py (100%) rename {mavis => src/mavis}/error.py (100%) rename {mavis => src/mavis}/illustrate/__init__.py (100%) rename {mavis => src/mavis}/illustrate/constants.py (100%) rename {mavis => src/mavis}/illustrate/diagram.py (100%) rename {mavis => src/mavis}/illustrate/elements.py (100%) rename {mavis => src/mavis}/illustrate/scatter.py (100%) rename {mavis => src/mavis}/illustrate/util.py (100%) rename {mavis => src/mavis}/interval.py (100%) rename {mavis => src/mavis}/main.py (100%) rename {mavis => src/mavis}/overlay.py (100%) rename {mavis => src/mavis}/pairing/__init__.py (100%) rename {mavis => src/mavis}/pairing/constants.py (100%) rename {mavis => src/mavis}/pairing/main.py (100%) rename {mavis => src/mavis}/pairing/pairing.py (100%) rename {mavis => src/mavis}/schemas/__init__.py (100%) rename {mavis => src/mavis}/schemas/config.json (100%) rename {mavis => src/mavis}/schemas/overlay.json (100%) rename {mavis => src/mavis}/summary/__init__.py (100%) rename {mavis => src/mavis}/summary/constants.py (100%) rename {mavis => src/mavis}/summary/main.py (100%) rename {mavis => src/mavis}/summary/summary.py (100%) rename {mavis => src/mavis}/tools/__init__.py (100%) rename {mavis => src/mavis}/tools/breakdancer.py (100%) rename {mavis => src/mavis}/tools/chimerascan.py (100%) rename {mavis => src/mavis}/tools/cnvnator.py (100%) rename {mavis => src/mavis}/tools/constants.py (100%) rename {mavis => src/mavis}/tools/starfusion.py (100%) rename {mavis => src/mavis}/tools/transabyss.py (100%) rename {mavis => src/mavis}/tools/vcf.py (100%) rename {mavis => src/mavis}/util.py (100%) rename {mavis => src/mavis}/validate/__init__.py (100%) rename {mavis => src/mavis}/validate/base.py (100%) rename {mavis => src/mavis}/validate/call.py (100%) rename {mavis => src/mavis}/validate/constants.py (100%) rename {mavis => 
src/mavis}/validate/evidence.py (100%) rename {mavis => src/mavis}/validate/main.py (100%) rename {tab => src/tab}/__init__.py (100%) rename {tab => src/tab}/tab.py (100%) rename {tools => src/tools}/TSV.pm (100%) create mode 100644 src/tools/__init__.py rename {tools => src/tools}/calculate_ref_alt_counts.py (100%) rename {tools => src/tools}/find_repeats.py (100%) rename {tools => src/tools}/generate_ensembl_json.py (100%) rename {tools => src/tools}/get_hg19_reference_files.sh (100%) diff --git a/.github/workflows/quick-tests.yml b/.github/workflows/quick-tests.yml index ffadaebc..e6f6a917 100644 --- a/.github/workflows/quick-tests.yml +++ b/.github/workflows/quick-tests.yml @@ -26,12 +26,12 @@ jobs: run: | pip install flake8 # stop the build if there are Python syntax errors or undefined names - flake8 mavis --count --select=E9,F63,F7,F82 --show-source --statistics + flake8 src/mavis --count --select=E9,F63,F7,F82 --show-source --statistics - name: Lint with black run: | pip install black # stop the build if black needs to be run - black mavis -S -l 100 --check + black src/mavis -S -l 100 --check - name: install bwa run: | git clone https://github.com/lh3/bwa.git diff --git a/MANIFEST.in b/MANIFEST.in index 165d54e6..8b270b97 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,11 +1,5 @@ -recursive-include docs * -recursive-include tests *.py -include tests/*/data/* -recursive-include mavis *.py *.json -recursive-include tools *.pl *.py *.pm -recursive-include tab *.py +recursive-include src *.py *.json include README.md include LICENSE.txt -include mavis/config-schema.json prune docs/build prune docs/source/auto diff --git a/Snakefile b/Snakefile index 7710c0d9..351e0ab9 100644 --- a/Snakefile +++ b/Snakefile @@ -18,7 +18,7 @@ try: # TODO: replace with URL so that the user does not need a copy of the config schema validate( config, - os.path.join(os.getcwd(), 'mavis/schemas/config.json') + os.path.join(os.getcwd(), 'src/mavis/schemas/config.json') ) for key in [ 
"libraries", diff --git a/docs/hooks.py b/docs/hooks.py index baa09506..44931755 100644 --- a/docs/hooks.py +++ b/docs/hooks.py @@ -130,9 +130,9 @@ def generate_settings_doc(schema_file): def build_package_docs(config): - schema_file = os.path.join(os.path.dirname(__file__), '../mavis/schemas/config.json') + schema_file = os.path.join(os.path.dirname(__file__), '../src/mavis/schemas/config.json') generate_settings_doc(schema_file) - package_dir = os.path.join(os.path.dirname(__file__), '../mavis') + package_dir = os.path.join(os.path.dirname(__file__), '../src/mavis') output_dir = os.path.join(os.path.dirname(__file__), 'package') extract_to_markdown( diff --git a/setup.py b/setup.py index dd23fb29..53374dcb 100644 --- a/setup.py +++ b/setup.py @@ -101,7 +101,8 @@ def check_nonpython_dependencies(): version='{}'.format(VERSION), url='https://github.com/bcgsc/mavis.git', download_url='https://github.com/bcgsc/mavis/archive/v{}.tar.gz'.format(VERSION), - packages=find_packages(exclude=['tests']), + package_dir={'': 'src'}, + packages=find_packages(where='src'), description='A Structural Variant Post-Processing Package', long_description=parse_md_readme(), install_requires=INSTALL_REQS, diff --git a/mavis/__init__.py b/src/mavis/__init__.py similarity index 100% rename from mavis/__init__.py rename to src/mavis/__init__.py diff --git a/mavis/align.py b/src/mavis/align.py similarity index 100% rename from mavis/align.py rename to src/mavis/align.py diff --git a/mavis/annotate/__init__.py b/src/mavis/annotate/__init__.py similarity index 100% rename from mavis/annotate/__init__.py rename to src/mavis/annotate/__init__.py diff --git a/mavis/annotate/base.py b/src/mavis/annotate/base.py similarity index 100% rename from mavis/annotate/base.py rename to src/mavis/annotate/base.py diff --git a/mavis/annotate/constants.py b/src/mavis/annotate/constants.py similarity index 100% rename from mavis/annotate/constants.py rename to src/mavis/annotate/constants.py diff --git 
a/mavis/annotate/file_io.py b/src/mavis/annotate/file_io.py similarity index 100% rename from mavis/annotate/file_io.py rename to src/mavis/annotate/file_io.py diff --git a/mavis/annotate/fusion.py b/src/mavis/annotate/fusion.py similarity index 100% rename from mavis/annotate/fusion.py rename to src/mavis/annotate/fusion.py diff --git a/mavis/annotate/genomic.py b/src/mavis/annotate/genomic.py similarity index 100% rename from mavis/annotate/genomic.py rename to src/mavis/annotate/genomic.py diff --git a/mavis/annotate/main.py b/src/mavis/annotate/main.py similarity index 100% rename from mavis/annotate/main.py rename to src/mavis/annotate/main.py diff --git a/mavis/annotate/protein.py b/src/mavis/annotate/protein.py similarity index 100% rename from mavis/annotate/protein.py rename to src/mavis/annotate/protein.py diff --git a/mavis/annotate/splicing.py b/src/mavis/annotate/splicing.py similarity index 100% rename from mavis/annotate/splicing.py rename to src/mavis/annotate/splicing.py diff --git a/mavis/annotate/variant.py b/src/mavis/annotate/variant.py similarity index 100% rename from mavis/annotate/variant.py rename to src/mavis/annotate/variant.py diff --git a/mavis/assemble.py b/src/mavis/assemble.py similarity index 100% rename from mavis/assemble.py rename to src/mavis/assemble.py diff --git a/mavis/bam/__init__.py b/src/mavis/bam/__init__.py similarity index 100% rename from mavis/bam/__init__.py rename to src/mavis/bam/__init__.py diff --git a/mavis/bam/cache.py b/src/mavis/bam/cache.py similarity index 100% rename from mavis/bam/cache.py rename to src/mavis/bam/cache.py diff --git a/mavis/bam/cigar.py b/src/mavis/bam/cigar.py similarity index 100% rename from mavis/bam/cigar.py rename to src/mavis/bam/cigar.py diff --git a/mavis/bam/read.py b/src/mavis/bam/read.py similarity index 100% rename from mavis/bam/read.py rename to src/mavis/bam/read.py diff --git a/mavis/bam/stats.py b/src/mavis/bam/stats.py similarity index 100% rename from 
mavis/bam/stats.py rename to src/mavis/bam/stats.py diff --git a/mavis/blat.py b/src/mavis/blat.py similarity index 100% rename from mavis/blat.py rename to src/mavis/blat.py diff --git a/mavis/breakpoint.py b/src/mavis/breakpoint.py similarity index 100% rename from mavis/breakpoint.py rename to src/mavis/breakpoint.py diff --git a/mavis/cluster/__init__.py b/src/mavis/cluster/__init__.py similarity index 100% rename from mavis/cluster/__init__.py rename to src/mavis/cluster/__init__.py diff --git a/mavis/cluster/cluster.py b/src/mavis/cluster/cluster.py similarity index 100% rename from mavis/cluster/cluster.py rename to src/mavis/cluster/cluster.py diff --git a/mavis/cluster/main.py b/src/mavis/cluster/main.py similarity index 100% rename from mavis/cluster/main.py rename to src/mavis/cluster/main.py diff --git a/mavis/config.py b/src/mavis/config.py similarity index 100% rename from mavis/config.py rename to src/mavis/config.py diff --git a/mavis/constants.py b/src/mavis/constants.py similarity index 100% rename from mavis/constants.py rename to src/mavis/constants.py diff --git a/mavis/error.py b/src/mavis/error.py similarity index 100% rename from mavis/error.py rename to src/mavis/error.py diff --git a/mavis/illustrate/__init__.py b/src/mavis/illustrate/__init__.py similarity index 100% rename from mavis/illustrate/__init__.py rename to src/mavis/illustrate/__init__.py diff --git a/mavis/illustrate/constants.py b/src/mavis/illustrate/constants.py similarity index 100% rename from mavis/illustrate/constants.py rename to src/mavis/illustrate/constants.py diff --git a/mavis/illustrate/diagram.py b/src/mavis/illustrate/diagram.py similarity index 100% rename from mavis/illustrate/diagram.py rename to src/mavis/illustrate/diagram.py diff --git a/mavis/illustrate/elements.py b/src/mavis/illustrate/elements.py similarity index 100% rename from mavis/illustrate/elements.py rename to src/mavis/illustrate/elements.py diff --git a/mavis/illustrate/scatter.py 
b/src/mavis/illustrate/scatter.py similarity index 100% rename from mavis/illustrate/scatter.py rename to src/mavis/illustrate/scatter.py diff --git a/mavis/illustrate/util.py b/src/mavis/illustrate/util.py similarity index 100% rename from mavis/illustrate/util.py rename to src/mavis/illustrate/util.py diff --git a/mavis/interval.py b/src/mavis/interval.py similarity index 100% rename from mavis/interval.py rename to src/mavis/interval.py diff --git a/mavis/main.py b/src/mavis/main.py similarity index 100% rename from mavis/main.py rename to src/mavis/main.py diff --git a/mavis/overlay.py b/src/mavis/overlay.py similarity index 100% rename from mavis/overlay.py rename to src/mavis/overlay.py diff --git a/mavis/pairing/__init__.py b/src/mavis/pairing/__init__.py similarity index 100% rename from mavis/pairing/__init__.py rename to src/mavis/pairing/__init__.py diff --git a/mavis/pairing/constants.py b/src/mavis/pairing/constants.py similarity index 100% rename from mavis/pairing/constants.py rename to src/mavis/pairing/constants.py diff --git a/mavis/pairing/main.py b/src/mavis/pairing/main.py similarity index 100% rename from mavis/pairing/main.py rename to src/mavis/pairing/main.py diff --git a/mavis/pairing/pairing.py b/src/mavis/pairing/pairing.py similarity index 100% rename from mavis/pairing/pairing.py rename to src/mavis/pairing/pairing.py diff --git a/mavis/schemas/__init__.py b/src/mavis/schemas/__init__.py similarity index 100% rename from mavis/schemas/__init__.py rename to src/mavis/schemas/__init__.py diff --git a/mavis/schemas/config.json b/src/mavis/schemas/config.json similarity index 100% rename from mavis/schemas/config.json rename to src/mavis/schemas/config.json diff --git a/mavis/schemas/overlay.json b/src/mavis/schemas/overlay.json similarity index 100% rename from mavis/schemas/overlay.json rename to src/mavis/schemas/overlay.json diff --git a/mavis/summary/__init__.py b/src/mavis/summary/__init__.py similarity index 100% rename from 
mavis/summary/__init__.py rename to src/mavis/summary/__init__.py diff --git a/mavis/summary/constants.py b/src/mavis/summary/constants.py similarity index 100% rename from mavis/summary/constants.py rename to src/mavis/summary/constants.py diff --git a/mavis/summary/main.py b/src/mavis/summary/main.py similarity index 100% rename from mavis/summary/main.py rename to src/mavis/summary/main.py diff --git a/mavis/summary/summary.py b/src/mavis/summary/summary.py similarity index 100% rename from mavis/summary/summary.py rename to src/mavis/summary/summary.py diff --git a/mavis/tools/__init__.py b/src/mavis/tools/__init__.py similarity index 100% rename from mavis/tools/__init__.py rename to src/mavis/tools/__init__.py diff --git a/mavis/tools/breakdancer.py b/src/mavis/tools/breakdancer.py similarity index 100% rename from mavis/tools/breakdancer.py rename to src/mavis/tools/breakdancer.py diff --git a/mavis/tools/chimerascan.py b/src/mavis/tools/chimerascan.py similarity index 100% rename from mavis/tools/chimerascan.py rename to src/mavis/tools/chimerascan.py diff --git a/mavis/tools/cnvnator.py b/src/mavis/tools/cnvnator.py similarity index 100% rename from mavis/tools/cnvnator.py rename to src/mavis/tools/cnvnator.py diff --git a/mavis/tools/constants.py b/src/mavis/tools/constants.py similarity index 100% rename from mavis/tools/constants.py rename to src/mavis/tools/constants.py diff --git a/mavis/tools/starfusion.py b/src/mavis/tools/starfusion.py similarity index 100% rename from mavis/tools/starfusion.py rename to src/mavis/tools/starfusion.py diff --git a/mavis/tools/transabyss.py b/src/mavis/tools/transabyss.py similarity index 100% rename from mavis/tools/transabyss.py rename to src/mavis/tools/transabyss.py diff --git a/mavis/tools/vcf.py b/src/mavis/tools/vcf.py similarity index 100% rename from mavis/tools/vcf.py rename to src/mavis/tools/vcf.py diff --git a/mavis/util.py b/src/mavis/util.py similarity index 100% rename from mavis/util.py rename to 
src/mavis/util.py diff --git a/mavis/validate/__init__.py b/src/mavis/validate/__init__.py similarity index 100% rename from mavis/validate/__init__.py rename to src/mavis/validate/__init__.py diff --git a/mavis/validate/base.py b/src/mavis/validate/base.py similarity index 100% rename from mavis/validate/base.py rename to src/mavis/validate/base.py diff --git a/mavis/validate/call.py b/src/mavis/validate/call.py similarity index 100% rename from mavis/validate/call.py rename to src/mavis/validate/call.py diff --git a/mavis/validate/constants.py b/src/mavis/validate/constants.py similarity index 100% rename from mavis/validate/constants.py rename to src/mavis/validate/constants.py diff --git a/mavis/validate/evidence.py b/src/mavis/validate/evidence.py similarity index 100% rename from mavis/validate/evidence.py rename to src/mavis/validate/evidence.py diff --git a/mavis/validate/main.py b/src/mavis/validate/main.py similarity index 100% rename from mavis/validate/main.py rename to src/mavis/validate/main.py diff --git a/tab/__init__.py b/src/tab/__init__.py similarity index 100% rename from tab/__init__.py rename to src/tab/__init__.py diff --git a/tab/tab.py b/src/tab/tab.py similarity index 100% rename from tab/tab.py rename to src/tab/tab.py diff --git a/tools/TSV.pm b/src/tools/TSV.pm similarity index 100% rename from tools/TSV.pm rename to src/tools/TSV.pm diff --git a/src/tools/__init__.py b/src/tools/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tools/calculate_ref_alt_counts.py b/src/tools/calculate_ref_alt_counts.py similarity index 100% rename from tools/calculate_ref_alt_counts.py rename to src/tools/calculate_ref_alt_counts.py diff --git a/tools/find_repeats.py b/src/tools/find_repeats.py similarity index 100% rename from tools/find_repeats.py rename to src/tools/find_repeats.py diff --git a/tools/generate_ensembl_json.py b/src/tools/generate_ensembl_json.py similarity index 100% rename from tools/generate_ensembl_json.py 
rename to src/tools/generate_ensembl_json.py diff --git a/tools/get_hg19_reference_files.sh b/src/tools/get_hg19_reference_files.sh similarity index 100% rename from tools/get_hg19_reference_files.sh rename to src/tools/get_hg19_reference_files.sh From 47f22bbc89787f5415acc3446c81895ab83df50d Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 21 Apr 2021 12:10:52 -0700 Subject: [PATCH 019/137] Add docker container --- .github/workflows/quick-tests.yml | 11 ++++++++++ Dockerfile | 35 +++++++++++++++++++++++++++++++ Snakefile | 2 +- 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 Dockerfile diff --git a/.github/workflows/quick-tests.yml b/.github/workflows/quick-tests.yml index e6f6a917..689ea8fb 100644 --- a/.github/workflows/quick-tests.yml +++ b/.github/workflows/quick-tests.yml @@ -51,3 +51,14 @@ jobs: --durations=10 env: RUN_FULL: 0 + docker: + runs-on: ubuntu-latest + name: docker build + steps: + - uses: actions/checkout@v2 + - name: build the docker container + run: | + docker build --file Dockerfile --tag bcgsc/mavis . + - name: test the help menu + run: | + docker run bcgsc/mavis -h diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..3f2ef284 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,35 @@ +FROM python:3.7-slim-buster + +WORKDIR /app + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y git wget make gcc libz-dev + +# pysam dependencies +RUN apt-get install -y libncurses5-dev zlib1g-dev libbz2-dev libncursesw5-dev liblzma-dev + +# install BWA +RUN git clone https://github.com/lh3/bwa.git && \ + cd bwa && \ + git checkout v0.7.17 && \ + make && \ + cd .. 
&& \ + mv bwa/bwa /usr/local/bin + +# install blat +RUN wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat && \ + chmod a+x blat && \ + mv blat /usr/local/bin + +COPY setup.py setup.py +COPY setup.cfg setup.cfg +COPY src src +COPY LICENSE.txt LICENSE.txt +COPY README.md README.md + +# install python package +RUN pip install -U setuptools pip wheel +RUN pip install . +RUN which mavis +ENTRYPOINT [ "mavis" ] diff --git a/Snakefile b/Snakefile index 351e0ab9..23c0772e 100644 --- a/Snakefile +++ b/Snakefile @@ -6,7 +6,7 @@ import re import json import pandas as pd -CONTAINER = 'creisle/mavis:latest' +CONTAINER = 'bcgsc/mavis:latest' def output_dir(*paths): return os.path.join(config['output_dir'], *paths) From b8e15d081e378d38481eaeeebca9e2dddaf96703 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 21 Apr 2021 12:20:40 -0700 Subject: [PATCH 020/137] Add src prefix to data files --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 53374dcb..5aab6d08 100644 --- a/setup.py +++ b/setup.py @@ -126,7 +126,7 @@ def check_nonpython_dependencies(): ] }, include_package_data=True, - data_files=[('mavis', ['mavis/schemas/config.json', 'mavis/schemas/overlay.json'])], + data_files=[('mavis', ['src/mavis/schemas/config.json', 'src/mavis/schemas/overlay.json'])], project_urls={'mavis': 'http://mavis.bcgsc.ca'}, ) check_nonpython_dependencies() From 5063aa5fffdbe8a6524192ec72af050f3dd2e718 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Fri, 23 Apr 2021 13:33:32 -0700 Subject: [PATCH 021/137] Replace custom tab package with pandas --- docs/migrating.md | 21 + setup.py | 2 +- src/mavis/align.py | 14 +- src/mavis/annotate/constants.py | 23 - src/mavis/annotate/file_io.py | 213 ++--- src/mavis/annotate/main.py | 2 - src/mavis/annotate/splicing.py | 6 +- src/mavis/bam/stats.py | 2 +- src/mavis/blat.py | 41 +- src/mavis/cluster/main.py | 4 +- src/mavis/config.py | 7 +- src/mavis/constants.py | 84 +- 
src/mavis/main.py | 6 +- src/mavis/overlay.py | 4 +- src/mavis/pairing/main.py | 22 +- src/mavis/summary/main.py | 62 +- src/mavis/summary/summary.py | 1 + src/mavis/tools/__init__.py | 23 +- src/mavis/tools/breakdancer.py | 43 +- src/mavis/util.py | 263 ++++-- src/mavis/validate/main.py | 10 +- src/tab/__init__.py | 127 --- src/tab/tab.py | 401 --------- src/tools/calculate_ref_alt_counts.py | 2 +- tests/data/annotations_subsample.tab | 2 +- tests/data/clustering_input.tab | 2 +- ...is_summary_all_mock-A36971_mock-A47933.tab | 2 +- tests/data/mock_masking.tab | 2 +- tests/data/mock_pairing_input.tab | 2 +- .../data/mock_reference_annotations.full.tsv | 2 +- tests/data/mock_reference_annotations.tsv | 2 +- tests/data/mock_sv_events.tsv | 2 +- tests/data/mock_trans_sv_events.tsv | 2 +- tests/data/pairing_annotations.tab | 2 +- .../pairing_reference_annotations_file.tab | 2 +- tests/end_to_end/test_convert.py | 2 +- tests/integration/test_annotate.py | 26 +- tests/integration/test_annotate_examples.py | 13 +- tests/integration/test_bam.py | 15 +- tests/integration/test_splicing.py | 4 +- tests/unit/test_tab.py | 287 ------ tests/unit/test_tool.py | 837 +++++++++--------- tests/unit/test_util.py | 635 ++++++------- 43 files changed, 1286 insertions(+), 1938 deletions(-) create mode 100644 docs/migrating.md delete mode 100644 src/tab/__init__.py delete mode 100755 src/tab/tab.py delete mode 100644 tests/unit/test_tab.py diff --git a/docs/migrating.md b/docs/migrating.md new file mode 100644 index 00000000..213ee00c --- /dev/null +++ b/docs/migrating.md @@ -0,0 +1,21 @@ +# Migrating + +## Migrating from v2 to v3 + +There are major changes from v2 to v3 of MAVIS. + +### Tab File Headers + +Tab file headers no longer start with `#`. Any lines starting with a pound will be treated +as comments. 
This will apply to mavis-style inputs as well as any tab delimited +reference files. + +### Configuration + +MAVIS no longer uses command line arguments, config files, and environment variables for +configuration. Instead all configurable settings are controlled via a single input JSON +config file. + +### Scheduling + +MAVIS is now integrated with snakemake instead of handling its own scheduling. diff --git a/setup.py b/setup.py index 5aab6d08..0245e812 100644 --- a/setup.py +++ b/setup.py @@ -115,7 +115,7 @@ def check_nonpython_dependencies(): }, tests_require=TEST_REQS, setup_requires=['pip>=9.0.0', 'setuptools>=36.0.0'], - python_requires='>=3.2', + python_requires='>=3.6', author='Caralyn Reisle', author_email='creisle@bcgsc.ca', test_suite='tests', diff --git a/src/mavis/align.py b/src/mavis/align.py index be81b28d..dffed765 100644 --- a/src/mavis/align.py +++ b/src/mavis/align.py @@ -13,16 +13,8 @@ from .bam import cigar as _cigar from .bam import read as _read from .breakpoint import Breakpoint, BreakpointPair -from .constants import ( - CIGAR, - COLUMNS, - NA_MAPPING_QUALITY, - ORIENT, - STRAND, - SVTYPE, - MavisNamespace, - reverse_complement, -) +from .constants import (CIGAR, COLUMNS, NA_MAPPING_QUALITY, ORIENT, STRAND, + SVTYPE, MavisNamespace, reverse_complement) from .error import InvalidRearrangement from .interval import Interval from .util import DEVNULL @@ -91,7 +83,7 @@ def query_overlap_extension(self): return total_overlap return 0 - def score(self, consec_bonus=10): + def score(self, consec_bonus=10) -> float: """ scores events between 0 and 1 penalizing events interrupting the alignment.
Counts a split alignment as a single event diff --git a/src/mavis/annotate/constants.py b/src/mavis/annotate/constants.py index 1a4e324d..62882a9b 100644 --- a/src/mavis/annotate/constants.py +++ b/src/mavis/annotate/constants.py @@ -1,33 +1,10 @@ import re -import tab - from ..constants import MavisNamespace, float_fraction PASS_FILENAME = 'annotations.tab' -class SPLICE_TYPE(MavisNamespace): - """ - holds controlled vocabulary for allowed splice type classification values - - Attributes: - RETAIN: an intron was retained - SKIP: an exon was skipped - NORMAL: no exons were skipped and no introns were retained. the normal/expected splicing pattern was followed - MULTI_RETAIN: multiple introns were retained - MULTI_SKIP: multiple exons were skipped - COMPLEX: some combination of exon skipping and intron retention - """ - - RETAIN: str = 'retained intron' - SKIP: str = 'skipped exon' - NORMAL: str = 'normal' - MULTI_RETAIN: str = 'retained multiple introns' - MULTI_SKIP: str = 'skipped multiple exons' - COMPLEX: str = 'complex' - - class SPLICE_SITE_TYPE(MavisNamespace): DONOR: int = 3 ACCEPTOR: int = 5 diff --git a/src/mavis/annotate/file_io.py b/src/mavis/annotate/file_io.py index ed4f45cb..b41489c7 100644 --- a/src/mavis/annotate/file_io.py +++ b/src/mavis/annotate/file_io.py @@ -5,19 +5,21 @@ import os import re import warnings +from typing import Callable, Dict, List, Optional, Tuple -import tab +import pandas as pd from Bio import SeqIO +from Bio.SeqRecord import SeqRecord from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, STRAND, translate from ..interval import Interval -from ..util import DEVNULL, LOG, filepath +from ..util import DEVNULL, LOG, cast_boolean, filepath from .base import BioInterval, ReferenceName from .genomic import Exon, Gene, PreTranscript, Template, Transcript from .protein import Domain, Translation -def load_masking_regions(*filepaths): +def load_masking_regions(*filepaths: str) -> Dict[str, List[BioInterval]]: """ reads 
a file of regions. The expect input format for the file is tab-delimited and the header should contain the following columns @@ -35,23 +37,20 @@ def load_masking_regions(*filepaths): chr20 25600000 27500000 centromere Args: - filepath (str): path to the input tab-delimited file + filepath: path to the input tab-delimited file Returns: - Dict[str,List[BioInterval]]: a dictionary keyed by chromosome name with values of lists of regions on the chromosome - - Example: - >>> m = load_masking_regions('filename') - >>> m['1'] - [BioInterval(), BioInterval(), ...] + a dictionary keyed by chromosome name with values of lists of regions on the chromosome """ - regions = {} + regions: Dict[str, List[BioInterval]] = {} for filepath in filepaths: - _, rows = tab.read_file( - filepath, - require=['chr', 'start', 'end', 'name'], - cast={'start': int, 'end': int, 'chr': ReferenceName}, + df = pd.read_csv( + filepath, sep='\t', dtype={'chr': str, 'start': int, 'end': int, 'name': str} ) - for row in rows: + for col in ['chr', 'start', 'end', 'name']: + if col not in df: + raise KeyError(f'missing required column ({col})') + df['chr'] = df['chr'].apply(lambda c: ReferenceName(c)) + for row in df.to_dict('records'): mask_region = BioInterval( reference_object=row['chr'], start=row['start'], end=row['end'], name=row['name'] ) @@ -59,38 +58,32 @@ def load_masking_regions(*filepaths): return regions -def load_reference_genes(*pos, **kwargs): - """ - *Deprecated* Use :func:`load_annotations` instead - """ - warnings.warn('this function has been replaced by load_annotations', DeprecationWarning) - return load_annotations(*pos, **kwargs) - - -def load_annotations(*filepaths, warn=DEVNULL, reference_genome=None, best_transcripts_only=False): +def load_annotations( + *filepaths: str, + warn: Callable = DEVNULL, + reference_genome: Optional[Dict[str, SeqRecord]] = None, + best_transcripts_only: bool = False, +) -> Dict[str, List[Gene]]: """ loads gene models from an input file. 
Expects a tabbed or json file. Args: - filepath (str): path to the input file - verbose (bool): output extra information to stdout - reference_genome (Dict[str,Bio.SeqRecord]): dict of reference sequence by - template/chr name - filetype (str): json or tab/tsv. only required if the file type can't be interpolated from the path extension + filepath: path to the input file + reference_genome: dict of reference sequence by template/chr name Returns: - Dict[str,List[mavis.annotate.genomic.Gene]]: lists of genes keyed by chromosome name + lists of genes keyed by chromosome name """ - total_annotations = {} + total_annotations: Dict[str, List[Gene]] = {} for filename in filepaths: data = None - if filename.endswith('.tab') or filename.endswith('.tsv'): - data = convert_tab_to_json(filename, warn) - else: + if filename.endswith('.json'): with open(filename) as fh: data = json.load(fh) + else: + data = convert_tab_to_json(filename, warn) current_annotations = parse_annotations_json( data, @@ -105,11 +98,17 @@ def load_annotations(*filepaths, warn=DEVNULL, reference_genome=None, best_trans return total_annotations -def parse_annotations_json(data, reference_genome=None, best_transcripts_only=False, warn=DEVNULL): +def parse_annotations_json( + data, + reference_genome: Optional[Dict[str, SeqRecord]] = None, + best_transcripts_only=False, + warn=DEVNULL, +) -> Dict[str, List[Gene]]: """ parses a json of annotation information into annotation objects """ - genes_by_chr = {} + genes_by_chr: Dict[str, List[Gene]] = {} + for gene_dict in data['genes']: if gene_dict['strand'] in ['1', '+', 1]: gene_dict['strand'] = STRAND.POS @@ -131,7 +130,7 @@ def parse_annotations_json(data, reference_genome=None, best_transcripts_only=Fa has_best = False for transcript in gene_dict['transcripts']: - transcript['is_best_transcript'] = tab.cast_boolean(transcript['is_best_transcript']) + transcript['is_best_transcript'] = cast_boolean(transcript['is_best_transcript']) 
transcript.setdefault('exons', []) exons = [Exon(strand=gene.strand, **ex) for ex in transcript['exons']] if not exons: @@ -206,7 +205,7 @@ def parse_annotations_json(data, reference_genome=None, best_transcripts_only=Fa return genes_by_chr -def convert_tab_to_json(filepath, warn=DEVNULL): +def convert_tab_to_json(filepath: str, warn: Callable = DEVNULL) -> Dict: """ given a file in the std input format (see below) reads and return a list of genes (and sub-objects) @@ -236,17 +235,12 @@ def convert_tab_to_json(filepath, warn=DEVNULL): Returns: Dict[str,List[Gene]]: a dictionary keyed by chromosome name with values of list of genes on the chromosome - Example: - >>> ref = load_reference_genes('filename') - >>> ref['1'] - [Gene(), Gene(), ....] - Warning: does not load translations unless then start with 'M', end with '*' and have a length of multiple 3 """ def parse_exon_list(row): - if not row: + if pd.isnull(row): return [] exons = [] for temp in re.split('[; ]', row): @@ -258,7 +252,7 @@ def parse_exon_list(row): return exons def parse_domain_list(row): - if not row: + if pd.isnull(row): return [] domains = [] for domain in row.split(';'): @@ -272,38 +266,41 @@ def parse_domain_list(row): warn('error in domain:', domain, row, repr(err)) return domains - def nullable_int(row): - try: - row = int(row) - except ValueError: - row = tab.cast_null(row) - return row - - _, rows = tab.read_file( + df = pd.read_csv( filepath, - require=['ensembl_gene_id', 'chr', 'ensembl_transcript_id'], - add_default={ - 'cdna_coding_start': 'null', - 'cdna_coding_end': 'null', - 'AA_domain_ranges': '', - 'genomic_exon_ranges': '', - 'hugo_names': '', - 'transcript_genomic_start': 'null', - 'transcript_genomic_end': 'null', - 'best_ensembl_transcript_id': 'null', - }, - cast={ - 'genomic_exon_ranges': parse_exon_list, - 'AA_domain_ranges': parse_domain_list, - 'cdna_coding_end': nullable_int, - 'cdna_coding_start': nullable_int, - 'transcript_genomic_end': nullable_int, - 
'transcript_genomic_start': nullable_int, + dtype={ + 'ensembl_gene_id': str, + 'ensembl_transcript_id': str, + 'chr': str, + 'cdna_coding_start': pd.Int64Dtype(), + 'cdna_coding_end': pd.Int64Dtype(), + 'AA_domain_ranges': str, + 'genomic_exon_ranges': str, + 'hugo_names': str, + 'transcript_genomic_start': pd.Int64Dtype(), + 'transcript_genomic_end': pd.Int64Dtype(), + 'best_ensembl_transcript_id': str, 'gene_start': int, 'gene_end': int, }, + sep='\t', + comment='#', ) + + for col in ['ensembl_gene_id', 'chr', 'ensembl_transcript_id', 'gene_start', 'gene_end']: + if col not in df: + raise KeyError(f'missing required column: {col}') + + for col, parser in [ + ('genomic_exon_ranges', parse_exon_list), + ('AA_domain_ranges', parse_domain_list), + ]: + if col in df: + df[col] = df[col].apply(parser) + genes = {} + rows = df.where(df.notnull(), None).to_dict('records') + for row in rows: gene = { 'chr': row['chr'], @@ -311,23 +308,26 @@ def nullable_int(row): 'end': row['gene_end'], 'name': row['ensembl_gene_id'], 'strand': row['strand'], - 'aliases': row['hugo_names'].split(';') if row['hugo_names'] else [], + 'aliases': row['hugo_names'].split(';') if row.get('hugo_names') else [], 'transcripts': [], } if gene['name'] not in genes: genes[gene['name']] = gene else: gene = genes[gene['name']] - + is_best_transcript = ( + row.get('best_ensembl_transcript_id', row['ensembl_transcript_id']) + == row['ensembl_transcript_id'] + ) transcript = { - 'is_best_transcript': row['best_ensembl_transcript_id'] == row['ensembl_transcript_id'], + 'is_best_transcript': is_best_transcript, 'name': row['ensembl_transcript_id'], - 'exons': row['genomic_exon_ranges'], - 'domains': row['AA_domain_ranges'], - 'start': row['transcript_genomic_start'], - 'end': row['transcript_genomic_end'], - 'cdna_coding_start': row['cdna_coding_start'], - 'cdna_coding_end': row['cdna_coding_end'], + 'exons': row.get('genomic_exon_ranges', []), + 'domains': row.get('AA_domain_ranges', []), + 'start': 
row.get('transcript_genomic_start'), + 'end': row.get('transcript_genomic_end'), + 'cdna_coding_start': row.get('cdna_coding_start'), + 'cdna_coding_end': row.get('cdna_coding_end'), 'aliases': [], } gene['transcripts'].append(transcript) @@ -335,13 +335,13 @@ def nullable_int(row): return {'genes': genes.values()} -def load_reference_genome(*filepaths): +def load_reference_genome(*filepaths: str) -> Dict[str, SeqRecord]: """ Args: - filepaths (List[str]): the paths to the files containing the input fasta genomes + filepaths: the paths to the files containing the input fasta genomes Returns: - Dict[str,Bio.SeqRecord]: a dictionary representing the sequences in the fasta file + a dictionary representing the sequences in the fasta file """ reference_genome = {} for filename in filepaths: @@ -376,7 +376,7 @@ def load_reference_genome(*filepaths): return reference_genome -def load_templates(*filepaths): +def load_templates(*filepaths: str) -> Dict[str, Template]: """ primarily useful if template drawings are required and is not necessary otherwise assumes the input file is 0-indexed with [start,end) style. 
Columns are expected in @@ -395,26 +395,30 @@ def load_templates(*filepaths): chr1 0 2300000 p36.33 gneg chr1 2300000 5400000 p36.32 gpos25 - Args: - filename (str): the path to the file with the cytoband template information - Returns: - List[Template]: list of the templates loaded - + templates loaded """ header = ['name', 'start', 'end', 'band_name', 'giemsa_stain'] - templates = {} + templates: Dict[str, Template] = {} for filename in filepaths: - header, rows = tab.read_file( + df = pd.read_csv( filename, - header=header, - cast={'start': int, 'end': int}, - in_={'giemsa_stain': GIEMSA_STAIN.values()}, + sep='\t', + dtype={ + 'start': int, + 'end': int, + 'name': str, + 'band_name': str, + 'giemsa_stain': str, + }, + names=header, + comment='#', ) + df['giemsa_stain'].apply(lambda v: GIEMSA_STAIN.enforce(v)) - bands_by_template = {} - for row in rows: + bands_by_template: Dict[str, List[BioInterval]] = {} + for row in df.to_dict('records'): band = BioInterval(None, row['start'] + 1, row['end'], name=row['band_name'], data=row) bands_by_template.setdefault(row['name'], []).append(band) @@ -427,10 +431,10 @@ def load_templates(*filepaths): class ReferenceFile: + # store loaded file to avoid re-loading + CACHE = {} # type: ignore - CACHE = {} # store loaded file to avoid re-loading - - LOAD_FUNCTIONS = { + LOAD_FUNCTIONS: Dict[str, Optional[Callable]] = { 'annotations': load_annotations, 'reference_genome': load_reference_genome, 'masking': load_masking_regions, @@ -440,7 +444,14 @@ class ReferenceFile: } """dict: Mapping of file types (based on ENV name) to load functions""" - def __init__(self, file_type, *filepaths, eager_load=False, assert_exists=False, **opt): + def __init__( + self, + file_type: str, + *filepaths: str, + eager_load: bool = False, + assert_exists: bool = False, + **opt, + ): """ Args: *filepaths (str): list of paths to load diff --git a/src/mavis/annotate/main.py b/src/mavis/annotate/main.py index 6103ea44..f1a9456e 100644 --- 
a/src/mavis/annotate/main.py +++ b/src/mavis/annotate/main.py @@ -138,13 +138,11 @@ def main( # test that the sequence makes sense for a random transcript bpps = read_inputs( inputs, - in_={COLUMNS.protocol: PROTOCOL.values()}, add_default={ COLUMNS.protocol: config['libraries'][library]['protocol'], COLUMNS.library: library, COLUMNS.stranded: False, }, - require=[COLUMNS.protocol, COLUMNS.library], expand_strand=False, expand_orient=True, expand_svtype=True, diff --git a/src/mavis/annotate/splicing.py b/src/mavis/annotate/splicing.py index 910d8c0e..ae9d4ef4 100644 --- a/src/mavis/annotate/splicing.py +++ b/src/mavis/annotate/splicing.py @@ -1,9 +1,9 @@ import itertools -from .base import BioInterval -from .constants import ACCEPTOR_SEQ, DONOR_SEQ, SPLICE_SITE_RADIUS, SPLICE_SITE_TYPE, SPLICE_TYPE -from ..constants import reverse_complement, STRAND +from ..constants import SPLICE_TYPE, STRAND, reverse_complement from ..interval import Interval +from .base import BioInterval +from .constants import ACCEPTOR_SEQ, DONOR_SEQ, SPLICE_SITE_RADIUS, SPLICE_SITE_TYPE class SplicingPattern(list): diff --git a/src/mavis/bam/stats.py b/src/mavis/bam/stats.py index e7ba151e..e161227c 100644 --- a/src/mavis/bam/stats.py +++ b/src/mavis/bam/stats.py @@ -117,7 +117,7 @@ def compute_transcriptome_bam_stats( Args: bam_file_handle (BamCache): the input bam file handle - annotations (object): see :func:`mavis.annotate.load_reference_genes` + annotations (object): see :func:`mavis.annotate.load_annotations` sample_size (int): the number of genes to compute stats over log (Callable): outputs logging information min_mapping_quality (int): the minimum mapping quality for a read to be used diff --git a/src/mavis/blat.py b/src/mavis/blat.py index 12c9a7b6..c8cd325f 100644 --- a/src/mavis/blat.py +++ b/src/mavis/blat.py @@ -13,7 +13,7 @@ import math import re -import tab +import pandas as pd from .align import query_coverage_interval from .bam import cigar as _cigar @@ -24,11 +24,11 @@ 
DNA_ALPHABET, NA_MAPPING_QUALITY, PYSAM_READ_FLAGS, - reverse_complement, STRAND, + reverse_complement, ) -from .util import LOG from .interval import Interval +from .util import LOG class Blat: @@ -107,7 +107,7 @@ def percent_identity(row, is_protein=False, is_mrna=True): @staticmethod def read_pslx(filename, seqid_to_sequence_mapping, is_protein=False, verbose=True): - pslx_header = [ + header = [ 'match', 'mismatch', 'repmatch', @@ -139,10 +139,11 @@ def split_csv_trailing_seq(x): def split_csv_trailing_ints(x): return [int(s) for s in re.sub(',$', '', x).split(',')] - header, rows = tab.read_file( + df = pd.read_csv( filename, - header=pslx_header, - cast={ + sep='\t', + names=header, + dtype={ 'match': int, 'mismatch': int, 'repmatch': int, @@ -158,18 +159,23 @@ def split_csv_trailing_ints(x): 'tstart': int, 'tend': int, 'block_count': int, - 'tname': lambda x: re.sub('^chr', '', x), - 'block_sizes': split_csv_trailing_ints, - 'qstarts': split_csv_trailing_ints, - 'tstarts': split_csv_trailing_ints, - 'qseqs': split_csv_trailing_seq, - 'tseqs': split_csv_trailing_seq, + 'tname': str, + 'block_sizes': str, + 'qstarts': str, + 'tstarts': str, + 'qseqs': str, + 'tseqs': str, }, - validate={'strand': r'^[\+-]$'}, ) + for col in ['block_sizes', 'qstarts', 'tstarts']: + df[col] = df[col].apply(split_csv_trailing_ints) + for col in ['qseqs', 'tseqs']: + df[col] = df[col].apply(split_csv_trailing_seq) + df['strand'].apply(lambda x: STRAND.enforce(x)) + final_rows = [] - for row in rows: + for row in df.to_dict('records'): try: row['score'] = Blat.score(row, is_protein=is_protein) row['percent_ident'] = Blat.percent_identity(row, is_protein=is_protein) @@ -366,10 +372,7 @@ def process_blat_output( if is_protein: raise NotImplementedError('currently does not support aligning protein sequences') - try: - _, rows = Blat.read_pslx(aligner_output_file, query_id_mapping, is_protein=is_protein) - except tab.tab.EmptyFileError: - rows = [] + _, rows = 
Blat.read_pslx(aligner_output_file, query_id_mapping, is_protein=is_protein) # split the rows by query id rows_by_query = {} diff --git a/src/mavis/cluster/main.py b/src/mavis/cluster/main.py index 3ea459fd..ae07c755 100644 --- a/src/mavis/cluster/main.py +++ b/src/mavis/cluster/main.py @@ -76,7 +76,7 @@ def main( output: path to the output directory library: the library to look for in each of the input files masking (ReferenceFile): see :func:`mavis.annotate.file_io.load_masking_regions` - annotations (ReferenceFile): see :func:`mavis.annotate.file_io.load_reference_genes` + annotations (ReferenceFile): see :func:`mavis.annotate.file_io.load_annotations` """ masking = ReferenceFile.load_from_config(config, 'masking', eager_load=True) annotations = ReferenceFile.load_from_config(config, 'annotations') @@ -95,7 +95,7 @@ def main( # load the input files breakpoint_pairs = read_inputs( inputs, - cast={ + apply={ COLUMNS.tools: lambda x: set(x.split(';')) if x else set() diff --git a/src/mavis/config.py b/src/mavis/config.py index 6ff74392..480d3f72 100644 --- a/src/mavis/config.py +++ b/src/mavis/config.py @@ -4,15 +4,14 @@ from typing import Dict, Optional import snakemake -import tab from snakemake.exceptions import WorkflowError from snakemake.utils import validate as snakemake_validate from .annotate.file_io import ReferenceFile from .bam import stats from .bam.cache import BamCache -from .constants import PROTOCOL, SUBCOMMAND, float_fraction -from .util import bash_expands, filepath +from .constants import INTEGER_COLUMNS, PROTOCOL, SUBCOMMAND, float_fraction +from .util import bash_expands, cast_boolean, filepath def calculate_bam_stats(config: Dict, library_name: str) -> Dict: @@ -206,7 +205,7 @@ def get_metavar(arg_type): >>> get_metavar(bool) '{True,False}' """ - if arg_type in [bool, tab.cast_boolean]: + if arg_type in [bool, cast_boolean]: return '{True,False}' elif arg_type in [float_fraction, float]: return 'FLOAT' diff --git a/src/mavis/constants.py 
b/src/mavis/constants.py index da2a8d40..89e75e64 100644 --- a/src/mavis/constants.py +++ b/src/mavis/constants.py @@ -10,7 +10,6 @@ from Bio.Alphabet.IUPAC import ambiguous_dna from Bio.Data.IUPACData import ambiguous_dna_values from Bio.Seq import Seq -from tab import cast_boolean, cast_null PROGNAME: str = 'mavis' EXIT_OK: int = 0 @@ -123,6 +122,27 @@ def float_fraction(num): return num +class SPLICE_TYPE(MavisNamespace): + """ + holds controlled vocabulary for allowed splice type classification values + + Attributes: + RETAIN: an intron was retained + SKIP: an exon was skipped + NORMAL: no exons were skipped and no introns were retained. the normal/expected splicing pattern was followed + MULTI_RETAIN: multiple introns were retained + MULTI_SKIP: multiple exons were skipped + COMPLEX: some combination of exon skipping and intron retention + """ + + RETAIN: str = 'retained intron' + SKIP: str = 'skipped exon' + NORMAL: str = 'normal' + MULTI_RETAIN: str = 'retained multiple introns' + MULTI_SKIP: str = 'skipped multiple exons' + COMPLEX: str = 'complex' + + COMPLETE_STAMP: str = 'MAVIS.COMPLETE' """Filename for all complete stamp files""" @@ -515,14 +535,12 @@ class COLUMNS(MavisNamespace): call_method: str = 'call_method' break1_ewindow: str = 'break1_ewindow' break1_ewindow_count: str = 'break1_ewindow_count' - break1_ewindow_practical_coverage: str = 'break1_ewindow_practical_coverage' break1_homologous_seq: str = 'break1_homologous_seq' break1_split_read_names: str = 'break1_split_read_names' break1_split_reads: str = 'break1_split_reads' break1_split_reads_forced: str = 'break1_split_reads_forced' break2_ewindow: str = 'break2_ewindow' break2_ewindow_count: str = 'break2_ewindow_count' - break2_ewindow_practical_coverage: str = 'break2_ewindow_practical_coverage' break2_homologous_seq: str = 'break2_homologous_seq' break2_split_read_names: str = 'break2_split_read_names' break2_split_reads: str = 'break2_split_reads' @@ -576,3 +594,63 @@ def 
sort_columns(input_columns): temp = sorted([c for c in input_columns if c in order], key=lambda x: order[x]) temp = temp + sorted([c for c in input_columns if c not in order]) return temp + + +INTEGER_COLUMNS = { + COLUMNS.break1_position_end, + COLUMNS.break1_position_start, + COLUMNS.break2_position_end, + COLUMNS.break2_position_start, +} + +FLOAT_COLUMNS = { + COLUMNS.break1_ewindow_count, + COLUMNS.break1_split_reads_forced, + COLUMNS.break1_split_reads, + COLUMNS.break2_ewindow_count, + COLUMNS.break2_split_reads_forced, + COLUMNS.break2_split_reads, + COLUMNS.cluster_size, + COLUMNS.contig_alignment_query_consumption, + COLUMNS.contig_alignment_rank, + COLUMNS.contig_alignment_score, + COLUMNS.contig_break1_read_depth, + COLUMNS.contig_break2_read_depth, + COLUMNS.contig_build_score, + COLUMNS.contig_read_depth, + COLUMNS.contig_remap_score, + COLUMNS.contig_remapped_reads, + COLUMNS.contigs_assembled, + COLUMNS.flanking_pairs_compatible, + COLUMNS.flanking_pairs, + COLUMNS.linking_split_reads, + COLUMNS.raw_break1_half_mapped_reads, + COLUMNS.raw_break1_split_reads, + COLUMNS.raw_break2_half_mapped_reads, + COLUMNS.raw_break2_split_reads, + COLUMNS.raw_flanking_pairs, + COLUMNS.raw_spanning_reads, + COLUMNS.repeat_count, + COLUMNS.spanning_reads, +} + +BOOLEAN_COLUMNS = {COLUMNS.opposing_strands, COLUMNS.stranded, COLUMNS.supplementary_call} + +SUMMARY_LIST_COLUMNS = { + COLUMNS.annotation_figure, + COLUMNS.annotation_id, + COLUMNS.break1_split_reads, + COLUMNS.break2_split_reads, + COLUMNS.call_method, + COLUMNS.contig_alignment_score, + COLUMNS.contig_remapped_reads, + COLUMNS.contig_seq, + COLUMNS.event_type, + COLUMNS.flanking_pairs, + COLUMNS.pairing, + COLUMNS.product_id, + COLUMNS.spanning_reads, + COLUMNS.tools, + COLUMNS.tools, + COLUMNS.tracking_id, +} diff --git a/src/mavis/main.py b/src/mavis/main.py index 1d86fc17..5b69fff3 100644 --- a/src/mavis/main.py +++ b/src/mavis/main.py @@ -8,8 +8,6 @@ import time from typing import Dict -import tab - 
from . import __version__ from . import config as _config from . import util as _util @@ -91,10 +89,10 @@ def create_parser(argv): help='Indicates the input file type to be parsed', ) optional[SUBCOMMAND.CONVERT].add_argument( - '--strand_specific', type=tab.cast_boolean, default=False + '--strand_specific', type=_util.cast_boolean, default=False ) optional[SUBCOMMAND.CONVERT].add_argument( - '--assume_no_untemplated', type=tab.cast_boolean, default=True + '--assume_no_untemplated', type=_util.cast_boolean, default=True ) for command in [SUBCOMMAND.CONVERT, SUBCOMMAND.SETUP]: required[command].add_argument( diff --git a/src/mavis/overlay.py b/src/mavis/overlay.py index 868a90fa..9543ff84 100644 --- a/src/mavis/overlay.py +++ b/src/mavis/overlay.py @@ -1,8 +1,6 @@ import os from typing import Dict, List, Tuple, Union -import tab - from . import annotate as _annotate from . import util as _util from .annotate.file_io import ReferenceFile @@ -59,7 +57,7 @@ def check_overlay_args(args, parser): 'argument --read_depth_plots: ymax must be an integer: {}'.format(plot[ymax]) ) try: - plot[stranded] = tab.cast_boolean(plot[stranded]) + plot[stranded] = _util.cast_boolean(plot[stranded]) except TypeError: parser.error( 'argument --read_depth_plots: stranded must be an boolean: {}'.format( diff --git a/src/mavis/pairing/main.py b/src/mavis/pairing/main.py index 1e332002..850d0173 100644 --- a/src/mavis/pairing/main.py +++ b/src/mavis/pairing/main.py @@ -3,10 +3,11 @@ import time from typing import Dict, List, Set, Tuple -from ..annotate.constants import SPLICE_TYPE +import pandas as pd + from ..annotate.file_io import ReferenceFile from ..breakpoint import BreakpointPair -from ..constants import CALL_METHOD, COLUMNS, PROTOCOL, SVTYPE +from ..constants import CALL_METHOD, COLUMNS, PROTOCOL, SPLICE_TYPE, SVTYPE from ..util import LOG, generate_complete_stamp, output_tabbed_file, read_inputs from .pairing import inferred_equivalent, pair_by_distance, product_key @@ -36,23 +37,18 
@@ def main( bpps.extend( read_inputs( inputs, - require=[ + required_columns=[ COLUMNS.annotation_id, COLUMNS.library, COLUMNS.fusion_cdna_coding_start, COLUMNS.fusion_cdna_coding_end, COLUMNS.fusion_sequence_fasta_id, ], - in_={ - COLUMNS.protocol: PROTOCOL.values(), - COLUMNS.event_type: SVTYPE.values(), - COLUMNS.fusion_splicing_pattern: SPLICE_TYPE.values() + [None, 'None'], - }, - add_default={ - COLUMNS.fusion_cdna_coding_start: None, - COLUMNS.fusion_cdna_coding_end: None, - COLUMNS.fusion_sequence_fasta_id: None, - COLUMNS.fusion_splicing_pattern: None, + apply={ + COLUMNS.event_type: lambda x: SVTYPE.enforce(x), + COLUMNS.fusion_splicing_pattern: lambda x: SPLICE_TYPE.enforce(x) + if not pd.isnull(x) + else x, }, expand_strand=False, expand_orient=False, diff --git a/src/mavis/summary/main.py b/src/mavis/summary/main.py index c34cb5e8..94f79ddb 100644 --- a/src/mavis/summary/main.py +++ b/src/mavis/summary/main.py @@ -4,11 +4,11 @@ from functools import partial from typing import Dict, List, Tuple -import tab +import pandas as pd from ..annotate.file_io import ReferenceFile from ..breakpoint import BreakpointPair -from ..constants import CALL_METHOD, COLUMNS, PROTOCOL, SVTYPE +from ..constants import CALL_METHOD, COLUMNS, PROTOCOL, SPLICE_TYPE, SVTYPE from ..util import LOG, generate_complete_stamp, output_tabbed_file, read_inputs, soft_cast from .constants import HOMOPOLYMER_MIN_LENGTH from .summary import ( @@ -21,13 +21,6 @@ ) -def soft_cast_null(value): - try: - return tab.cast_null(value) - except TypeError: - return value - - def main(inputs: List[str], output: str, config: Dict, start_time=int(time.time())): annotations = ReferenceFile.load_from_config(config, 'annotations', eager_load=True) dgv_annotation = ReferenceFile.load_from_config(config, 'dgv_annotation') @@ -45,7 +38,7 @@ def main(inputs: List[str], output: str, config: Dict, start_time=int(time.time( bpps.extend( read_inputs( inputs, - require=[ + required_columns=[ COLUMNS.event_type, 
COLUMNS.product_id, COLUMNS.fusion_cdna_coding_end, @@ -69,50 +62,17 @@ def main(inputs: List[str], output: str, config: Dict, start_time=int(time.time( COLUMNS.disease_status, ], add_default={ - **{ - k: None - for k in [ - COLUMNS.contig_remapped_reads, - COLUMNS.contig_seq, - COLUMNS.break1_split_reads, - COLUMNS.break1_split_reads_forced, - COLUMNS.break2_split_reads, - COLUMNS.break2_split_reads_forced, - COLUMNS.linking_split_reads, - COLUMNS.flanking_pairs, - COLUMNS.contigs_assembled, - COLUMNS.contig_alignment_score, - COLUMNS.contig_remap_score, - COLUMNS.spanning_reads, - COLUMNS.annotation_figure, - COLUMNS.gene1_aliases, - COLUMNS.gene2_aliases, - COLUMNS.protein_synon, - COLUMNS.cdna_synon, - COLUMNS.net_size, - COLUMNS.tracking_id, - COLUMNS.assumed_untemplated, - 'dgv', - 'summary_pairing', - ] - }, COLUMNS.call_method: CALL_METHOD.INPUT, }, + apply={ + COLUMNS.event_type: lambda x: SVTYPE.enforce(x), + COLUMNS.fusion_splicing_pattern: lambda x: SPLICE_TYPE.enforce(x) + if not pd.isnull(x) + else x, + }, expand_strand=False, expand_orient=False, expand_svtype=False, - cast={ - COLUMNS.break1_split_reads: partial(soft_cast, cast_type=int), - COLUMNS.break2_split_reads: partial(soft_cast, cast_type=int), - COLUMNS.contig_remapped_reads: partial(soft_cast, cast_type=int), - COLUMNS.spanning_reads: partial(soft_cast, cast_type=int), - COLUMNS.break1_split_reads_forced: partial(soft_cast, cast_type=int), - COLUMNS.break2_split_reads_forced: partial(soft_cast, cast_type=int), - COLUMNS.flanking_pairs: partial(soft_cast, cast_type=int), - COLUMNS.linking_split_reads: partial(soft_cast, cast_type=int), - COLUMNS.protein_synon: soft_cast_null, - COLUMNS.cdna_synon: soft_cast_null, - }, ) ) # load all transcripts @@ -335,12 +295,12 @@ def main(inputs: List[str], output: str, config: Dict, start_time=int(time.time( row.data.setdefault(COLUMNS.library, lib) # filter pairing ids based on what is still kept? 
paired_libraries = set() - for product_id in row.pairing.split(';'): + for product_id in (row.pairing or '').split(';'): for lib in bpps_by_library: if product_id.startswith(lib): paired_libraries.add(lib) inferred_paired_libraries = set() - for product_id in row.inferred_pairing.split(';'): + for product_id in (row.inferred_pairing or '').split(';'): for lib in bpps_by_library: if product_id.startswith(lib): inferred_paired_libraries.add(lib) diff --git a/src/mavis/summary/summary.py b/src/mavis/summary/summary.py index 67ef7420..5287d7d8 100644 --- a/src/mavis/summary/summary.py +++ b/src/mavis/summary/summary.py @@ -297,6 +297,7 @@ def filter_by_evidence( linking_split_reads = bpp.column('linking_split_reads') if bpp.event_type == SVTYPE.INS: linking_split_reads += bpp.column('flanking_pairs') + if any( [ bpp.column('break1_split_reads') + bpp.column('break1_split_reads_forced') diff --git a/src/mavis/tools/__init__.py b/src/mavis/tools/__init__.py index 307f4af5..8dfc2db0 100644 --- a/src/mavis/tools/__init__.py +++ b/src/mavis/tools/__init__.py @@ -1,7 +1,7 @@ import itertools from typing import Callable, Dict, List -import tab +import pandas as pd from shortuuid import uuid from ..breakpoint import Breakpoint, BreakpointPair @@ -253,9 +253,9 @@ def _convert_tool_output( input_file, expand_orient=True, expand_svtype=True, add_default={'stranded': stranded} ) elif file_type == SUPPORTED_TOOL.CNVNATOR: - _, rows = tab.read_file( + df = pd.read_csv( input_file, - header=[ + names=[ 'event_type', 'coordinates', 'size', @@ -266,7 +266,20 @@ def _convert_tool_output( 'e-val4', 'q0', ], + dtype={ + 'event_type': str, + 'coordinates': str, + 'size': pd.Int64Dtype(), + 'normalized_RD': float, + 'e-val1': float, + 'e-val2': float, + 'e-val3': float, + 'e-val4': float, + 'q0': float, + }, + sep='\t', ) + rows = df.where(df.notnull(), None).to_dict('records') elif file_type in [ SUPPORTED_TOOL.DELLY, SUPPORTED_TOOL.MANTA, @@ -279,7 +292,9 @@ def _convert_tool_output( elif 
file_type == SUPPORTED_TOOL.BREAKDANCER: rows = _convert_breakdancer_file(input_file) else: - _, rows = tab.read_file(input_file) + df = pd.read_csv(input_file, sep='\t', dtype=str, comment=None) + df.columns = [c[1:] if c.startswith('#') else c for c in df.columns] + rows = df.where(df.notnull(), None).to_dict('records') if rows: log('found', len(rows), 'rows') for row in rows: diff --git a/src/mavis/tools/breakdancer.py b/src/mavis/tools/breakdancer.py index 2f27ee85..e1f8361e 100644 --- a/src/mavis/tools/breakdancer.py +++ b/src/mavis/tools/breakdancer.py @@ -1,24 +1,41 @@ import re - -import tab from argparse import Namespace +import pandas as pd + def convert_file(input_file): bam_to_lib = {} + + # read comments with open(input_file, 'r') as fh: # comments in breakdancer are marked with a single # so they need to be discarded before reading lines = fh.readlines() - header = 0 - while header < len(lines) and lines[header].startswith('#'): - metadata_match = re.match(r'^#(\S+)\t.*\tlibrary:(\S+)\t.*', lines[header]) + line_index = 0 + while line_index < len(lines) and lines[line_index].startswith('#'): + metadata_match = re.match(r'^#(\S+)\t.*\tlibrary:(\S+)\t.*', lines[line_index]) if metadata_match: bam_to_lib[metadata_match.group(1)] = metadata_match.group(2) - header += 1 - lines = lines[header - 1 :] - input_file = Namespace(readlines=lambda: lines) - header, rows = tab.read_file(input_file, allow_short=True, require=['num_Reads_lib']) - for row in rows: - for bam, lib in bam_to_lib.items(): - row['num_Reads_lib'] = row['num_Reads_lib'].replace(bam, lib) - return rows + line_index += 1 + header = [c.strip() for c in re.sub(r'^#', '', lines[line_index - 1]).split('\t')] + # read the main file + df = pd.read_csv( + input_file, + names=header, + sep='\t', + comment='#', + dtype={ + 'num_Reads_lib': str, + 'Pos1': int, + 'Pos2': int, + 'Chr1': str, + 'Chr2': str, + 'Type': str, + }, + ) + if 'num_Reads_lib' not in df: + raise KeyError(f'missing required column: 
num_Reads_lib') + + for bam, lib in bam_to_lib.items(): + df['num_Reads_lib'] = df['num_Reads_lib'].str.replace(bam, lib) + return df.to_dict('records') diff --git a/src/mavis/util.py b/src/mavis/util.py index 1b145d29..4a22984a 100644 --- a/src/mavis/util.py +++ b/src/mavis/util.py @@ -9,13 +9,25 @@ from datetime import datetime from functools import partial from glob import glob +from typing import Any, Callable, Dict, List, Optional, Set +import pandas as pd from braceexpand import braceexpand from shortuuid import uuid -from tab import tab from .breakpoint import Breakpoint, BreakpointPair -from .constants import COLUMNS, ORIENT, PROTOCOL, STRAND, SVTYPE, MavisNamespace, sort_columns +from .constants import ( + COLUMNS, + FLOAT_COLUMNS, + INTEGER_COLUMNS, + ORIENT, + PROTOCOL, + STRAND, + SUMMARY_LIST_COLUMNS, + SVTYPE, + MavisNamespace, + sort_columns, +) from .error import InvalidRearrangement from .interval import Interval @@ -84,6 +96,22 @@ def __call__(self, item): return self.callback_func(item) +def cast_null(input_value): + value = str(input_value).lower() + if value in ['none', 'null']: + return None + raise TypeError('casting to null/None failed', input_value) + + +def cast_boolean(input_value): + value = str(input_value).lower() + if value in ['t', 'true', '1', 'y', 'yes', '+']: + return True + elif value in ['f', 'false', '0', 'n', 'no', '-']: + return False + raise TypeError('casting to boolean failed', input_value) + + def cast(value, cast_func): """ cast a value to a given type @@ -93,7 +121,7 @@ def cast(value, cast_func): 1 """ if cast_func == bool: - value = tab.cast_boolean(value) + value = cast_boolean(value) else: value = cast_func(value) return value @@ -113,7 +141,7 @@ def soft_cast(value, cast_type): return cast(value, cast_type) except (TypeError, ValueError): pass - return tab.cast_null(value) + return cast_null(value) def get_env_variable(arg, default, cast_type=None): @@ -228,23 +256,21 @@ def filter_on_overlap(bpps, 
regions_by_reference_name): return passed, failed -def read_inputs(inputs, **kwargs): +def read_inputs(inputs, required_columns=[], **kwargs): bpps = [] - kwargs.setdefault('require', []) - kwargs['require'] = list(set(kwargs['require'] + [COLUMNS.protocol])) - kwargs.setdefault('in_', {}) - kwargs['in_'][COLUMNS.protocol] = PROTOCOL.values() + for finput in bash_expands(*inputs): - try: - LOG('loading:', finput) - bpps.extend(read_bpp_from_input_file(finput, **kwargs)) - except tab.EmptyFileError: - LOG('ignoring empty file:', finput) + LOG('loading:', finput) + bpps.extend( + read_bpp_from_input_file( + finput, required_columns=[COLUMNS.protocol, *required_columns], **kwargs + ) + ) LOG('loaded', len(bpps), 'breakpoint pairs') return bpps -def output_tabbed_file(bpps, filename, header=None): +def output_tabbed_file(bpps: List[BreakpointPair], filename: str, header=None): if header is None: custom_header = False header = set() @@ -258,12 +284,10 @@ def output_tabbed_file(bpps, filename, header=None): if not custom_header: header.update(row.keys()) header = sort_columns(header) - - with open(filename, 'w') as fh: - LOG('writing:', filename) - fh.write('#' + '\t'.join(header) + '\n') - for row in rows: - fh.write('\t'.join([str(row.get(c, None)) for c in header]) + '\n') + LOG('writing:', filename) + df = pd.DataFrame.from_records(rows, columns=header) + df = df.fillna('None') + df.to_csv(filename, columns=header, index=False, sep='\t') def write_bed_file(filename, bed_rows): @@ -351,7 +375,9 @@ def filter_uninformative(annotations_by_chr, breakpoint_pairs, max_proximity=500 return result, filtered -def unique_exists(pattern, allow_none=False, get_newest=False): +def unique_exists( + pattern: str, allow_none: bool = False, get_newest: bool = False +) -> Optional[str]: result = bash_expands(pattern) if len(result) == 1: return result[0] @@ -366,75 +392,135 @@ def unique_exists(pattern, allow_none=False, get_newest=False): def read_bpp_from_input_file( - filename, 
expand_orient=False, expand_strand=False, expand_svtype=False, **kwargs -): + filename: str, + expand_orient: bool = False, + expand_strand: bool = False, + expand_svtype: bool = False, + integer_columns: Set[str] = INTEGER_COLUMNS, + float_columns: Set[str] = FLOAT_COLUMNS, + required_columns: Set[str] = set(), + add_default: Dict[str, Any] = {}, + summary: bool = False, + apply: Dict[str, Callable] = {}, + overwrite: Dict[str, Any] = {}, +) -> List[BreakpointPair]: """ reads a file using the tab module. Each row is converted to a breakpoint pair and other column data is stored in the data attribute Args: - filename (str): path to the input file - expand_ns (bool): expand not specified orient/strand settings to all specific version - (for strand this is only applied if the bam itself is stranded) - explicit_strand (bool): used to stop unstranded breakpoint pairs from losing input strand information - Returns: - List[BreakpointPair]: a list of pairs - - Example: - >>> read_bpp_from_input_file('filename') - [BreakpointPair(), BreakpointPair(), ...] - - One can also validate other expected columns that will go in the data attribute using the usual arguments - to the tab.read_file function + filename: path to the input file + expand_ns: expand not specified orient/strand settings to all specific version (for strand this is only applied if the bam itself is stranded) + explicit_strand: used to stop unstranded breakpoint pairs from losing input strand information + summary: the input is post-summary so some float/int columns have been merged and delimited with semi-colons + overwrite: set column values for all breakpoints, if the column exists overwrite its current value - Example: - >>> read_bpp_from_input_file('filename', cast={'index': int}) - [BreakpointPair(), BreakpointPair(), ...] 
+ Returns: + a list of pairs """ def soft_null_cast(value): try: - tab.cast_null(value) + cast_null(value) except TypeError: return value - kwargs['require'] = set() if 'require' not in kwargs else set(kwargs['require']) - kwargs['require'].update({COLUMNS.break1_chromosome, COLUMNS.break2_chromosome}) - kwargs.setdefault('cast', {}).update( - { - COLUMNS.break1_position_start: int, - COLUMNS.break1_position_end: int, - COLUMNS.break2_position_start: int, - COLUMNS.break2_position_end: int, - COLUMNS.opposing_strands: lambda x: None if x == '?' else soft_cast(x, cast_type=bool), - COLUMNS.stranded: tab.cast_boolean, - COLUMNS.untemplated_seq: soft_null_cast, - COLUMNS.break1_chromosome: lambda x: re.sub('^chr', '', x), - COLUMNS.break2_chromosome: lambda x: re.sub('^chr', '', x), - COLUMNS.tracking_id: lambda x: x if x else str(uuid()), - } - ) - kwargs.setdefault('add_default', {}).update( - { - COLUMNS.untemplated_seq: None, - COLUMNS.break1_orientation: ORIENT.NS, - COLUMNS.break1_strand: STRAND.NS, - COLUMNS.break2_orientation: ORIENT.NS, - COLUMNS.break2_strand: STRAND.NS, - COLUMNS.opposing_strands: None, - COLUMNS.tracking_id: '', - } - ) - kwargs.setdefault('in_', {}).update( - { - COLUMNS.break1_orientation: ORIENT.values(), - COLUMNS.break1_strand: STRAND.values(), - COLUMNS.break2_orientation: ORIENT.values(), - COLUMNS.break2_strand: STRAND.values(), - } - ) - _, rows = tab.read_file(filename, suppress_index=True, **kwargs) - restricted = [ + if summary: + integer_columns = integer_columns - SUMMARY_LIST_COLUMNS + float_columns = float_columns - SUMMARY_LIST_COLUMNS + + try: + df = pd.read_csv( + filename, + dtype={ + **{col: pd.Int64Dtype() for col in integer_columns}, + **{col: float for col in float_columns}, + **{ + col: str + for col in COLUMNS.keys() + if col not in (float_columns | integer_columns) + }, + }, + sep='\t', + comment='#', + na_values=['None', 'none', 'N/A', 'n/a', 'null', 'NULL', 'Null', 'nan', '', 'NaN'], + ) + except 
pd.errors.EmptyDataError: + return [] + + for col in required_columns: + if col not in df: + raise KeyError(f'missing required column: {col}') + + # run the custom functions + for col, func in apply.items(): + df[col] = df[col].apply(func) + + if COLUMNS.opposing_strands in df: + df[COLUMNS.opposing_strands] = df[COLUMNS.opposing_strands].apply( + lambda x: None if x == '?' else soft_cast(x, cast_type=bool) + ) + else: + df[COLUMNS.opposing_strands] = None + + if COLUMNS.stranded in df: + df[COLUMNS.stranded] = df[COLUMNS.stranded].apply(cast_boolean) + else: + df[COLUMNS.stranded] = None + + if COLUMNS.untemplated_seq in df: + df[COLUMNS.untemplated_seq] = df[COLUMNS.untemplated_seq].apply(soft_null_cast) + else: + df[COLUMNS.untemplated_seq] = None + + for col in [COLUMNS.break1_chromosome, COLUMNS.break2_chromosome]: + df[col] = df[col].apply(lambda v: re.sub(r'^chr', '', v)) + + if COLUMNS.tracking_id not in df: + df[COLUMNS.tracking_id] = '' + else: + df[COLUMNS.tracking_id] = df[COLUMNS.tracking_id].fillna(str(uuid())) + + # add default values + for col, default_value in add_default.items(): + if col in df: + df[col] = df[col].fillna(default_value) + else: + df[col] = default_value + + # set overwriting defaults + for col, value in overwrite.items(): + df[col] = value + + # enforce controlled vocabulary + for vocab, cols in [ + (ORIENT, [COLUMNS.break1_orientation, COLUMNS.break2_orientation]), + (STRAND, [COLUMNS.break1_strand, COLUMNS.break2_strand]), + (PROTOCOL, [COLUMNS.protocol]), + ]: + for col in cols: + if col in df: + df[col].apply(lambda c: vocab.enforce(c)) + elif hasattr(vocab, 'NS'): + df[col] = vocab.NS # type: ignore + + def validate_pipeline_id(value): + if not re.match(r'^([A-Za-z0-9-]+|)(;[A-Za-z0-9-]+)*$', value): + raise AssertionError( + 'All mavis pipeline step ids must satisfy the regex:', + '^([A-Za-z0-9-]+|)(;[A-Za-z0-9-]+)*$', + value, + ) + + for col in [COLUMNS.cluster_id, COLUMNS.annotation_id, COLUMNS.validation_id]: + if col in 
df: + try: + df[col].apply(validate_pipeline_id) + except AssertionError as err: + raise AssertionError(f'error in column ({col}): {err}') + + rows = df.where(df.notnull(), None).to_dict('records') + non_data_columns = { COLUMNS.break1_chromosome, COLUMNS.break1_position_start, COLUMNS.break1_position_end, @@ -448,24 +534,17 @@ def soft_null_cast(value): COLUMNS.stranded, COLUMNS.opposing_strands, COLUMNS.untemplated_seq, - ] - pairs = [] + } + pairs: List[BreakpointPair] = [] + for line_index, row in enumerate(rows): row['line_no'] = line_index + 1 + if '_index' in row: del row['_index'] for attr, val in row.items(): row[attr] = soft_null_cast(val) - for attr in row: - if attr in [COLUMNS.cluster_id, COLUMNS.annotation_id, COLUMNS.validation_id]: - if not re.match('^([A-Za-z0-9-]+|)(;[A-Za-z0-9-]+)*$', row[attr]): - raise AssertionError( - 'error in column', - attr, - 'All mavis pipeline step ids must satisfy the regex:', - '^([A-Za-z0-9-]+|)(;[A-Za-z0-9-]+)*$', - row[attr], - ) + stranded = row[COLUMNS.stranded] strand1 = row[COLUMNS.break1_strand] if stranded else STRAND.NS @@ -474,7 +553,7 @@ def soft_null_cast(value): temp = [] expand_strand = stranded and expand_strand event_type = [None] - if row.get(COLUMNS.event_type, None) not in [None, 'None']: + if not pd.isnull(row.get(COLUMNS.event_type)): try: event_type = row[COLUMNS.event_type].split(';') for putative_event_type in event_type: @@ -509,7 +588,7 @@ def soft_null_cast(value): orient=orient2, ) - data = {k: v for k, v in row.items() if k not in restricted} + data = {k: v for k, v in row.items() if k not in non_data_columns} bpp = BreakpointPair( break1, break2, diff --git a/src/mavis/validate/main.py b/src/mavis/validate/main.py index 1136ff81..51d1e48f 100644 --- a/src/mavis/validate/main.py +++ b/src/mavis/validate/main.py @@ -48,7 +48,7 @@ def main( stdev_fragment_size (int): the standard deviation in fragment size read_length (int): read length reference_genome 
(mavis.annotate.file_io.ReferenceFile): see :func:`mavis.annotate.file_io.load_reference_genome` - annotations (mavis.annotate.file_io.ReferenceFile): see :func:`mavis.annotate.file_io.load_reference_genes` + annotations (mavis.annotate.file_io.ReferenceFile): see :func:`mavis.annotate.file_io.load_annotations` masking (mavis.annotate.file_io.ReferenceFile): see :func:`mavis.annotate.file_io.load_masking_regions` aligner_reference (mavis.annotate.file_io.ReferenceFile): path to the aligner reference file (e.g 2bit file for blat) """ @@ -86,11 +86,13 @@ def main( bpps = read_inputs( inputs, - add_default={COLUMNS.cluster_id: None, COLUMNS.stranded: False}, - add={COLUMNS.protocol: config['libraries'][library]['protocol'], COLUMNS.library: library}, + add_default={COLUMNS.cluster_id: str(uuid()), COLUMNS.stranded: False}, + overwrite={ + COLUMNS.protocol: config['libraries'][library]['protocol'], + COLUMNS.library: library, + }, expand_strand=False, expand_orient=True, - cast={COLUMNS.cluster_id: lambda x: str(uuid()) if not x else x}, ) evidence_clusters = [] for bpp in bpps: diff --git a/src/tab/__init__.py b/src/tab/__init__.py deleted file mode 100644 index fc7a7938..00000000 --- a/src/tab/__init__.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -# About - -The tab module is a python module used for easy transformation of tab files -It is fairly basic and does not support quoting text or escaping delimiters - -## Order of Operations - -There are a number of different transformations which can be applied to the rows. The order in which they are applied -is as follows - -1. add -2. add_default -3. require -4. validate -5. rename -6. split -7. combine -8. cast -9. in_ -10. drop -11. simplify - -### add - -this adds a new column with a default value. If the column name already exists in the input header. The existing column -is overwritten and replaced with the default value - -### add_default - -this adds a new column with a default value. 
If the column name already exists in the input header the existing value -is retained instead - -### require - -checks a list of column names to ensure they exist - -### validate - -checks a given column name against a regular expression - -### rename - -renames an input column to one or more new column names - -### split - -based on named capture groups in a regular expression, splits an existing column into one or more new columns - -### combine - -based on python template strings. Combines the values of 1 or more columns into a new column - -### cast - -applies any cast function to the column value - -### in_ - -check the column value for membership of a specified object - -### drop - -deletes columns with a given name - -### simplify - -drops any input (not new) column names not specified in the require option - - -# Use-Cases - -1. reading a tab file with no transformations - -``` ->>> header, rows = tab.read_file(filename, suppress_index=True) -``` - -2. reading a tab file and getting the line numbers - -``` ->>> header, rows = tab.read_file(filename) ->>> for row in rows: ->>> print('row number:', row['_index']) -'row number:' 1 -``` - -3. split a column with an expected pattern into multiple columns - -``` ->>> header, rows = tab.read_file(filename, split={'colname': r'^(?P\w+):(?P\d+)$'}) ->>> print(header) -['colname', 'chr', 'pos'] -``` - -4. drop specific unwanted columns - -``` ->>> header, rows = tab.read_file(filename, drop=['colname']) -``` - -5. drop all but specific columns - -``` ->>> header, rows = tab.read_file(filename, require=['colname'], simplify=True) -``` - -6. add a column with a default value - -``` ->>> header, rows = tab.read_file(filename, add={'colname': 'default value'}) -``` - -7. combine columns into a new column - -``` ->>> header, rows = tab.read_file(filename, combine={'new_colname': '{colname1}_{colname2}'}) -``` - -8. 
cast a column to a specific type - -``` ->>> header, rows = tab.read_file(filename, cast={'colname': int}) ->>> header, rows = tab.read_file(filename, cast={'colname': tab.cast_boolean}) -``` -""" -from .tab import FileTransform, cast_boolean, cast_null, read_file, VERBOSE diff --git a/src/tab/tab.py b/src/tab/tab.py deleted file mode 100755 index ce28b703..00000000 --- a/src/tab/tab.py +++ /dev/null @@ -1,401 +0,0 @@ -#!/usr/bin/env python3 -""" -Order of transform operations - -1. add -2. add_default -3. require -4. validate -5. rename -6. split -7. combine -8. cast -9. in_ -10. drop -11. simplify -""" - -from __future__ import division - -import re -import string -import warnings - - -VERBOSE = False # Output extra logging information (useful in debugging) - - -def cast_boolean(input_value): - value = str(input_value).lower() - if value in ['t', 'true', '1', 'y', 'yes', '+']: - return True - elif value in ['f', 'false', '0', 'n', 'no', '-']: - return False - raise TypeError('casting to boolean failed', input_value) - - -def cast_null(input_value): - value = str(input_value).lower() - if value in ['none', 'null']: - return None - raise TypeError('casting to null/None failed', input_value) - - -def null(input_value): - warnings.warn('null is deprecated in favor of cast_null', DeprecationWarning, stacklevel=2) - return cast_null(input_value) - - -class EmptyFileError(Exception): - pass - - -class FileTransform: - """ - Holds a set of operations which define the transform_line function. 
- Generally a single FileTransform object is required per file as lines are expected to have the same format - """ - - def __init__(self, header, **kwargs): - """ - Args: - header (List[str]): the header from the file as a list of column names (in-order) - require (List[str]): list of columns that must be in the input header - rename (Dict[str,List[str]]): mapping of old to new column(s) - drop (List[str]): list of columns in the old input header to drop - add_default (Dict[str]): mapping of new column names to default values (if the column does not exist already) - cast (Dict[str,func]): mapping of new/final columns to the type to cast them to - split (Dict[str,str]): - a dictionary mapping original column names to regex groups to create as the new column names - combine (Dict[str,str]): - a dictionary of the final column name to the format string. The field names in the format - string must correspond to existing column names - simplify (bool): drop all columns not created or retained - validate (Dict[str,str]): mapping of old columns to regex they must satisfy - - Returns: - FileTransform: an object with the validated rules for transforming lines in an input file - """ - self.input = header[:] - self.require = kwargs.pop('require', []) - self.rename = kwargs.pop('rename', {}) - self.drop = kwargs.pop('drop', []) - self.add = kwargs.pop('add', {}) - self.add_default = kwargs.pop('add_default', {}) - self.split = kwargs.pop('split', {}) - self.combine = kwargs.pop('combine', {}) - self.validate = kwargs.pop('validate', {}) - self.cast = kwargs.pop('cast', {}) - self.simplify = kwargs.pop('simplify', False) - self.in_ = kwargs.pop('in_', {}) - self.header = [] # holds the new header after the transform - - if kwargs: - raise TypeError('invalid argument(s)', list(kwargs.keys())) - - current_columns = set(header) - cant_simplify = set() # columns that are restricted against being dropped in simplify - - if VERBOSE: - print('input header:', header) - - # check that 
the header columns are unique - if len(set(header)) != len(header): - raise KeyError( - 'duplicate input col: column names in input header must be unique', header - ) - - for col in self.add: - current_columns.add(col) - cant_simplify.add(col) - # add_default: add_default new columns with default values if not already present - for col in self.add_default: - current_columns.add(col) - cant_simplify.add(col) - - # 1. require: check that the required columns exist in the input header - for col in self.require: - if col not in current_columns: - raise KeyError( - 'cannot require: column not found in the input header', col, current_columns - ) - cant_simplify.add(col) - - # 2. validate: check that the input column matches the expected pattern - for col, regex in self.validate.items(): - if col not in current_columns: - raise KeyError( - 'cannot validate: column not found in the input header', col, current_columns - ) - cant_simplify.add(col) - - # 4. rename: rename a column to one or more new column names - for col, new_names in self.rename.items(): - if col not in current_columns: - raise KeyError( - 'cannot rename column. column not found in header', col, current_columns - ) - for new_name in new_names: - if new_name in current_columns: - raise KeyError('duplicate column name', new_name, current_columns) - current_columns.add(new_name) - cant_simplify.add(new_name) - - # 5. split: split a column into a set of new columns - for col, regex in self.split.items(): - robj = re.compile(regex) - new_columns = robj.groupindex.keys() - if col not in current_columns: - raise KeyError( - 'cannot split column. column not found in header', col, current_columns - ) - for new_col in new_columns: - if new_col in current_columns: - raise KeyError('duplicate column name', new_col, current_columns) - current_columns.add(new_col) - cant_simplify.add(new_col) - - # 6. 
combine: - for ncol, format_string in self.combine.items(): - old_column_names = [t[1] for t in list(string.Formatter().parse(format_string))] - if ncol in current_columns: - raise KeyError('duplicate column name', ncol, current_columns) - current_columns.add(ncol) - cant_simplify.add(ncol) - for col in old_column_names: - if col not in current_columns: - raise KeyError( - 'cannot combine column. column not found in header', col, current_columns - ) - - # 7. cast: apply some callable - for col, func in self.cast.items(): - if col not in current_columns: - raise KeyError( - 'cannot cast column. column not found in header', col, current_columns - ) - if not callable(func): - raise TypeError('function applied to column must be callable', col, func) - cant_simplify.add(col) - - # 8. in_: check for satisfying some controlled vocab - for col, item in self.in_.items(): - if col not in current_columns: - raise KeyError( - 'cannot check membership column. column not found in header', - col, - current_columns, - ) - if None in item: - pass - cant_simplify.add(col) - - # 9. drop: drop any columns from the original input IF EXIST - for col in self.drop: - if col in self.require: - raise AssertionError('cannot both drop and retain a column', col) - current_columns.discard( - col - ) # 8. 
simplify: drop any columns that are not new, added, or retained - - if self.simplify: - for col in list(current_columns): - if col not in cant_simplify: - current_columns.discard(col) - - # retain the original input order except for new columns - order = {} - for col in current_columns: - order[col] = len(header) - for i, col in enumerate(header): - if col in current_columns: - order[col] = i - - self.header = [m for m, n in sorted(order.items(), key=lambda x: (x[1], x[0]))] - - if VERBOSE: - print('output header:', self.header) - - def transform_line(self, line, allow_short=False): - """ - transforms the input line into a hash of the new/final column names with the transform rules applied - - Args: - line (List[str]): list of values for a row with the same input header as the transform - Raises: - exception exceptions occur if validation, split or combine fails - - Returns: - Dict[str]: the hash representation of the new row - """ - if any( - [ - not allow_short and len(self.input) != len(line), - allow_short and len(self.input) < len(line), - ] - ): - raise AssertionError( - 'length of input list {0} does not match length of the expected header {1}: '.format( - len(line), len(self.input) - ) - + re.sub('\n', '\\n', '\\t'.join(line)), - self.input, - ) - - row = {} - cant_simplify = set() - - for i in range(0, len(self.input)): - row[self.input[i]] = line[i] if i < len(line) else None - - for col, default in self.add.items(): - row[col] = default - cant_simplify.add(col) - - # add_default: add new columns with default values if not already present - for col, default in self.add_default.items(): - row.setdefault(col, default) - cant_simplify.add(col) - - # 1. require: check that the required columns exist in the input header - cant_simplify.update(self.require) - - # 2. 
validate: check that the input column matches the expected pattern - for col, regex in self.validate.items(): - cant_simplify.add(col) - if not re.match(regex, row[col]): - raise UserWarning('validation failed', col, regex, row[col]) - - # 4. rename: rename a column to one or more new column names - for col, new_names in self.rename.items(): - for new_name in new_names: - row[new_name] = row[col] - cant_simplify.add(new_name) - - # 5. split: split a column into a set of new columns - for col, regex in self.split.items(): - robj = re.compile(regex) - new_columns = robj.groupindex.keys() - match = robj.match(row[col]) - if not match: - raise UserWarning('split of column failed', col, regex, row[col]) - for new_col in new_columns: - row[new_col] = match.group(new_col) - cant_simplify.add(new_col) - - # 6. combine: - for ncol, format_string in self.combine.items(): - old_column_names = [t[1] for t in list(string.Formatter().parse(format_string))] - cant_simplify.add(ncol) - substitutions = {} - for col in old_column_names: - substitutions[col] = row[col] - row[ncol] = format_string.format(**substitutions) - - # 7. cast: apply some callable - for col, func in self.cast.items(): - try: - row[col] = func(row[col]) - except Exception as err: - raise type(err)('error in casting column: {}. {}'.format(col, str(err))) - cant_simplify.add(col) - - # 8. in_: check for satisfying some controlled vocab - for col, item in self.in_.items(): - if row[col] not in item: - raise KeyError('failed in_ check', col, row[col], item) - cant_simplify.add(col) - - # 9. drop: drop any columns from the original input IF EXIST - for col in self.drop: - row.pop(col, None) - - # 10. 
simplify: drop any columns that are not new, added, or retained - if self.simplify: - for col in list(row): - if col not in cant_simplify: - row.pop(col, None) - - return row - - -def read_file( - inputfile, - delimiter='\t', - header=None, - strict=True, - suppress_index=False, - allow_short=False, - **kwargs -): - """ - Args: - inputfile (str): the path to the inputfile - header (List[str]): for non-headered files - delimiter (str): the delimiter (what to split on) - strict (bool): if false will ignore lines that fail transform - suppress_index (bool): do not create an index - Returns: - Tuple[List[str], Dict[str]]: header and the row dictionaries - """ - if VERBOSE: - print("read_file(", inputfile, ", ", kwargs, ")") - - new_header = None - is_file_handle = True if hasattr(inputfile, 'readlines') else False - index = '_index' - objects = [] - line_count = 0 - - fh = inputfile if is_file_handle else open(inputfile, 'r') - - # first grab the header and skip comments - lines = fh.readlines() - if not lines: - raise EmptyFileError('empty file has no lines to read') - current_line_index = 0 - line = re.sub(r'[\r\n]*$', '', lines[current_line_index]) - while current_line_index < len(lines): - if not re.match(r'^\s*##', lines[current_line_index]): # skip comment lines - break - current_line_index += 1 - - # first line is the header unless a header was input - if not header: - if current_line_index >= len(lines): - raise EmptyFileError('no lines beyond comments to read as header') - line = re.sub(r'(^#)|([\r\n\s]*$)', '', lines[current_line_index]) # clean the header - current_line_index += 1 - header = line.split(delimiter) if line else [] - if not header: - raise EmptyFileError('header is empty', inputfile) - # create the file transform object - transform = FileTransform(header, **kwargs) - new_header = transform.header - - if not suppress_index and index in new_header: - raise AttributeError( - 'column name {0} is reserved and cannot be used as an 
input'.format(repr(index)) - ) - - # now go through the lines in the file - while current_line_index < len(lines): - line_count += 1 - line = re.sub(r'[\r\n]*$', '', lines[current_line_index]) # clean the line - try: - row = line.split(delimiter) - row = transform.transform_line(row, allow_short=allow_short) - if not suppress_index: - row[index] = current_line_index - objects.append(row) - except Exception as error: # General b/c will be re-raised unless strict mode is off - if strict: - print('error at line', current_line_index) - raise type(error)('{0} happens at line {1}'.format(error, current_line_index)) - elif VERBOSE: - print('[ERROR]', str(error)) - current_line_index += 1 - - if not is_file_handle: - fh.close() - return (new_header, objects) diff --git a/src/tools/calculate_ref_alt_counts.py b/src/tools/calculate_ref_alt_counts.py index d873daba..cbb3be43 100644 --- a/src/tools/calculate_ref_alt_counts.py +++ b/src/tools/calculate_ref_alt_counts.py @@ -190,7 +190,7 @@ def calculate_all_counts(self, input_files, output_file): processed_bpps = {} filtered_events = [] - bpps = read_inputs(input_files, add_default={'stranded': False}) + bpps = read_inputs(input_files, add_default={'stranded': False}, summary=True) for bpp in bpps: # only use precise bpps that are within a certain event size diff --git a/tests/data/annotations_subsample.tab b/tests/data/annotations_subsample.tab index 00922483..8a56bb78 100644 --- a/tests/data/annotations_subsample.tab +++ b/tests/data/annotations_subsample.tab @@ -2,7 +2,7 @@ ## input file for picking best transcript: ens69_best_transcript.txt ## Ensembl Api version 69 ## generated at: Thu Aug 4 16:38:01 2016 -#ensembl_gene_id hugo_names chr strand gene_start gene_end best_ensembl_transcript_id ensembl_transcript_id refseq_equivalents transcript_genomic_start transcript_genomic_end cdna_coding_start cdna_coding_end genomic_exon_ranges AA_domain_ranges +ensembl_gene_id hugo_names chr strand gene_start gene_end 
best_ensembl_transcript_id ensembl_transcript_id refseq_equivalents transcript_genomic_start transcript_genomic_end cdna_coding_start cdna_coding_end genomic_exon_ranges AA_domain_ranges ENSG00000259662 15 1 63889592 63893885 ENST00000539570 ENST00000539570 NP_976307.2;NM_203373.2 63889592 63893885 1 744 63889592-63889944;63893495-63893885 SSF81383:9-49 ENSG00000258865 DIO3 14 1 102027834 102028748 ENST00000510508 ENST00000510508 NP_001353.4;NM_001362.3 102027834 102028748 1 915 102027834-102028748 PF00837:38-293;SSF52833:125-198 ENSG00000255738 GAGE4 X 1 49364778 49370618 ENST00000381700 ENST00000381700 NP_001035753.1;NM_001040663.2 49364778 49370618 1 354 49364778-49364861;49365327-49365447;49368271-49368396;49370596-49370618 PF05831:1-116 diff --git a/tests/data/clustering_input.tab b/tests/data/clustering_input.tab index 6b60397f..9123da3e 100644 --- a/tests/data/clustering_input.tab +++ b/tests/data/clustering_input.tab @@ -1,3 +1,3 @@ -#tracking_id event_type break1_chromosome break1_position_start break1_position_end break1_orientation break1_strand break1_seq break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand break2_seq opposing_strands stranded tools protocol +tracking_id event_type break1_chromosome break1_position_start break1_position_end break1_orientation break1_strand break1_seq break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand break2_seq opposing_strands stranded tools protocol manta-MantaDEL:175574:0:0:0:0:0 deletion 15 67333523 67333619 L ? None 15 67333581 67333581 R ? None False False manta genome strelka-TyeSomZhWTRakEu6ZJ7up6 deletion 15 67333623 67333623 L ? None 15 67333625 67333625 R ? 
None False False strelka genome diff --git a/tests/data/mavis_summary_all_mock-A36971_mock-A47933.tab b/tests/data/mavis_summary_all_mock-A36971_mock-A47933.tab index 81ad70b4..4ac60e71 100644 --- a/tests/data/mavis_summary_all_mock-A36971_mock-A47933.tab +++ b/tests/data/mavis_summary_all_mock-A36971_mock-A47933.tab @@ -1,4 +1,4 @@ -#tracking_id library annotation_id product_id event_type gene1 gene1_direction gene2 gene2_direction gene1_aliases gene2_aliases gene_product_type transcript1 transcript2 fusion_splicing_pattern fusion_cdna_coding_start fusion_cdna_coding_end fusion_mapped_domains fusion_protein_hgvs annotation_figure genes_encompassed break1_chromosome break1_position_start break1_position_end break1_orientation exon_last_5prime exon_first_3prime break1_strand break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand protocol tools call_method break1_homologous_seq break1_split_reads break2_homologous_seq break2_split_reads contig_alignment_score contig_remapped_reads contig_seq spanning_reads flanking_pairs linking_split_reads untemplated_seq cdna_synon protein_synon supplementary_call net_size assumed_untemplated dgv mock-A36971_diseased_genome mock-A47933_diseased_transcriptome +tracking_id library annotation_id product_id event_type gene1 gene1_direction gene2 gene2_direction gene1_aliases gene2_aliases gene_product_type transcript1 transcript2 fusion_splicing_pattern fusion_cdna_coding_start fusion_cdna_coding_end fusion_mapped_domains fusion_protein_hgvs annotation_figure genes_encompassed break1_chromosome break1_position_start break1_position_end break1_orientation exon_last_5prime exon_first_3prime break1_strand break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand protocol tools call_method break1_homologous_seq break1_split_reads break2_homologous_seq break2_split_reads contig_alignment_score contig_remapped_reads contig_seq spanning_reads flanking_pairs 
linking_split_reads untemplated_seq cdna_synon protein_synon supplementary_call net_size assumed_untemplated dgv mock-A36971_diseased_genome mock-A47933_diseased_transcriptome SeYJmwZMWGeS8ciCzwNJeb;eqGMKJx6w8M6QU7kood8N7 mock-A36971 NwT4iWqEPs27pjwfVpEw4j-v1-a1 mock-A36971_genome_NwT4iWqEPs27pjwfVpEw4j-v1-a1_normal_547_1098 inverted translocation ENSG00000128891 5 ENSG00000122565 3 C15orf57 CBX3 sense ENST00000358005 ENST00000337620 normal 547 1098 [{"name": "PR00504", "sequences": ["PEEFVVEKV", "VVNGKVEYFLKWKGF", "TDADNTWEPEENL"], "regions": [{"start": 27, "end": 35}, {"start": 40, "end": 54}, {"start": 55, "end": 67}], "mapping_quality": 100.0, "matches": 37}, {"name": "PF01393", "sequences": ["RGLDPERIIGATDSSGELMFLMKWKDSDEADLVLAKEANMKCPQIVIAFYEERLTWHS"], "regions": [{"start": 119, "end": 176}], "mapping_quality": 100.0, "matches": 58}, {"name": "SSF54160", "sequences": ["QNGKSKKVEEAEPEEFVVEKVLDRRVVNGKVEYFLKWKGFTDADNTWEPEENLDCPELIEAFLNS", "KKRDAADKPRGFARGLDPERIIGATDSSGELMFLMKWKDSDEADLVLAKEANMKCPQIVIAFYEERLTWH"], "regions": [{"start": 15, "end": 79}, {"start": 106, "end": 175}], "mapping_quality": 100.0, "matches": 135}, {"name": "PF00385", "sequences": ["FVVEKVLDRRVVNGKVEYFLKWKGFTDADNTWEPEENLDCPELIEAFLN"], "regions": [{"start": 30, "end": 78}], "mapping_quality": 100.0, "matches": 49}, {"name": "SM00298", "sequences": ["EFVVEKVLDRRVVNGKVEYFLKWKGFTDADNTWEPEENLDCPELIEAFLNSQK", "GLDPERIIGATDSSGELMFLMKWKDSDEADLVLAKEANMKCPQIVIAFYEERL"], "regions": [{"start": 29, "end": 81}, {"start": 120, "end": 172}], "mapping_quality": 100.0, "matches": 106}, {"name": "SM00300", "sequences": ["RGFARGLDPERIIGATDSSGELMFLMKWKDSDEADLVLAKEANMKCPQIVIAFYEERLTWHSC"], "regions": [{"start": 115, "end": 177}], "mapping_quality": 100.0, "matches": 63}, {"name": "PS50013", "sequences": ["FVVEKVLDRRVVNGKVEYFLKWKGFTDADNTWEPEENLDCPELIEAFLNSQKAGKEKDG", "LDPERIIGATDSSGELMFLMKWKDSDEADLVLAKEANMKCPQIVIAFYEERLTWHSCPE"], "regions": [{"start": 30, "end": 88}, {"start": 121, "end": 179}], 
"mapping_quality": 100.0, "matches": 118}] None /var/tmp/tmp3cvjw9j4/mock-A36971_diseased_genome/annotate/batch-oyiw4PkCc96hn7kVpVWxEX-1/drawings/mavis_NwT4iWqEPs27pjwfVpEw4j-v1-a1-chrgene1_chrgene5-b-C15orf57_b-CBX3.svg gene1 33299 33299 R 2 2 ? gene5 584 584 R ? genome convert_ta.py_v0.0.1 contig 8 9 0.9954107388710418 9 GCTATTATTCACCGCCTCCGAGCTGCTCCGGGTCGCGGGTCTGCAGCGTCTCCGGCCCTCCGCGCCTACAGCTCAAGCCACATCCGAAGTCAGGAAATATTTTTAAAATAAAATGGCTAACAAGAGGCAGAATGAATCTTATGTCAATATGCTCCCATTCTCAACAATCAATCTATTTATGTAAGTTTTTCAAACTCCAGCATCAG 0 9 10 None ENST00000337620 False 0-0 False None Not Applicable not expressed DTquBbWJJLsogM4dfvDqP7;erEiWboaSGxXgQ29fuZWz8 mock-A36971 5BBNKTYvzaZHNjq8CaTcAH-v1-a1 mock-A36971_genome_5BBNKTYvzaZHNjq8CaTcAH-v1-a1_None_None_None deletion None None None None None None None reference11:6001_6001+ reference11:6005_6005+ None None None None None /var/tmp/tmp3cvjw9j4/mock-A36971_diseased_genome/annotate/batch-oyiw4PkCc96hn7kVpVWxEX-14/drawings/mavis_5BBNKTYvzaZHNjq8CaTcAH-v1-a1-chrreference11_chrreference11-NA_NA.svg reference11 6001 6001 L None None ? reference11 6005 6005 R ? genome convert_ta.py_v0.0.1 contig AT 1 0 0.9951667472208796 9 CACGCCCTGCTAGGAGTTCACGCTTTAGTTGGGGAAAATATACAATAAGCAAGCCAGTTTTTAAAATGAGAACTGCAATTAGAGTTAAATGCTACAAAGACAAACTCACAGGAAGATGGGATGTAGAATAAGGCTCTCAGAATAGTAAGAGAAACTATTGCTTCTTACGATGTTTGTCTTTCTTTGTAT 16 2 0 None None False -3--3 False None Not Applicable not expressed AeY2qdWDTvBkEFgXa92ave;D9XznZsQErYgPTEMYeqMjK mock-A36971 tM2bMsSPiz47LLNw7ED7R6-v1-a1 mock-A36971_genome_tM2bMsSPiz47LLNw7ED7R6-v1-a1_None_None_None deletion None None None None None None None reference11:10026_10026+ reference11:10067_10067+ None None None None None /var/tmp/tmp3cvjw9j4/mock-A36971_diseased_genome/annotate/batch-oyiw4PkCc96hn7kVpVWxEX-1/drawings/mavis_tM2bMsSPiz47LLNw7ED7R6-v1-a1-chrreference11_chrreference11-NA_NA.svg reference11 10026 10026 L None None ? reference11 10067 10067 R ? 
genome convert_ta.py_v0.0.1 contig C 0 0 0.9819121447028424 51 GGGGGGGGGGTCCGGGGGGGGCTGTGGTTCTTATGGCTGCTCCCAGTCCAGCTGCTGCAAGCCCTGCTGCTGCTCCTCAGGCTGTGGGTCATCCTGCTGCCAGTCCAGCTGCTGTAAGCCCTACTGCTGTCAGTCCAGCTGCTGTAAGCCCTGTAGCTGCTTCTCAGGCTGTGGATCATCCTGCTGCCAATCCAGCTGCTACAAGCCCTGCTGCTGCCAGTCCAGC 0 3 0 CTACTGCTGT None None False -30--30 False None Not Applicable not expressed diff --git a/tests/data/mock_masking.tab b/tests/data/mock_masking.tab index 1d2748ee..dc68745f 100644 --- a/tests/data/mock_masking.tab +++ b/tests/data/mock_masking.tab @@ -1 +1 @@ -#chr start end name +chr start end name diff --git a/tests/data/mock_pairing_input.tab b/tests/data/mock_pairing_input.tab index 8e18ac1e..cdddb859 100644 --- a/tests/data/mock_pairing_input.tab +++ b/tests/data/mock_pairing_input.tab @@ -1,4 +1,4 @@ -#library cluster_id validation_id annotation_id event_type transcript1 transcript2 fusion_cdna_coding_start fusion_cdna_coding_end fusion_sequence_fasta_id fusion_sequence_fasta_file break1_chromosome break1_position_start break1_position_end break1_orientation break1_strand break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand opposing_strands stranded protocol break1_call_method break2_call_method untemplated_seq fusion_splicing_pattern +library cluster_id validation_id annotation_id event_type transcript1 transcript2 fusion_cdna_coding_start fusion_cdna_coding_end fusion_sequence_fasta_id fusion_sequence_fasta_file break1_chromosome break1_position_start break1_position_end break1_orientation break1_strand break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand opposing_strands stranded protocol break1_call_method break2_call_method untemplated_seq fusion_splicing_pattern genome1 1 1 1 deletion ENST00000367080 ENST00000367080 None None None None gene3 10008 10008 L + gene3 18900 18900 R + False True genome split reads split reads None None genome2 1 1 1 deletion ENST00000367080 ENST00000367080 
None None None None gene3 10000 10000 L + gene3 18900 18900 R + False True genome split reads split reads None None transcriptome1 1 1 1 deletion ENST00000367080 ENST00000367080 None None None None gene3 5347 5347 L + gene3 19969 19969 R + False True transcriptome split reads split reads None None diff --git a/tests/data/mock_reference_annotations.full.tsv b/tests/data/mock_reference_annotations.full.tsv index d86736d4..7ead95e3 100644 --- a/tests/data/mock_reference_annotations.full.tsv +++ b/tests/data/mock_reference_annotations.full.tsv @@ -1,4 +1,4 @@ -#ensembl_gene_id hugo_names chr strand gene_start gene_end best_ensembl_transcript_id ensembl_transcript_id refseq_equivalents transcript_genomic_start transcript_genomic_end cdna_coding_start cdna_coding_end genomic_exon_ranges AA_domain_ranges +ensembl_gene_id hugo_names chr strand gene_start gene_end best_ensembl_transcript_id ensembl_transcript_id refseq_equivalents transcript_genomic_start transcript_genomic_end cdna_coding_start cdna_coding_end genomic_exon_ranges AA_domain_ranges ENSG00000186354 C9orf47 fakereference9 1 1 5278 ENST00000375851 ENST00000375851 NP_001135885.1;NM_001142413.1 1 5278 134 685 1-322;608-833;990-5278 ENSG00000186354 C9orf47 fakereference9 1 1 5278 ENST00000375851 ENST00000375850 59 1202 76 783 59-322;608-1202 ENSG00000186354 C9orf47 fakereference9 1 1 5278 ENST00000375851 ENST00000334490 NP_001001938.1;NM_001001938.3 66 5278 69 677 66-379;608-833;990-5278 diff --git a/tests/data/mock_reference_annotations.tsv b/tests/data/mock_reference_annotations.tsv index 10874ea2..14391a69 100644 --- a/tests/data/mock_reference_annotations.tsv +++ b/tests/data/mock_reference_annotations.tsv @@ -1,4 +1,4 @@ -#ensembl_gene_id chr strand gene_start gene_end ensembl_transcript_id transcript_genomic_start transcript_genomic_end +ensembl_gene_id chr strand gene_start gene_end ensembl_transcript_id transcript_genomic_start transcript_genomic_end GENE-A fake + 100 200 TRANSCRIPT-A 100 200 GENE-B fake - 
250 350 TRANSCRIPT-B 250 350 GENE-C fake + 300 400 TRANSCRIPT-C 300 400 diff --git a/tests/data/mock_sv_events.tsv b/tests/data/mock_sv_events.tsv index bd997b86..baba0191 100644 --- a/tests/data/mock_sv_events.tsv +++ b/tests/data/mock_sv_events.tsv @@ -1,5 +1,5 @@ ## False reference9 2000 2000 reference9 2001 2001 L R + + insertion genome convert_ta.py_v0.0.1 mock-A36971 9:66466004 -#stranded break1_chromosome break1_position_start break1_position_end break2_chromosome break2_position_start break2_position_end break1_orientation break2_orientation break1_strand break2_strand event_type protocol tools library comment +stranded break1_chromosome break1_position_start break1_position_end break2_chromosome break2_position_start break2_position_end break1_orientation break2_orientation break1_strand break2_strand event_type protocol tools library comment False reference7 5000 5000 reference7 11000 11000 R L - - duplication genome convert_ta.py_v0.0.1 mock-A36971 7:104485067|7:104612302 False reference20 2000 2000 reference20 6000 6000 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 20:13160730|20:13164100 False reference10 520 520 reference19 964 964 R L + + translocation genome convert_ta.py_v0.0.1 mock-A36971 10:7059511|19:17396811 diff --git a/tests/data/mock_trans_sv_events.tsv b/tests/data/mock_trans_sv_events.tsv index 25087d31..6f1d64a7 100644 --- a/tests/data/mock_trans_sv_events.tsv +++ b/tests/data/mock_trans_sv_events.tsv @@ -1,5 +1,5 @@ ## False reference9 2000 2000 reference9 2001 2001 L R + + insertion genome convert_ta.py_v0.0.1 mock-A36971 9:66466004 -#stranded break1_chromosome break1_position_start break1_position_end break2_chromosome break2_position_start break2_position_end break1_orientation break2_orientation break1_strand break2_strand event_type protocol tools library comment +stranded break1_chromosome break1_position_start break1_position_end break2_chromosome break2_position_start break2_position_end break1_orientation 
break2_orientation break1_strand break2_strand event_type protocol tools library comment False gene3 27175 27175 gene3 27176 27176 R L + + duplication transcriptome convert_ta.py_v0.0.1 mock-A47933 1:207249992 True gene1 34090 34090 gene5 608 608 R R - + inverted translocation transcriptome convert_ta.py_v0.0.1 mock-A47933 15:40854971|7:26241389 False gene2 22979 22979 gene2 23783 23783 R L + + duplication transcriptome convert_ta.py_v0.0.1 mock-A47933 15:41623873|15:41625248#this one is pretty low qual diff --git a/tests/data/pairing_annotations.tab b/tests/data/pairing_annotations.tab index f661e501..a208d593 100644 --- a/tests/data/pairing_annotations.tab +++ b/tests/data/pairing_annotations.tab @@ -1,4 +1,4 @@ -#library cluster_id cluster_size validation_id annotation_id event_type gene1 gene1_direction gene2 gene2_direction gene_product_type transcript1 transcript2 fusion_splicing_pattern fusion_cdna_coding_start fusion_cdna_coding_end fusion_mapped_domains fusion_sequence_fasta_id fusion_sequence_fasta_file annotation_figure annotation_figure_legend genes_encompassed genes_overlapping_break1 genes_overlapping_break2 genes_proximal_to_break1 genes_proximal_to_break2 break1_chromosome break1_position_start break1_position_end break1_orientation break1_strand break1_seq break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand break2_seq opposing_strands stranded protocol tools call_method break1_ewindow break1_ewindow_count break1_homologous_seq break1_split_read_names break1_split_reads break1_split_reads_forced break2_ewindow break2_ewindow_count break2_homologous_seq break2_split_read_names break2_split_reads break2_split_reads_forced contig_alignment_score contig_alignment_query_coverage contig_build_score contig_remap_score contig_remapped_read_names contig_remapped_reads contig_seq contig_strand_specific contigs_aligned contigs_assembled spanning_reads spanning_read_names flanking_median_fragment_size flanking_pairs 
flanking_pairs_read_names flanking_stdev_fragment_size linking_split_read_names linking_split_reads raw_break1_half_mapped_reads raw_break1_split_reads raw_break2_half_mapped_reads raw_break2_split_reads raw_flanking_pairs raw_spanning_reads untemplated_seq +library cluster_id cluster_size validation_id annotation_id event_type gene1 gene1_direction gene2 gene2_direction gene_product_type transcript1 transcript2 fusion_splicing_pattern fusion_cdna_coding_start fusion_cdna_coding_end fusion_mapped_domains fusion_sequence_fasta_id fusion_sequence_fasta_file annotation_figure annotation_figure_legend genes_encompassed genes_overlapping_break1 genes_overlapping_break2 genes_proximal_to_break1 genes_proximal_to_break2 break1_chromosome break1_position_start break1_position_end break1_orientation break1_strand break1_seq break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand break2_seq opposing_strands stranded protocol tools call_method break1_ewindow break1_ewindow_count break1_homologous_seq break1_split_read_names break1_split_reads break1_split_reads_forced break2_ewindow break2_ewindow_count break2_homologous_seq break2_split_read_names break2_split_reads break2_split_reads_forced contig_alignment_score contig_alignment_query_coverage contig_build_score contig_remap_score contig_remapped_read_names contig_remapped_reads contig_seq contig_strand_specific contigs_aligned contigs_assembled spanning_reads spanning_read_names flanking_median_fragment_size flanking_pairs flanking_pairs_read_names flanking_stdev_fragment_size linking_split_read_names linking_split_reads raw_break1_half_mapped_reads raw_break1_split_reads raw_break2_half_mapped_reads raw_break2_split_reads raw_flanking_pairs raw_spanning_reads untemplated_seq A36971 cluster-batch20170407r590869-449 1 validation-batch20170407r289453-199 annotation-batch20170410r499868-2221 inverted translocation ENSG00000182463 5 ENSG00000146282 3 sense ENST00000371497 ENST00000369536 
normal 504 884 [] annotation-batch20170410r499868-2221_normal /projects/trans_scratch/validations/workspace/creisle/MAV89/output/A36971_genome/annotation/annotations.fusion-cdna.fa /projects/trans_scratch/validations/workspace/creisle/MAV89/output/A36971_genome/annotation/drawings/mavis_20_6.b-TSHZ2_b-RARS2.annotation-batch20170410r499868-2221.svg /projects/trans_scratch/validations/workspace/creisle/MAV89/output/A36971_genome/annotation/drawings/mavis_20_6.b-TSHZ2_b-RARS2.annotation-batch20170410r499868-2221.legend.json 20 52021562 52021562 L + None 6 88232570 88232570 L - None True False genome DELLY_v0.6.1;convert_ta.py_v0.0.1 contig 52020680-52021832 1051 T 0 0 88216583-88217735 738 0 0 0 seq31 1025 39 HISEQX1_11:3:1109:17107:4684;HISEQX1_11:3:1115:28057:65529;HISEQX1_11:3:1118:19827:12068;HISEQX1_11:3:1118:26808:54366;HISEQX1_11:3:1118:26880:54242;HISEQX1_11:3:1203:24657:33551;HISEQX1_11:3:1211:14915:49162;HISEQX1_11:3:1224:29985:17500;HISEQX1_11:3:2109:10429:26097;HISEQX1_11:3:2110:30969:47193;HISEQX1_11:3:2119:30005:71682;HISEQX1_11:3:2119:31010:72086;HISEQX1_11:3:2123:23399:43185;HISEQX1_11:3:2203:6928:43062;HISEQX1_11:3:2206:18852:13826;HISEQX1_11:3:2213:28696:54330;HISEQX1_11:3:2213:9699:72121;HISEQX1_11:3:2216:11576:10890;HISEQX1_11:3:2223:4888:12420;HISEQX1_11:4:1116:7669:4192;HISEQX1_11:4:1205:5355:29332;HISEQX1_11:4:1216:24342:59727;HISEQX1_11:4:1223:29873:24409;HISEQX1_11:4:2102:9628:27415;HISEQX1_11:4:2115:26230:35397;HISEQX1_11:4:2202:27072:18116;HISEQX1_11:4:2206:4391:19223;HISEQX1_11:4:2207:24424:30509;HISEQX1_11:4:2211:20709:31283;HISEQX1_11:4:2211:21004:20612;HISEQX1_11:4:2213:18355:30439;HISEQX1_11:4:2213:19136:30070;HISEQX1_11:4:2217:11647:32531;HISEQX1_11:4:2219:26707:61398;HISEQX1_11:4:2219:28118:62083 43 
CCCAACTGGATAATAAATTATAACAATTCTATTATCTGACTGCTTCTGTTCTTCCACGCACTCTTCGACATCCAATTTAAAACTTAAAGTTGGCCGGGCATGGCAGTTCATCCCTGTAATCTAGCATTTTGGGAGGCCGATGTGGGTGGATCACCTGAGGCCAGAAGTTCGAAACCAGCCTGGCCACCAGGGCGAAAACCTGTCTCTACAAAAATACAAAAATTAGCCGTATATGTGCATTTTTCTGGAGTTGAAGGTCCATAGATTTTTTCAGATACTTCAAAGGAGTACATGATACCCCTCCCCAACAAAAGTCCCCTATCTCTGGATTTATGCTTAAAATGAATGCATATTTTACAAAGCCA False 1 1 0 0 16 HISEQX1_11:3:1109:17107:4684;HISEQX1_11:3:1118:26808:54366;HISEQX1_11:3:1118:26880:54242;HISEQX1_11:3:1203:24657:33551;HISEQX1_11:3:1211:14915:49162;HISEQX1_11:3:1224:29985:17500;HISEQX1_11:3:2119:30005:71682;HISEQX1_11:3:2119:31010:72086;HISEQX1_11:3:2123:23399:43185;HISEQX1_11:3:2213:9699:72121;HISEQX1_11:3:2216:11576:10890;HISEQX1_11:4:2102:9628:27415;HISEQX1_11:4:2206:4391:19223;HISEQX1_11:4:2211:20709:31283;HISEQX1_11:4:2211:21004:20612;HISEQX1_11:4:2217:11647:32531 0 0 1 27 5 22 17 0 A36971 cluster-batch20170407r590869-449 1 validation-batch20170407r289453-199 annotation-batch20170410r499868-2221 inverted translocation ENSG00000182463 5 ENSG00000146282 3 sense ENST00000371497 ENST00000369536 normal 888 3992 "[{""name"": ""PS50157"", ""sequences"": [""LKCMFCGDSFDSLQDLSVHMIKTKHYQKVP"", ""FYCSDCASQFRTPSTYISHLESHLGFQM"", ""FKCKLCCRTFVSKHAVKLHLSKTHSKSPE""], ""regions"": [{""start"": 275, ""end"": 304}, {""start"": 926, ""end"": 953}, {""start"": 994, ""end"": 1022}], ""mapping_quality"": 100.0, ""matches"": 87}, {""name"": ""SM00389"", ""sequences"": [""KRKGRQSNWNPQHLLILQAQFASSLFQTSEGKYLLSDLGPQERMQISKFTGLSMTTISHWLANVKYQLRKTGGTK""], ""regions"": [{""start"": 840, ""end"": 914}], ""mapping_quality"": 100.0, ""matches"": 75}, {""name"": ""SSF46689"", ""sequences"": [""VRRFEDVSSEVSTLHKRKGRQSNWNPQHLLILQAQFASSLFQTSEGKYLLSDLGPQERMQISKFTGLSMTTISHWLANVKYQLRK""], ""regions"": [{""start"": 825, ""end"": 909}], ""mapping_quality"": 100.0, ""matches"": 85}, {""name"": ""SSF57667"", ""sequences"": [""TVFTGASRFRCRQCSAAYDTLVELTVHMNETGHYQDD"", 
""KVLKCMFCGDSFDSLQDLSVHMIKTKHYQKVPLKEPVPTISSKMVTPAKKRVFDVNRPCSPDSTTGSFADSFSSQKNANLQLSSNNRYGYQNGASYTWQFEACKSQILKCMECGSSHDTLQQLTTHMM"", ""PIFYCSDCASQFRTPSTYISHLESHLGFQMKDMTRLSVDQQSKVEQEISRVSSAQRSPETIAAEEDTDSKFKCKLCCRTFVSKHAVKLHLSKTH""], ""regions"": [{""start"": 207, ""end"": 243}, {""start"": 273, ""end"": 400}, {""start"": 924, ""end"": 1017}], ""mapping_quality"": 100.0, ""matches"": 259}, {""name"": ""SM00355"", ""sequences"": [""FRCRQCSAAYDTLVELTVHMNETGH"", ""LKCMFCGDSFDSLQDLSVHMIKTKH"", ""LKCMECGSSHDTLQQLTTHMMVTGH"", ""FYCSDCASQFRTPSTYISHLESH"", ""FKCKLCCRTFVSKHAVKLHLSKTH""], ""regions"": [{""start"": 215, ""end"": 239}, {""start"": 275, ""end"": 299}, {""start"": 380, ""end"": 404}, {""start"": 926, ""end"": 948}, {""start"": 994, ""end"": 1017}], ""mapping_quality"": 100.0, ""matches"": 122}]" annotation-batch20170410r499868-2221_normal /projects/trans_scratch/validations/workspace/creisle/MAV89/output/A36971_genome/annotation/annotations.fusion-cdna.fa /projects/trans_scratch/validations/workspace/creisle/MAV89/output/A36971_genome/annotation/drawings/mavis_20_6.b-TSHZ2_b-RARS2.annotation-batch20170410r499868-2221.svg /projects/trans_scratch/validations/workspace/creisle/MAV89/output/A36971_genome/annotation/drawings/mavis_20_6.b-TSHZ2_b-RARS2.annotation-batch20170410r499868-2221.legend.json 20 52021562 52021562 L + None 6 88232570 88232570 L - None True False genome DELLY_v0.6.1;convert_ta.py_v0.0.1 contig 52020680-52021832 1051 T 0 0 88216583-88217735 738 0 0 0 seq31 1025 39 
HISEQX1_11:3:1109:17107:4684;HISEQX1_11:3:1115:28057:65529;HISEQX1_11:3:1118:19827:12068;HISEQX1_11:3:1118:26808:54366;HISEQX1_11:3:1118:26880:54242;HISEQX1_11:3:1203:24657:33551;HISEQX1_11:3:1211:14915:49162;HISEQX1_11:3:1224:29985:17500;HISEQX1_11:3:2109:10429:26097;HISEQX1_11:3:2110:30969:47193;HISEQX1_11:3:2119:30005:71682;HISEQX1_11:3:2119:31010:72086;HISEQX1_11:3:2123:23399:43185;HISEQX1_11:3:2203:6928:43062;HISEQX1_11:3:2206:18852:13826;HISEQX1_11:3:2213:28696:54330;HISEQX1_11:3:2213:9699:72121;HISEQX1_11:3:2216:11576:10890;HISEQX1_11:3:2223:4888:12420;HISEQX1_11:4:1116:7669:4192;HISEQX1_11:4:1205:5355:29332;HISEQX1_11:4:1216:24342:59727;HISEQX1_11:4:1223:29873:24409;HISEQX1_11:4:2102:9628:27415;HISEQX1_11:4:2115:26230:35397;HISEQX1_11:4:2202:27072:18116;HISEQX1_11:4:2206:4391:19223;HISEQX1_11:4:2207:24424:30509;HISEQX1_11:4:2211:20709:31283;HISEQX1_11:4:2211:21004:20612;HISEQX1_11:4:2213:18355:30439;HISEQX1_11:4:2213:19136:30070;HISEQX1_11:4:2217:11647:32531;HISEQX1_11:4:2219:26707:61398;HISEQX1_11:4:2219:28118:62083 43 CCCAACTGGATAATAAATTATAACAATTCTATTATCTGACTGCTTCTGTTCTTCCACGCACTCTTCGACATCCAATTTAAAACTTAAAGTTGGCCGGGCATGGCAGTTCATCCCTGTAATCTAGCATTTTGGGAGGCCGATGTGGGTGGATCACCTGAGGCCAGAAGTTCGAAACCAGCCTGGCCACCAGGGCGAAAACCTGTCTCTACAAAAATACAAAAATTAGCCGTATATGTGCATTTTTCTGGAGTTGAAGGTCCATAGATTTTTTCAGATACTTCAAAGGAGTACATGATACCCCTCCCCAACAAAAGTCCCCTATCTCTGGATTTATGCTTAAAATGAATGCATATTTTACAAAGCCA False 1 1 0 0 16 HISEQX1_11:3:1109:17107:4684;HISEQX1_11:3:1118:26808:54366;HISEQX1_11:3:1118:26880:54242;HISEQX1_11:3:1203:24657:33551;HISEQX1_11:3:1211:14915:49162;HISEQX1_11:3:1224:29985:17500;HISEQX1_11:3:2119:30005:71682;HISEQX1_11:3:2119:31010:72086;HISEQX1_11:3:2123:23399:43185;HISEQX1_11:3:2213:9699:72121;HISEQX1_11:3:2216:11576:10890;HISEQX1_11:4:2102:9628:27415;HISEQX1_11:4:2206:4391:19223;HISEQX1_11:4:2211:20709:31283;HISEQX1_11:4:2211:21004:20612;HISEQX1_11:4:2217:11647:32531 0 0 1 27 5 22 17 0 A36971 cluster-batch20170407r590869-449 1 validation-batch20170407r289453-199 
annotation-batch20170410r499868-2221 inverted translocation ENSG00000182463 5 ENSG00000146282 3 sense ENST00000371497 ENST00000369536 normal 4026 4763 "[{""name"": ""PF05746"", ""sequences"": [""LQYTHARLHSLEETFGCGYLNDFNTACLQEPQSVSILQHLLRFDEVLYKSSQDFQPRHIVSYLLTLSHLAAVAHKTLQIKDSPPEVAGARLHLFKAVRSVLANGMKLLGITPVCRM""], ""regions"": [{""start"": 130, ""end"": 245}], ""mapping_quality"": 100.0, ""matches"": 116}, {""name"": ""SSF47323"", ""sequences"": [""DTGVFLQYTHARLHSLEETFGCGYLNDFNTACLQEPQSVSILQHLLRFDEVLYKSSQDFQPRHIVSYLLTLSHLAAVAHKTLQIKDSPPEVAGARLHLFKAVRSVLANGMKLLGITPVCRM""], ""regions"": [{""start"": 125, ""end"": 245}], ""mapping_quality"": 100.0, ""matches"": 121}, {""name"": ""SM00836"", ""sequences"": [""LQYTHARLHSLEETFGCGYLNDFNTACLQEPQSVSILQHLLRFDEVLYKSSQDFQPRHIVSYLLTLSHLAAVAHKTLQIKDSPPEVAGARLHLFKAVRSVLANGMKLLGITPVCRM""], ""regions"": [{""start"": 130, ""end"": 245}], ""mapping_quality"": 100.0, ""matches"": 116}]" annotation-batch20170410r499868-2221_normal /projects/trans_scratch/validations/workspace/creisle/MAV89/output/A36971_genome/annotation/annotations.fusion-cdna.fa /projects/trans_scratch/validations/workspace/creisle/MAV89/output/A36971_genome/annotation/drawings/mavis_20_6.b-TSHZ2_b-RARS2.annotation-batch20170410r499868-2221.svg /projects/trans_scratch/validations/workspace/creisle/MAV89/output/A36971_genome/annotation/drawings/mavis_20_6.b-TSHZ2_b-RARS2.annotation-batch20170410r499868-2221.legend.json 20 52021562 52021562 L + None 6 88232570 88232570 L - None True False genome DELLY_v0.6.1;convert_ta.py_v0.0.1 contig 52020680-52021832 1051 T 0 0 88216583-88217735 738 0 0 0 seq31 1025 39 
HISEQX1_11:3:1109:17107:4684;HISEQX1_11:3:1115:28057:65529;HISEQX1_11:3:1118:19827:12068;HISEQX1_11:3:1118:26808:54366;HISEQX1_11:3:1118:26880:54242;HISEQX1_11:3:1203:24657:33551;HISEQX1_11:3:1211:14915:49162;HISEQX1_11:3:1224:29985:17500;HISEQX1_11:3:2109:10429:26097;HISEQX1_11:3:2110:30969:47193;HISEQX1_11:3:2119:30005:71682;HISEQX1_11:3:2119:31010:72086;HISEQX1_11:3:2123:23399:43185;HISEQX1_11:3:2203:6928:43062;HISEQX1_11:3:2206:18852:13826;HISEQX1_11:3:2213:28696:54330;HISEQX1_11:3:2213:9699:72121;HISEQX1_11:3:2216:11576:10890;HISEQX1_11:3:2223:4888:12420;HISEQX1_11:4:1116:7669:4192;HISEQX1_11:4:1205:5355:29332;HISEQX1_11:4:1216:24342:59727;HISEQX1_11:4:1223:29873:24409;HISEQX1_11:4:2102:9628:27415;HISEQX1_11:4:2115:26230:35397;HISEQX1_11:4:2202:27072:18116;HISEQX1_11:4:2206:4391:19223;HISEQX1_11:4:2207:24424:30509;HISEQX1_11:4:2211:20709:31283;HISEQX1_11:4:2211:21004:20612;HISEQX1_11:4:2213:18355:30439;HISEQX1_11:4:2213:19136:30070;HISEQX1_11:4:2217:11647:32531;HISEQX1_11:4:2219:26707:61398;HISEQX1_11:4:2219:28118:62083 43 CCCAACTGGATAATAAATTATAACAATTCTATTATCTGACTGCTTCTGTTCTTCCACGCACTCTTCGACATCCAATTTAAAACTTAAAGTTGGCCGGGCATGGCAGTTCATCCCTGTAATCTAGCATTTTGGGAGGCCGATGTGGGTGGATCACCTGAGGCCAGAAGTTCGAAACCAGCCTGGCCACCAGGGCGAAAACCTGTCTCTACAAAAATACAAAAATTAGCCGTATATGTGCATTTTTCTGGAGTTGAAGGTCCATAGATTTTTTCAGATACTTCAAAGGAGTACATGATACCCCTCCCCAACAAAAGTCCCCTATCTCTGGATTTATGCTTAAAATGAATGCATATTTTACAAAGCCA False 1 1 0 0 16 HISEQX1_11:3:1109:17107:4684;HISEQX1_11:3:1118:26808:54366;HISEQX1_11:3:1118:26880:54242;HISEQX1_11:3:1203:24657:33551;HISEQX1_11:3:1211:14915:49162;HISEQX1_11:3:1224:29985:17500;HISEQX1_11:3:2119:30005:71682;HISEQX1_11:3:2119:31010:72086;HISEQX1_11:3:2123:23399:43185;HISEQX1_11:3:2213:9699:72121;HISEQX1_11:3:2216:11576:10890;HISEQX1_11:4:2102:9628:27415;HISEQX1_11:4:2206:4391:19223;HISEQX1_11:4:2211:20709:31283;HISEQX1_11:4:2211:21004:20612;HISEQX1_11:4:2217:11647:32531 0 0 1 27 5 22 17 0 diff --git a/tests/data/pairing_reference_annotations_file.tab 
b/tests/data/pairing_reference_annotations_file.tab index df444af8..108d204e 100644 --- a/tests/data/pairing_reference_annotations_file.tab +++ b/tests/data/pairing_reference_annotations_file.tab @@ -2,4 +2,4 @@ ## input file for picking best transcript: ens69_best_transcript.txt ## Ensembl Api version 69 ## generated at: Thu Aug 4 16:38:01 2016 -#ensembl_gene_id hugo_names chr strand gene_start gene_end best_ensembl_transcript_id ensembl_transcript_id refseq_equivalents transcript_genomic_start transcript_genomic_end cdna_coding_start cdna_coding_end genomic_exon_ranges AA_domain_ranges +ensembl_gene_id hugo_names chr strand gene_start gene_end best_ensembl_transcript_id ensembl_transcript_id refseq_equivalents transcript_genomic_start transcript_genomic_end cdna_coding_start cdna_coding_end genomic_exon_ranges AA_domain_ranges diff --git a/tests/end_to_end/test_convert.py b/tests/end_to_end/test_convert.py index 2b110802..a1d33be2 100644 --- a/tests/end_to_end/test_convert.py +++ b/tests/end_to_end/test_convert.py @@ -93,7 +93,7 @@ def test_manta(self): self.assertEqual(17396810, bpp.break2.end) self.assertEqual(ORIENT.LEFT, bpp.break2.orient) somatic_event = result['manta-MantaDEL:20644:0:2:0:0:0'][0] - self.assertEqual('True', somatic_event.data.get('SOMATIC', False)) + self.assertEqual(True, somatic_event.data.get('SOMATIC', False)) def test_pindel(self): self.run_main(get_data('pindel_events.vcf'), SUPPORTED_TOOL.PINDEL, False) diff --git a/tests/integration/test_annotate.py b/tests/integration/test_annotate.py index c2086654..9532665c 100644 --- a/tests/integration/test_annotate.py +++ b/tests/integration/test_annotate.py @@ -2,27 +2,25 @@ import unittest from mavis.annotate.base import BioInterval, ReferenceName -from mavis.annotate.file_io import load_reference_genes, load_reference_genome -from mavis.annotate.genomic import Exon, Gene, Template, Transcript, PreTranscript -from mavis.annotate.protein import calculate_orf, Domain, DomainRegion, translate, 
Translation +from mavis.annotate.file_io import load_annotations, load_reference_genome +from mavis.annotate.fusion import FusionTranscript, determine_prime +from mavis.annotate.genomic import Exon, Gene, PreTranscript, Template, Transcript +from mavis.annotate.protein import Domain, DomainRegion, Translation, calculate_orf, translate from mavis.annotate.variant import ( + Annotation, _gather_annotations, _gather_breakpoint_annotations, annotate_events, - Annotation, flatten_fusion_transcript, overlapping_transcripts, ) -from mavis.annotate.fusion import determine_prime, FusionTranscript -from mavis.annotate.constants import SPLICE_TYPE from mavis.breakpoint import Breakpoint, BreakpointPair -from mavis.constants import ORIENT, PRIME, PROTOCOL, reverse_complement, STRAND, SVTYPE +from mavis.constants import ORIENT, PRIME, PROTOCOL, SPLICE_TYPE, STRAND, SVTYPE, reverse_complement from mavis.error import NotSpecifiedError from mavis.interval import Interval -from . import MockLongString, MockObject, get_example_genes from ..util import get_data - +from . 
import MockLongString, MockObject, get_example_genes REFERENCE_ANNOTATIONS = None REFERENCE_GENOME = None @@ -33,7 +31,7 @@ def setUpModule(): global REFERENCE_ANNOTATIONS, REFERENCE_GENOME, REF_CHR, EXAMPLE_GENES EXAMPLE_GENES = get_example_genes() - REFERENCE_ANNOTATIONS = load_reference_genes(get_data('mock_reference_annotations.tsv')) + REFERENCE_ANNOTATIONS = load_annotations(get_data('mock_reference_annotations.tsv')) count = sum([len(genes) for genes in REFERENCE_ANNOTATIONS.values()]) print('loaded annotations', count) assert count >= 6 # make sure this is the file we expect @@ -1461,13 +1459,13 @@ def test_reference_name_dict(self): self.assertEqual(1, len(d)) def test_loading_json_annotations(self): - annotations = load_reference_genes(get_data('mock_reference_annotations.json')) + annotations = load_annotations(get_data('mock_reference_annotations.json')) self.assertEqual(1, len(annotations.keys())) self.assertEqual(1, len(list(annotations.values())[0])) def test_loading_annotations_not_found(self): with self.assertRaises(FileNotFoundError): - load_reference_genes('file.other') + load_annotations('file.other') def test_determine_prime(self): tneg = PreTranscript(exons=[(3, 4)], strand=STRAND.NEG) @@ -1558,9 +1556,7 @@ def test_calculate_orf_nested(self): class TestAnnotateEvents(unittest.TestCase): def test_annotate_events(self): - reference_annotations = load_reference_genes( - get_data('mock_reference_annotations.full.tsv') - ) + reference_annotations = load_annotations(get_data('mock_reference_annotations.full.tsv')) b1 = Breakpoint('fakereference9', 658, orient=ORIENT.RIGHT, strand=STRAND.POS) b2 = Breakpoint('fakereference9', 10237, orient=ORIENT.RIGHT, strand=STRAND.NEG) bpp = BreakpointPair( diff --git a/tests/integration/test_annotate_examples.py b/tests/integration/test_annotate_examples.py index 3a57d5a5..1e28f845 100644 --- a/tests/integration/test_annotate_examples.py +++ b/tests/integration/test_annotate_examples.py @@ -1,19 +1,18 @@ import 
os import unittest +from mavis.annotate.fusion import FusionTranscript from mavis.annotate.variant import ( - annotate_events, Annotation, - flatten_fusion_transcript, - call_protein_indel, IndelCall, + annotate_events, + call_protein_indel, + flatten_fusion_transcript, ) -from mavis.annotate.fusion import FusionTranscript -from mavis.annotate.constants import SPLICE_TYPE from mavis.breakpoint import Breakpoint, BreakpointPair -from mavis.constants import ORIENT, PROTOCOL, STRAND, SVTYPE +from mavis.constants import ORIENT, PROTOCOL, SPLICE_TYPE, STRAND, SVTYPE -from . import get_example_genes, MockObject, MockLongString +from . import MockLongString, MockObject, get_example_genes def get_best(gene): diff --git a/tests/integration/test_bam.py b/tests/integration/test_bam.py index f29453df..0712bc57 100644 --- a/tests/integration/test_bam.py +++ b/tests/integration/test_bam.py @@ -1,10 +1,11 @@ import logging import os import unittest -from unittest import mock import warnings +from unittest import mock -from mavis.annotate.file_io import load_reference_genes, load_reference_genome +import timeout_decorator +from mavis.annotate.file_io import load_annotations, load_reference_genome from mavis.bam import cigar as _cigar from mavis.bam import read as _read from mavis.bam.cache import BamCache @@ -14,22 +15,20 @@ read_pair_type, sequenced_strand, ) -from mavis.bam.stats import compute_genome_bam_stats, compute_transcriptome_bam_stats, Histogram +from mavis.bam.stats import Histogram, compute_genome_bam_stats, compute_transcriptome_bam_stats from mavis.constants import ( CIGAR, DNA_ALPHABET, + NA_MAPPING_QUALITY, ORIENT, READ_PAIR_TYPE, STRAND, SVTYPE, - NA_MAPPING_QUALITY, ) from mavis.interval import Interval -import timeout_decorator -from . import MockRead, MockBamFileHandle from ..util import get_data - +from . 
import MockBamFileHandle, MockRead REFERENCE_GENOME = None @@ -463,7 +462,7 @@ def test_genome_bam_stats(self): def test_trans_bam_stats(self): bamfh = BamCache(get_data('mock_trans_reads_for_events.sorted.bam')) - annotations = load_reference_genes(get_data('mock_annotations.json')) + annotations = load_annotations(get_data('mock_annotations.json')) stats = compute_transcriptome_bam_stats( bamfh, annotations, diff --git a/tests/integration/test_splicing.py b/tests/integration/test_splicing.py index 55503882..a80adb11 100644 --- a/tests/integration/test_splicing.py +++ b/tests/integration/test_splicing.py @@ -1,13 +1,13 @@ import os import unittest -from mavis.annotate.constants import SPLICE_SITE_RADIUS, SPLICE_TYPE +from mavis.annotate.constants import SPLICE_SITE_RADIUS from mavis.annotate.file_io import load_annotations, load_reference_genome from mavis.annotate.genomic import Exon, PreTranscript from mavis.annotate.splicing import predict_splice_sites from mavis.annotate.variant import annotate_events from mavis.breakpoint import Breakpoint, BreakpointPair -from mavis.constants import PROTOCOL, reverse_complement, STRAND, SVTYPE +from mavis.constants import PROTOCOL, SPLICE_TYPE, STRAND, SVTYPE, reverse_complement from mavis.interval import Interval from . 
import DATA_DIR, MockLongString, MockObject, get_example_genes diff --git a/tests/unit/test_tab.py b/tests/unit/test_tab.py deleted file mode 100644 index b510c635..00000000 --- a/tests/unit/test_tab.py +++ /dev/null @@ -1,287 +0,0 @@ -import unittest -from tab import FileTransform, cast_boolean, cast_null - - -class MockFileTransform: - def __init__(self, h, **kwargs): - self.input = h - self.require = kwargs.pop('require', []) - self.rename = kwargs.pop('rename', {}) - self.drop = kwargs.pop('drop', []) - self.add = kwargs.pop('add', {}) - self.add_default = kwargs.pop('add_default', {}) - self.split = kwargs.pop('split', {}) - self.combine = kwargs.pop('combine', {}) - self.validate = kwargs.pop('validate', {}) - self.cast = kwargs.pop('cast', {}) - self.simplify = kwargs.pop('simplify', False) - self.in_ = kwargs.pop('in_', {}) - - def transform_line(self, *pos, **kwargs): - return FileTransform.transform_line(self, *pos, **kwargs) - - -class TestCast(unittest.TestCase): - def test_cast_boolean_true(self): - self.assertEqual(True, cast_boolean('+')) - self.assertEqual(True, cast_boolean('T')) - self.assertEqual(True, cast_boolean('true')) - self.assertEqual(True, cast_boolean('y')) - self.assertEqual(True, cast_boolean(1)) - - def test_cast_boolean_false(self): - self.assertEqual(False, cast_boolean('-')) - self.assertEqual(False, cast_boolean('f')) - self.assertEqual(False, cast_boolean('false')) - self.assertEqual(False, cast_boolean('n')) - self.assertEqual(False, cast_boolean(0)) - - def test_cast_boolean_error(self): - with self.assertRaises(TypeError): - cast_boolean(2) - - def test_cast_null_ok(self): - self.assertEqual(None, cast_null('none')) - self.assertEqual(None, cast_null(None)) - - def test_cast_null_error(self): - with self.assertRaises(TypeError): - cast_null('f') - - -class TestFileTransform(unittest.TestCase): - def test_simplify(self): - h = ['a', 'b', 'c'] - ft = FileTransform(header=h) - self.assertEqual(h, ft.input) - self.assertEqual(h, 
ft.header) - ft = FileTransform(h, simplify=True) - self.assertEqual(h, ft.input) - self.assertEqual([], ft.header) - - def test_require_simplify(self): - h = ['a', 'b', 'c'] - ft = FileTransform(header=h, require=['a'], simplify=True) - self.assertEqual(h, ft.input) - self.assertEqual(['a'], ft.header) - - def test_require_error(self): - h = ['a', 'b', 'c'] - with self.assertRaises(KeyError): - FileTransform(header=h, require=['k']) - - def test_rename(self): - h = ['a', 'b', 'c'] - ft = FileTransform(h, rename={'a': ['k', 'm']}) - self.assertEqual(h, ft.input) - self.assertEqual(['a', 'b', 'c', 'k', 'm'], ft.header) - - def test_rename_error(self): - h = ['a', 'b', 'c'] - with self.assertRaises(KeyError): - FileTransform(header=h, rename={'k': ['t']}) - - def test_cast_error(self): - h = ['b', 'c'] - with self.assertRaises(KeyError): - FileTransform(h, cast={'a': int}) - - def test_add(self): - h = ['a', 'b', 'c'] - ft = FileTransform(h, add_default={'k': 1}) - self.assertEqual(h, ft.input) - self.assertEqual(['a', 'b', 'c', 'k'], ft.header) - - def test_require__in(self): - h = ['a', 'b', 'c'] - ft = FileTransform(h, require=['c'], in_={'a': []}, simplify=True) - self.assertEqual(h, ft.input) - self.assertEqual(['a', 'c'], ft.header) - - def test_combine(self): - h = ['a', 'b', 'c'] - ft = FileTransform(h, combine={'k': '{a}{b}{c}'}) - self.assertEqual(ft.input, h) - self.assertEqual(ft.header, h + ['k']) - - def test_combine_error_name_conflict(self): - h = ['a', 'b', 'c'] - with self.assertRaises(KeyError): - FileTransform(h, combine={'b': '{a}{b}{c}'}) - - def test_combine_keyerror(self): - h = ['a', 'b', 'c'] - with self.assertRaises(KeyError): - FileTransform(h, combine={'k': '{m}{b}{c}'}) - - def test_duplicate_input_column(self): - with self.assertRaises(KeyError): - FileTransform(['a', 'a']) - - def test_validate_missing_column(self): - with self.assertRaises(KeyError): - FileTransform(['a', 'b'], validate={'c': ''}) - - def 
test_drop_and_require_error(self): - with self.assertRaises(AssertionError): - FileTransform(['a'], require=['a'], drop=['a']) - - def test_membership_of_missing_column_error(self): - with self.assertRaises(KeyError): - FileTransform(['a'], in_={'x': []}) - - def test_membership_bad_object(self): - with self.assertRaises(TypeError): - FileTransform(['a'], in_={'a': 1}) - - def test_cast_noncallable_error(self): - FileTransform(['a'], cast={'a': int}) - with self.assertRaises(TypeError): - FileTransform(['a'], cast={'a': 1}) - - def test_split_missing_column_error(self): - FileTransform(['a'], split={'a': r'^(?P\w+)'}) - with self.assertRaises(KeyError): - FileTransform(['a'], split={'x': r'^(?P\w+)'}) - - def test_split_duplicate_column_error(self): - FileTransform(['a', 'b'], split={'a': r'^(?P\w+)'}) - with self.assertRaises(KeyError): - FileTransform(['a', 'b'], require=['b'], split={'a': r'^(?P\w+)'}) - - def test_add(self): - ft = FileTransform(['a', 'b'], add={'c': 1}) - self.assertEqual(['a', 'b', 'c'], ft.header) - - def test_add_default(self): - ft = FileTransform(['a', 'b'], add_default={'c': 1}) - self.assertEqual(['a', 'b', 'c'], ft.header) - ft = FileTransform(['a', 'b'], add_default={'b': 1}) - self.assertEqual(['a', 'b'], ft.header) - - def test_require(self): - ft = FileTransform(['a', 'b'], require=['a']) - self.assertEqual(['a', 'b'], ft.header) - - ft = FileTransform(['a', 'b'], require=['a'], simplify=True) - self.assertEqual(['a'], ft.header) - - def test_invalid_option(self): - with self.assertRaises(TypeError): - FileTransform(['a', 'b'], require=['a'], blargh=1) - - -class TestTransformLine(unittest.TestCase): - def test_add(self): - h = ['a', 'b', 'c'] - ft = MockFileTransform(h, add={'a': 'blargh'}) - row = ft.transform_line(['1', '2', '3']) - self.assertEqual('blargh', row['a']) - - h = ['a', 'b', 'c'] - ft = MockFileTransform(h, add={'x': 'blargh'}) - row = ft.transform_line(['1', '2', '3']) - self.assertEqual('blargh', row['x']) - - def 
test_combine(self): - h = ['a', 'b', 'c'] - ft = MockFileTransform(h, combine={'k': '{a}{b}{c}'}) - row = ft.transform_line(['1', '2', '3']) - self.assertEqual('123', row['k']) - - def test_combine_then_cast(self): - h = ['a', 'b', 'c'] - ft = MockFileTransform(h, combine={'k': '{a}{b}{c}'}, cast={'k': int}) - row = ft.transform_line(['1', '2', '3']) - self.assertEqual(123, row['k']) - - def test_cast_to_cast_boolean(self): - h = ['a', 'b', 'c'] - ft = MockFileTransform(h, cast={'a': cast_boolean, 'b': cast_boolean}) - row = ft.transform_line(['1', '0', '3']) - self.assertEqual(True, row['a']) - self.assertEqual(False, row['b']) - - def test_split_combine_cast(self): - h = ['a', 'b', 'c'] - ft = MockFileTransform( - h, - split={'a': r'^(?P\d+)_(?P\d+)$'}, - combine={'k': '{a1}{b}{c}'}, - cast={'k': int}, - ) - row = ft.transform_line(['1_10', '2', '3']) - self.assertEqual(123, row['k']) - - def test_add_default(self): - h = ['a', 'b', 'c'] - ft = MockFileTransform(h, add_default={'k': 1}) - line = ['1', '2', '3'] - row = ft.transform_line(line) - self.assertEqual(1, row['k']) - self.assertEqual('1', row['a']) - self.assertEqual('2', row['b']) - self.assertEqual('3', row['c']) - - def test_add_default_override_default(self): - h = ['a', 'b', 'c'] - ft = MockFileTransform(h, add_default={'a': 8}, in_={'a': ['1']}) - line = ['1', '2', '3'] - row = ft.transform_line(line) - self.assertEqual('1', row['a']) - self.assertEqual('2', row['b']) - self.assertEqual('3', row['c']) - - def test_validate(self): - h = ['a'] - ft = MockFileTransform(h, validate={'a': r'^[t]+$'}) - line = ['ttttt'] - row = ft.transform_line(line) - self.assertEqual('ttttt', row['a']) - - def test_rename(self): - h = ['a'] - ft = MockFileTransform(h, rename={'a': ['b', 'c']}) - line = ['ttttt'] - row = ft.transform_line(line) - self.assertEqual('ttttt', row['a']) - self.assertEqual('ttttt', row['b']) - self.assertEqual('ttttt', row['c']) - - def test_length_mismatch_error(self): - h = ['a', 'b'] - ft 
= MockFileTransform(h) - line = ['ttttt'] - with self.assertRaises(AssertionError): - ft.transform_line(line) - - def test_rename_drop_original(self): - h = ['a'] - ft = MockFileTransform(h, rename={'a': ['b', 'c']}, drop=['a']) - line = ['ttttt'] - row = ft.transform_line(line) - self.assertTrue('a' not in row) - self.assertEqual('ttttt', row['b']) - self.assertEqual('ttttt', row['c']) - - ft = MockFileTransform(h, rename={'a': ['b', 'c']}, simplify=True) - row = ft.transform_line(line) - self.assertTrue('a' not in row) - self.assertEqual('ttttt', row['b']) - self.assertEqual('ttttt', row['c']) - - def test_split(self): - h = ['a'] - ft = MockFileTransform(h, split={'a': r'^(?P\d+)[_]+(?P\d+)$'}) - row = ft.transform_line(['1_2']) - self.assertEqual('1', row['a1']) - self.assertEqual('2', row['a2']) - row = ft.transform_line(['1__2']) - self.assertEqual('1', row['a1']) - self.assertEqual('2', row['a2']) - with self.assertRaises(UserWarning): - ft.transform_line(['_1__4']) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/unit/test_tool.py b/tests/unit/test_tool.py index fffaa36a..72c0d2e9 100644 --- a/tests/unit/test_tool.py +++ b/tests/unit/test_tool.py @@ -1,5 +1,6 @@ import unittest +import pytest from mavis.constants import COLUMNS, ORIENT, STRAND, SVTYPE from mavis.tools import SUPPORTED_TOOL, _convert_tool_row, _parse_transabyss from mavis.tools.vcf import convert_record as _parse_vcf_record @@ -8,7 +9,7 @@ from .mock import Mock -class TestDelly(unittest.TestCase): +class TestDelly: def test_convert_insertion(self): row = Mock( chrom='1', @@ -25,28 +26,28 @@ def test_convert_insertion(self): alts=[], ) bpp_list = _convert_tool_row(_parse_vcf_record(row)[0], SUPPORTED_TOOL.DELLY, False) - self.assertEqual(1, len(bpp_list)) - bpp = bpp_list[0] - self.assertEqual('1', bpp.break1.chr) - self.assertEqual(247760043 - 10, bpp.break1.start) - self.assertEqual(247760043 + 10, bpp.break1.end) - self.assertEqual(ORIENT.LEFT, bpp.break1.orient) - 
self.assertEqual(STRAND.NS, bpp.break1.strand) - self.assertEqual(247760044 - 10, bpp.break2.start) - self.assertEqual(247760044 + 10, bpp.break2.end) - self.assertEqual(ORIENT.RIGHT, bpp.break2.orient) - self.assertEqual(STRAND.NS, bpp.break2.strand) - self.assertEqual('1', bpp.break2.chr) - self.assertEqual(SVTYPE.INS, bpp.event_type) - self.assertEqual(None, bpp.untemplated_seq) + assert len(bpp_list) == 1 + bpp = bpp_list[0] + assert bpp.break1.chr == '1' + assert bpp.break1.start == 247760043 - 10 + assert bpp.break1.end == 247760043 + 10 + assert bpp.break1.orient == ORIENT.LEFT + assert bpp.break1.strand == STRAND.NS + assert bpp.break2.start == 247760044 - 10 + assert bpp.break2.end == 247760044 + 10 + assert bpp.break2.orient == ORIENT.RIGHT + assert bpp.break2.strand == STRAND.NS + assert bpp.break2.chr == '1' + assert bpp.event_type == SVTYPE.INS + assert bpp.untemplated_seq == None bpp_list = _convert_tool_row( _parse_vcf_record(row)[0], SUPPORTED_TOOL.DELLY, False, assume_no_untemplated=True ) - self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual(None, bpp.untemplated_seq) - self.assertNotEqual('', bpp.untemplated_seq) + assert bpp.untemplated_seq == None + assert bpp.untemplated_seq != '' def test_convert_convert_translocation(self): row = Mock( @@ -66,43 +67,43 @@ def test_convert_convert_translocation(self): bpp_list = _convert_tool_row(_parse_vcf_record(row)[0], SUPPORTED_TOOL.DELLY, False) for b in bpp_list: print(b) - self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 row.info['CT'] = 'NtoN' bpp_list = _convert_tool_row(_parse_vcf_record(row)[0], SUPPORTED_TOOL.DELLY, False) for b in bpp_list: print(b) - self.assertEqual(4, len(bpp_list)) + assert len(bpp_list) == 4 -class TestCnvNator(unittest.TestCase): +class TestCnvNator: def test_convert_deletion(self): row = {'event_type': 'deletion', 'coordinates': '1:1-10000'} bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.CNVNATOR, False) - 
self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual(1, bpp.break1.start) - self.assertEqual(1, bpp.break1.end) - self.assertEqual(10000, bpp.break2.start) - self.assertEqual(10000, bpp.break2.start) - self.assertEqual(SVTYPE.DEL, bpp.event_type) - self.assertEqual('1', bpp.break1.chr) - self.assertEqual('1', bpp.break2.chr) + assert bpp.break1.start == 1 + assert bpp.break1.end == 1 + assert bpp.break2.start == 10000 + assert bpp.break2.start == 10000 + assert bpp.event_type == SVTYPE.DEL + assert bpp.break1.chr == '1' + assert bpp.break2.chr == '1' def test_convert_duplication(self): row = {'event_type': 'duplication', 'coordinates': '1:1-10000'} bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.CNVNATOR, False) - self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual(1, bpp.break1.start) - self.assertEqual(1, bpp.break1.end) - self.assertEqual(10000, bpp.break2.start) - self.assertEqual(10000, bpp.break2.start) - self.assertEqual(SVTYPE.DUP, bpp.event_type) - self.assertEqual('1', bpp.break1.chr) - self.assertEqual('1', bpp.break2.chr) + assert bpp.break1.start == 1 + assert bpp.break1.end == 1 + assert bpp.break2.start == 10000 + assert bpp.break2.start == 10000 + assert bpp.event_type == SVTYPE.DUP + assert bpp.break1.chr == '1' + assert bpp.break2.chr == '1' -class TestStarFusion(unittest.TestCase): +class TestStarFusion: def test_convert_standard_event(self): row = { 'FusionName': 'GAS6--RASA3', @@ -111,14 +112,14 @@ def test_convert_standard_event(self): } bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.STARFUSION, True) - self.assertEqual(2, len(bpp_list)) + assert len(bpp_list) == 2 bpp = bpp_list[0] - self.assertEqual('chr13', bpp.break1.chr) - self.assertEqual('chr13', bpp.break2.chr) - self.assertEqual(114529969, bpp.break1.start) - self.assertEqual(114751269, bpp.break2.start) - self.assertEqual(False, bpp.opposing_strands) - self.assertEqual(True, 
bpp.stranded) + assert bpp.break1.chr == 'chr13' + assert bpp.break2.chr == 'chr13' + assert bpp.break1.start == 114529969 + assert bpp.break2.start == 114751269 + assert bpp.opposing_strands == False + assert bpp.stranded == True def test_convert_translocation(self): row = { @@ -128,22 +129,22 @@ def test_convert_translocation(self): } bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.STARFUSION, True) - self.assertEqual(2, len(bpp_list)) + assert len(bpp_list) == 2 bpp = bpp_list[0] - self.assertEqual('chr17', bpp.break1.chr) - self.assertEqual('chr20', bpp.break2.chr) - self.assertEqual(59445688, bpp.break1.start) - self.assertEqual(49411710, bpp.break2.start) - self.assertEqual(False, bpp.opposing_strands) - self.assertEqual(True, bpp.stranded) + assert bpp.break1.chr == 'chr17' + assert bpp.break2.chr == 'chr20' + assert bpp.break1.start == 59445688 + assert bpp.break2.start == 49411710 + assert bpp.opposing_strands == False + assert bpp.stranded == True def test_malformed(self): row = {'FusionName': 'BCAS4--BCAS3', 'LeftBreakpoint': '', 'RightBreakpoint': None} - with self.assertRaises(AssertionError): + with pytest.raises(AssertionError): _convert_tool_row(row, SUPPORTED_TOOL.STARFUSION, False) -class TestTransAbyss(unittest.TestCase): +class TestTransAbyss: def test_convert_stranded_indel_insertion(self): row = { 'chr': '1', @@ -155,16 +156,16 @@ def test_convert_stranded_indel_insertion(self): 'id': 1, } bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.TA, True) - self.assertEqual(2, len(bpp_list)) + assert len(bpp_list) == 2 bpp = bpp_list[0] - self.assertEqual('1', bpp.break1.chr) - self.assertEqual('1', bpp.break2.chr) - self.assertEqual(10015, bpp.break1.start) - self.assertEqual(10016, bpp.break2.start) - self.assertEqual(SVTYPE.INS, bpp.event_type) - self.assertEqual(False, bpp.opposing_strands) - self.assertEqual(True, bpp.stranded) - self.assertEqual('AAT', bpp.untemplated_seq) + assert bpp.break1.chr == '1' + assert bpp.break2.chr == '1' + assert 
bpp.break1.start == 10015 + assert bpp.break2.start == 10016 + assert bpp.event_type == SVTYPE.INS + assert bpp.opposing_strands == False + assert bpp.stranded == True + assert bpp.untemplated_seq == 'AAT' def test_convert_indel_deletion(self): row = { @@ -182,9 +183,9 @@ def test_convert_indel_deletion(self): print(_convert_tool_row) for bpp in bpp_list: print(bpp) - self.assertEqual(2, len(bpp_list)) + assert len(bpp_list) == 2 bpp = bpp_list[0] - self.assertEqual('', bpp.untemplated_seq) + assert bpp.untemplated_seq == '' def test_convert_indel_unstranded_insertion(self): row = { @@ -199,15 +200,15 @@ def test_convert_indel_unstranded_insertion(self): } bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.TA, False) print([str(b) for b in bpp_list]) - self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual(SVTYPE.INS, bpp.event_type) - self.assertEqual(STRAND.NS, bpp.break1.strand) - self.assertEqual(STRAND.NS, bpp.break2.strand) - self.assertEqual(False, bpp.stranded) - self.assertEqual(False, bpp.opposing_strands) - self.assertEqual('TT', bpp.untemplated_seq) + assert bpp.event_type == SVTYPE.INS + assert bpp.break1.strand == STRAND.NS + assert bpp.break2.strand == STRAND.NS + assert bpp.stranded == False + assert bpp.opposing_strands == False + assert bpp.untemplated_seq == 'TT' def test_convert_indel_duplication(self): row = { @@ -222,15 +223,15 @@ def test_convert_indel_duplication(self): } bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.TA, False) print([str(b) for b in bpp_list]) - self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual(SVTYPE.DUP, bpp.event_type) - self.assertEqual(STRAND.NS, bpp.break1.strand) - self.assertEqual(STRAND.NS, bpp.break2.strand) - self.assertEqual(False, bpp.stranded) - self.assertEqual(False, bpp.opposing_strands) - self.assertEqual('', bpp.untemplated_seq) + assert bpp.event_type == SVTYPE.DUP + assert bpp.break1.strand == STRAND.NS + 
assert bpp.break2.strand == STRAND.NS + assert bpp.stranded == False + assert bpp.opposing_strands == False + assert bpp.untemplated_seq == '' def test_convert_translocation(self): raise unittest.SkipTest('TODO') @@ -246,7 +247,7 @@ def test_convert_stranded_translocation(self): 'id': 1, } bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.TA, True) - self.assertEqual(2, len(bpp_list)) + assert len(bpp_list) == 2 def test_parse_stranded_translocation(self): row = { @@ -260,10 +261,10 @@ def test_parse_stranded_translocation(self): } std = _parse_transabyss(row) print(std) - self.assertTrue('event_type' not in std) + assert 'event_type' not in std -class TestManta(unittest.TestCase): +class TestManta: def test_convert_deletion(self): row = Mock( chrom='21', @@ -274,16 +275,16 @@ def test_convert_deletion(self): alts=[], ) bpp_list = _convert_tool_row(_parse_vcf_record(row)[0], SUPPORTED_TOOL.MANTA, False) - self.assertEqual(1, len(bpp_list)) - bpp = bpp_list[0] - self.assertEqual('21', bpp.break1.chr) - self.assertEqual(9412306, bpp.break1.start) - self.assertEqual(9412310, bpp.break1.end) - self.assertEqual(9412400, bpp.break2.start) - self.assertEqual(9412404, bpp.break2.end) - self.assertEqual('21', bpp.break2.chr) + assert len(bpp_list) == 1 + bpp = bpp_list[0] + assert bpp.break1.chr == '21' + assert bpp.break1.start == 9412306 + assert bpp.break1.end == 9412310 + assert bpp.break2.start == 9412400 + assert bpp.break2.end == 9412404 + assert bpp.break2.chr == '21' print(bpp, bpp.data['tracking_id']) - self.assertEqual('manta-MantaDEL:20644:0:2:0:0:0', bpp.data['tracking_id']) + assert bpp.data['tracking_id'] == 'manta-MantaDEL:20644:0:2:0:0:0' def test_convert_duplication(self): row = Mock( @@ -295,11 +296,11 @@ def test_convert_duplication(self): alts=[], ) bpp_list = _convert_tool_row(_parse_vcf_record(row)[0], SUPPORTED_TOOL.MANTA, False) - self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual('1', bpp.break1.chr) 
- self.assertEqual('1', bpp.break2.chr) - self.assertEqual('manta-MantaDUP:TANDEM:22477:0:1:0:9:0', bpp.data['tracking_id']) + assert bpp.break1.chr == '1' + assert bpp.break2.chr == '1' + assert bpp.data['tracking_id'] == 'manta-MantaDUP:TANDEM:22477:0:1:0:9:0' def test_non_trans_bnd(self): row = Mock( @@ -319,16 +320,16 @@ def test_non_trans_bnd(self): ) vcf_list = _parse_vcf_record(row) bpp_list = _convert_tool_row(vcf_list[0], SUPPORTED_TOOL.MANTA, False) - self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual('1', bpp.break1.chr) - self.assertEqual('1', bpp.break2.chr) - self.assertEqual(17051724, bpp.break1.start) - self.assertEqual(234912188, bpp.break2.start) - self.assertEqual('R', bpp.break1.orient) - self.assertEqual('R', bpp.break2.orient) - self.assertEqual('manta-MantaBND:207:0:1:0:0:0:0', bpp.data['tracking_id']) - self.assertEqual(1, len(bpp_list)) + assert bpp.break1.chr == '1' + assert bpp.break2.chr == '1' + assert bpp.break1.start == 17051724 + assert bpp.break2.start == 234912188 + assert bpp.break1.orient == 'R' + assert bpp.break2.orient == 'R' + assert bpp.data['tracking_id'] == 'manta-MantaBND:207:0:1:0:0:0:0' + assert len(bpp_list) == 1 def test_non_trans_bnd_from_mate(self): row = Mock( @@ -348,19 +349,19 @@ def test_non_trans_bnd_from_mate(self): ) vcf_list = _parse_vcf_record(row) bpp_list = _convert_tool_row(vcf_list[0], SUPPORTED_TOOL.MANTA, False) - self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual('1', bpp.break1.chr) - self.assertEqual('1', bpp.break2.chr) - self.assertEqual(17051724, bpp.break1.start) - self.assertEqual(234912188, bpp.break2.start) - self.assertEqual('R', bpp.break1.orient) - self.assertEqual('R', bpp.break2.orient) - self.assertEqual('manta-MantaBND:207:0:1:0:0:0:1', bpp.data['tracking_id']) - self.assertEqual(1, len(bpp_list)) + assert bpp.break1.chr == '1' + assert bpp.break2.chr == '1' + assert bpp.break1.start == 
17051724 + assert bpp.break2.start == 234912188 + assert bpp.break1.orient == 'R' + assert bpp.break2.orient == 'R' + assert bpp.data['tracking_id'] == 'manta-MantaBND:207:0:1:0:0:0:1' + assert len(bpp_list) == 1 -class TestDefuse(unittest.TestCase): +class TestDefuse: def test_convert_inverted_translocation(self): row = { 'gene_chromosome1': 'X', @@ -372,18 +373,18 @@ def test_convert_inverted_translocation(self): 'cluster_id': 1, } bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.DEFUSE, False) - self.assertEqual(1, len(bpp_list)) - bpp = bpp_list[0] - self.assertEqual('3', bpp.break1.chr) - self.assertEqual('X', bpp.break2.chr) - self.assertEqual(50294136, bpp.break1.start) - self.assertEqual(153063989, bpp.break2.start) - self.assertEqual(None, bpp.event_type) - self.assertEqual(False, bpp.opposing_strands) - self.assertEqual(ORIENT.RIGHT, bpp.break1.orient) - self.assertEqual(ORIENT.LEFT, bpp.break2.orient) - self.assertEqual(False, bpp.stranded) - self.assertEqual('defuse-1', bpp.data['tracking_id']) + assert len(bpp_list) == 1 + bpp = bpp_list[0] + assert bpp.break1.chr == '3' + assert bpp.break2.chr == 'X' + assert bpp.break1.start == 50294136 + assert bpp.break2.start == 153063989 + assert bpp.event_type == None + assert bpp.opposing_strands == False + assert bpp.break1.orient == ORIENT.RIGHT + assert bpp.break2.orient == ORIENT.LEFT + assert bpp.stranded == False + assert bpp.data['tracking_id'] == 'defuse-1' def test_convert_translocation(self): row = { @@ -396,18 +397,18 @@ def test_convert_translocation(self): 'cluster_id': 1, } bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.DEFUSE, False) - self.assertEqual(1, len(bpp_list)) - bpp = bpp_list[0] - self.assertEqual('3', bpp.break1.chr) - self.assertEqual('X', bpp.break2.chr) - self.assertEqual(50294136, bpp.break1.start) - self.assertEqual(153063989, bpp.break2.start) - self.assertEqual(None, bpp.event_type) - self.assertEqual(True, bpp.opposing_strands) - self.assertEqual(ORIENT.LEFT, 
bpp.break1.orient) - self.assertEqual(ORIENT.LEFT, bpp.break2.orient) - self.assertEqual(False, bpp.stranded) - self.assertEqual('defuse-1', bpp.data['tracking_id']) + assert len(bpp_list) == 1 + bpp = bpp_list[0] + assert bpp.break1.chr == '3' + assert bpp.break2.chr == 'X' + assert bpp.break1.start == 50294136 + assert bpp.break2.start == 153063989 + assert bpp.event_type == None + assert bpp.opposing_strands == True + assert bpp.break1.orient == ORIENT.LEFT + assert bpp.break2.orient == ORIENT.LEFT + assert bpp.stranded == False + assert bpp.data['tracking_id'] == 'defuse-1' def test_convert_indel(self): row = { @@ -420,18 +421,18 @@ def test_convert_indel(self): 'cluster_id': 1, } bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.DEFUSE, False) - self.assertEqual(1, len(bpp_list)) - bpp = bpp_list[0] - self.assertEqual('1', bpp.break1.chr) - self.assertEqual('1', bpp.break2.chr) - self.assertEqual(1663681, bpp.break1.start) - self.assertEqual(151732089, bpp.break2.start) - self.assertEqual(None, bpp.event_type) - self.assertEqual(False, bpp.opposing_strands) - self.assertEqual(ORIENT.LEFT, bpp.break1.orient) - self.assertEqual(ORIENT.RIGHT, bpp.break2.orient) - self.assertEqual(False, bpp.stranded) - self.assertEqual('defuse-1', bpp.data['tracking_id']) + assert len(bpp_list) == 1 + bpp = bpp_list[0] + assert bpp.break1.chr == '1' + assert bpp.break2.chr == '1' + assert bpp.break1.start == 1663681 + assert bpp.break2.start == 151732089 + assert bpp.event_type == None + assert bpp.opposing_strands == False + assert bpp.break1.orient == ORIENT.LEFT + assert bpp.break2.orient == ORIENT.RIGHT + assert bpp.stranded == False + assert bpp.data['tracking_id'] == 'defuse-1' def test_convert_inversion(self): row = { @@ -444,21 +445,21 @@ def test_convert_inversion(self): 'cluster_id': 1, } bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.DEFUSE, False) - self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual('1', 
bpp.break1.chr) - self.assertEqual('1', bpp.break2.chr) - self.assertEqual(144898348, bpp.break1.start) - self.assertEqual(235294748, bpp.break2.start) - self.assertEqual(None, bpp.event_type) - self.assertEqual(True, bpp.opposing_strands) - self.assertEqual(ORIENT.LEFT, bpp.break1.orient) - self.assertEqual(ORIENT.LEFT, bpp.break2.orient) - self.assertEqual(False, bpp.stranded) - self.assertEqual('defuse-1', bpp.data['tracking_id']) + assert bpp.break1.chr == '1' + assert bpp.break2.chr == '1' + assert bpp.break1.start == 144898348 + assert bpp.break2.start == 235294748 + assert bpp.event_type == None + assert bpp.opposing_strands == True + assert bpp.break1.orient == ORIENT.LEFT + assert bpp.break2.orient == ORIENT.LEFT + assert bpp.stranded == False + assert bpp.data['tracking_id'] == 'defuse-1' -class TestChimerascan(unittest.TestCase): +class TestChimerascan: def test_convert_pos_pos(self): row = { 'chrom5p': 'chr3', @@ -472,17 +473,17 @@ def test_convert_pos_pos(self): 'chimera_cluster_id': 'CLUSTER30', } bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.CHIMERASCAN, False) - self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual('3', bpp.break1.chr) - self.assertEqual('3', bpp.break2.chr) + assert bpp.break1.chr == '3' + assert bpp.break2.chr == '3' print(bpp) - self.assertEqual(int(row['end5p']), bpp.break1.start) - self.assertEqual(int(row['start3p']), bpp.break2.start) - self.assertEqual(False, bpp.opposing_strands) - self.assertEqual(ORIENT.LEFT, bpp.break1.orient) - self.assertEqual(ORIENT.RIGHT, bpp.break2.orient) - self.assertEqual(False, bpp.stranded) + assert bpp.break1.start == int(row['end5p']) + assert bpp.break2.start == int(row['start3p']) + assert bpp.opposing_strands == False + assert bpp.break1.orient == ORIENT.LEFT + assert bpp.break2.orient == ORIENT.RIGHT + assert bpp.stranded == False def test_convert_pos_neg(self): row = { @@ -497,17 +498,17 @@ def test_convert_pos_neg(self): 
'chimera_cluster_id': 'CLUSTER30', } bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.CHIMERASCAN, False) - self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual('3', bpp.break1.chr) - self.assertEqual('3', bpp.break2.chr) + assert bpp.break1.chr == '3' + assert bpp.break2.chr == '3' print(bpp) - self.assertEqual(int(row['end5p']), bpp.break1.start) - self.assertEqual(int(row['end3p']), bpp.break2.start) - self.assertEqual(True, bpp.opposing_strands) - self.assertEqual(ORIENT.LEFT, bpp.break1.orient) - self.assertEqual(ORIENT.LEFT, bpp.break2.orient) - self.assertEqual(False, bpp.stranded) + assert bpp.break1.start == int(row['end5p']) + assert bpp.break2.start == int(row['end3p']) + assert bpp.opposing_strands == True + assert bpp.break1.orient == ORIENT.LEFT + assert bpp.break2.orient == ORIENT.LEFT + assert bpp.stranded == False def test_convert_neg_pos(self): row = { @@ -522,17 +523,17 @@ def test_convert_neg_pos(self): 'chimera_cluster_id': 'CLUSTER30', } bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.CHIMERASCAN, False) - self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual('3', bpp.break1.chr) - self.assertEqual('3', bpp.break2.chr) + assert bpp.break1.chr == '3' + assert bpp.break2.chr == '3' print(bpp) - self.assertEqual(int(row['start5p']), bpp.break1.start) - self.assertEqual(int(row['start3p']), bpp.break2.start) - self.assertEqual(True, bpp.opposing_strands) - self.assertEqual(ORIENT.RIGHT, bpp.break1.orient) - self.assertEqual(ORIENT.RIGHT, bpp.break2.orient) - self.assertEqual(False, bpp.stranded) + assert bpp.break1.start == int(row['start5p']) + assert bpp.break2.start == int(row['start3p']) + assert bpp.opposing_strands == True + assert bpp.break1.orient == ORIENT.RIGHT + assert bpp.break2.orient == ORIENT.RIGHT + assert bpp.stranded == False def test_convert_neg_neg(self): row = { @@ -547,162 +548,162 @@ def test_convert_neg_neg(self): 
'chimera_cluster_id': 'CLUSTER30', } bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.CHIMERASCAN, False) - self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual('3', bpp.break1.chr) - self.assertEqual('3', bpp.break2.chr) + assert bpp.break1.chr == '3' + assert bpp.break2.chr == '3' print(bpp) - self.assertEqual(int(row['start5p']), bpp.break1.start) - self.assertEqual(int(row['end3p']), bpp.break2.start) - self.assertEqual(False, bpp.opposing_strands) - self.assertEqual(ORIENT.RIGHT, bpp.break1.orient) - self.assertEqual(ORIENT.LEFT, bpp.break2.orient) - self.assertEqual(False, bpp.stranded) + assert bpp.break1.start == int(row['start5p']) + assert bpp.break2.start == int(row['end3p']) + assert bpp.opposing_strands == False + assert bpp.break1.orient == ORIENT.RIGHT + assert bpp.break2.orient == ORIENT.LEFT + assert bpp.stranded == False -class TestPindel(unittest.TestCase): +class TestPindel: def test_convert_deletion(self): row = Mock(chrom='21', pos=9412306, info={'SVTYPE': 'DEL'}, stop=9412400, id=None, alts=[]) bpp_list = _convert_tool_row(_parse_vcf_record(row)[0], SUPPORTED_TOOL.PINDEL, False) - self.assertEqual(1, len(bpp_list)) - bpp = bpp_list[0] - self.assertEqual('21', bpp.break1.chr) - self.assertEqual('21', bpp.break2.chr) - self.assertEqual(SVTYPE.DEL, bpp.event_type) - self.assertEqual(row.pos, bpp.break1.start) - self.assertEqual(row.pos, bpp.break1.end) - self.assertEqual(row.stop, bpp.break2.start) - self.assertEqual(row.stop, bpp.break2.end) - self.assertEqual(ORIENT.LEFT, bpp.break1.orient) - self.assertEqual(STRAND.NS, bpp.break1.strand) - self.assertEqual(ORIENT.RIGHT, bpp.break2.orient) - self.assertEqual(STRAND.NS, bpp.break2.strand) - self.assertEqual(False, bpp.stranded) - self.assertEqual(False, bpp.opposing_strands) + assert len(bpp_list) == 1 + bpp = bpp_list[0] + assert bpp.break1.chr == '21' + assert bpp.break2.chr == '21' + assert bpp.event_type == SVTYPE.DEL + assert bpp.break1.start 
== row.pos + assert bpp.break1.end == row.pos + assert bpp.break2.start == row.stop + assert bpp.break2.end == row.stop + assert bpp.break1.orient == ORIENT.LEFT + assert bpp.break1.strand == STRAND.NS + assert bpp.break2.orient == ORIENT.RIGHT + assert bpp.break2.strand == STRAND.NS + assert bpp.stranded == False + assert bpp.opposing_strands == False def test_convert_insertion(self): row = Mock(chrom='21', pos=9412306, info={'SVTYPE': 'INS'}, stop=9412400, id=None, alts=[]) bpp_list = _convert_tool_row(_parse_vcf_record(row)[0], SUPPORTED_TOOL.PINDEL, False) - self.assertEqual(1, len(bpp_list)) - bpp = bpp_list[0] - self.assertEqual('21', bpp.break1.chr) - self.assertEqual('21', bpp.break2.chr) - self.assertEqual(SVTYPE.INS, bpp.event_type) - self.assertEqual(row.pos, bpp.break1.start) - self.assertEqual(row.pos, bpp.break1.end) - self.assertEqual(row.stop, bpp.break2.start) - self.assertEqual(row.stop, bpp.break2.end) - self.assertEqual(ORIENT.LEFT, bpp.break1.orient) - self.assertEqual(STRAND.NS, bpp.break1.strand) - self.assertEqual(ORIENT.RIGHT, bpp.break2.orient) - self.assertEqual(STRAND.NS, bpp.break2.strand) - self.assertEqual(False, bpp.stranded) - self.assertEqual(False, bpp.opposing_strands) + assert len(bpp_list) == 1 + bpp = bpp_list[0] + assert bpp.break1.chr == '21' + assert bpp.break2.chr == '21' + assert bpp.event_type == SVTYPE.INS + assert bpp.break1.start == row.pos + assert bpp.break1.end == row.pos + assert bpp.break2.start == row.stop + assert bpp.break2.end == row.stop + assert bpp.break1.orient == ORIENT.LEFT + assert bpp.break1.strand == STRAND.NS + assert bpp.break2.orient == ORIENT.RIGHT + assert bpp.break2.strand == STRAND.NS + assert bpp.stranded == False + assert bpp.opposing_strands == False def test_convert_inversion(self): row = Mock(chrom='21', pos=9412306, info={'SVTYPE': 'INV'}, stop=9412400, id=None, alts=[]) bpp_list = _convert_tool_row(_parse_vcf_record(row)[0], SUPPORTED_TOOL.PINDEL, False) - self.assertEqual(2, 
len(bpp_list)) + assert len(bpp_list) == 2 bpp = sorted(bpp_list, key=lambda x: x.break1)[0] - self.assertEqual('21', bpp.break1.chr) - self.assertEqual('21', bpp.break2.chr) - self.assertEqual(SVTYPE.INV, bpp.event_type) - self.assertEqual(row.pos, bpp.break1.start) - self.assertEqual(row.pos, bpp.break1.end) - self.assertEqual(row.stop, bpp.break2.start) - self.assertEqual(row.stop, bpp.break2.end) - self.assertEqual(ORIENT.LEFT, bpp.break1.orient) - self.assertEqual(STRAND.NS, bpp.break1.strand) - self.assertEqual(ORIENT.LEFT, bpp.break2.orient) - self.assertEqual(STRAND.NS, bpp.break2.strand) - self.assertEqual(False, bpp.stranded) - self.assertEqual(True, bpp.opposing_strands) - - -class TestParseBndAlt(unittest.TestCase): + assert bpp.break1.chr == '21' + assert bpp.break2.chr == '21' + assert bpp.event_type == SVTYPE.INV + assert bpp.break1.start == row.pos + assert bpp.break1.end == row.pos + assert bpp.break2.start == row.stop + assert bpp.break2.end == row.stop + assert bpp.break1.orient == ORIENT.LEFT + assert bpp.break1.strand == STRAND.NS + assert bpp.break2.orient == ORIENT.LEFT + assert bpp.break2.strand == STRAND.NS + assert bpp.stranded == False + assert bpp.opposing_strands == True + + +class TestParseBndAlt: def test_right(self): # '[4:190898243[AGGT' chrom, pos, orient1, orient2, ref, seq = _parse_bnd_alt('[4:190898243[A') - self.assertEqual('4', chrom) - self.assertEqual(190898243, pos) - self.assertEqual(ORIENT.RIGHT, orient1) - self.assertEqual(ORIENT.RIGHT, orient2) - self.assertEqual('', seq) - self.assertEqual('A', ref) + assert chrom == '4' + assert pos == 190898243 + assert orient1 == ORIENT.RIGHT + assert orient2 == ORIENT.RIGHT + assert seq == '' + assert ref == 'A' def test_right_untemp_seq(self): chrom, pos, orient1, orient2, ref, seq = _parse_bnd_alt('[5:190898243[AGGT') - self.assertEqual('5', chrom) - self.assertEqual(190898243, pos) - self.assertEqual(ORIENT.RIGHT, orient1) - self.assertEqual(ORIENT.RIGHT, orient2) - 
self.assertEqual('AGG', seq) - self.assertEqual('T', ref) + assert chrom == '5' + assert pos == 190898243 + assert orient1 == ORIENT.RIGHT + assert orient2 == ORIENT.RIGHT + assert seq == 'AGG' + assert ref == 'T' chrom, pos, orient1, orient2, ref, seq = _parse_bnd_alt('CAGTNNNCA[5:190898243[') - self.assertEqual('5', chrom) - self.assertEqual(190898243, pos) - self.assertEqual(ORIENT.LEFT, orient1) - self.assertEqual(ORIENT.RIGHT, orient2) - self.assertEqual('AGTNNNCA', seq) - self.assertEqual('C', ref) + assert chrom == '5' + assert pos == 190898243 + assert orient1 == ORIENT.LEFT + assert orient2 == ORIENT.RIGHT + assert seq == 'AGTNNNCA' + assert ref == 'C' chrom, pos, orient1, orient2, ref, seq = _parse_bnd_alt('CTG[21:47575965[') - self.assertEqual('21', chrom) - self.assertEqual(47575965, pos) - self.assertEqual(ORIENT.LEFT, orient1) - self.assertEqual(ORIENT.RIGHT, orient2) - self.assertEqual('TG', seq) - self.assertEqual('C', ref) + assert chrom == '21' + assert pos == 47575965 + assert orient1 == ORIENT.LEFT + assert orient2 == ORIENT.RIGHT + assert seq == 'TG' + assert ref == 'C' def test_left(self): chrom, pos, orient1, orient2, ref, seq = _parse_bnd_alt('G]10:198982]') - self.assertEqual('10', chrom) - self.assertEqual(198982, pos) - self.assertEqual(ORIENT.LEFT, orient1) - self.assertEqual(ORIENT.LEFT, orient2) - self.assertEqual('', seq) - self.assertEqual('G', ref) + assert chrom == '10' + assert pos == 198982 + assert orient1 == ORIENT.LEFT + assert orient2 == ORIENT.LEFT + assert seq == '' + assert ref == 'G' chrom, pos, orient1, orient2, ref, seq = _parse_bnd_alt(']10:198982]G') - self.assertEqual('10', chrom) - self.assertEqual(198982, pos) - self.assertEqual(ORIENT.LEFT, orient2) - self.assertEqual('', seq) - self.assertEqual('G', ref) + assert chrom == '10' + assert pos == 198982 + assert orient2 == ORIENT.LEFT + assert seq == '' + assert ref == 'G' def test_alternate_chrom(self): chrom, pos, orient1, orient2, ref, seq = 
_parse_bnd_alt('G]GL000.01:198982]') - self.assertEqual('GL000.01', chrom) - self.assertEqual(198982, pos) - self.assertEqual(ORIENT.LEFT, orient2) - self.assertEqual('', seq) - self.assertEqual('G', ref) + assert chrom == 'GL000.01' + assert pos == 198982 + assert orient2 == ORIENT.LEFT + assert seq == '' + assert ref == 'G' def test_left_untemp_seq(self): chrom, pos, orient1, orient2, ref, seq = _parse_bnd_alt(']11:123456]AGTNNNCAT') - self.assertEqual('11', chrom) - self.assertEqual(123456, pos) - self.assertEqual(ORIENT.LEFT, orient2) - self.assertEqual('AGTNNNCA', seq) - self.assertEqual('T', ref) + assert chrom == '11' + assert pos == 123456 + assert orient2 == ORIENT.LEFT + assert seq == 'AGTNNNCA' + assert ref == 'T' chrom, pos, orient1, orient2, ref, seq = _parse_bnd_alt(']8:1682443]TGC') - self.assertEqual('8', chrom) - self.assertEqual(1682443, pos) - self.assertEqual(ORIENT.LEFT, orient2) - self.assertEqual('TG', seq) - self.assertEqual('C', ref) + assert chrom == '8' + assert pos == 1682443 + assert orient2 == ORIENT.LEFT + assert seq == 'TG' + assert ref == 'C' chrom, pos, orient1, orient2, ref, seq = _parse_bnd_alt('AAGTG]11:66289601]') - self.assertEqual('11', chrom) - self.assertEqual(66289601, pos) - self.assertEqual(ORIENT.LEFT, orient2) - self.assertEqual('AGTG', seq) - self.assertEqual('A', ref) + assert chrom == '11' + assert pos == 66289601 + assert orient2 == ORIENT.LEFT + assert seq == 'AGTG' + assert ref == 'A' -class TestBreakDancer(unittest.TestCase): +class TestBreakDancer: def test_itx(self): row = { 'Chr1': '1', @@ -717,15 +718,15 @@ def test_itx(self): 'num_Reads': '43', } bpps = _convert_tool_row(row, SUPPORTED_TOOL.BREAKDANCER, False, True) - self.assertEqual(1, len(bpps)) - self.assertEqual(SVTYPE.DUP, bpps[0].event_type) - self.assertEqual(10001, bpps[0].break1.start) - self.assertEqual(10001, bpps[0].break1.end) - self.assertEqual(ORIENT.RIGHT, bpps[0].break1.orient) - self.assertEqual(10546, bpps[0].break2.start) - 
self.assertEqual(10546, bpps[0].break2.end) - self.assertEqual(ORIENT.LEFT, bpps[0].break2.orient) - self.assertEqual(False, bpps[0].opposing_strands) + assert len(bpps) == 1 + assert bpps[0].event_type == SVTYPE.DUP + assert bpps[0].break1.start == 10001 + assert bpps[0].break1.end == 10001 + assert bpps[0].break1.orient == ORIENT.RIGHT + assert bpps[0].break2.start == 10546 + assert bpps[0].break2.end == 10546 + assert bpps[0].break2.orient == ORIENT.LEFT + assert bpps[0].opposing_strands == False def test_deletion(self): row = { @@ -741,15 +742,15 @@ def test_deletion(self): 'num_Reads': '67', } bpps = _convert_tool_row(row, SUPPORTED_TOOL.BREAKDANCER, False, True) - self.assertEqual(1, len(bpps)) - self.assertEqual(SVTYPE.DEL, bpps[0].event_type) - self.assertEqual(869445, bpps[0].break1.start) - self.assertEqual(869445, bpps[0].break1.end) - self.assertEqual(ORIENT.LEFT, bpps[0].break1.orient) - self.assertEqual(870225, bpps[0].break2.start) - self.assertEqual(870225, bpps[0].break2.end) - self.assertEqual(ORIENT.RIGHT, bpps[0].break2.orient) - self.assertEqual(False, bpps[0].opposing_strands) + assert len(bpps) == 1 + assert bpps[0].event_type == SVTYPE.DEL + assert bpps[0].break1.start == 869445 + assert bpps[0].break1.end == 869445 + assert bpps[0].break1.orient == ORIENT.LEFT + assert bpps[0].break2.start == 870225 + assert bpps[0].break2.end == 870225 + assert bpps[0].break2.orient == ORIENT.RIGHT + assert bpps[0].opposing_strands == False def test_inversion(self): row = { @@ -765,24 +766,24 @@ def test_inversion(self): 'num_Reads': '2', } bpps = _convert_tool_row(row, SUPPORTED_TOOL.BREAKDANCER, False, True) - self.assertEqual(2, len(bpps)) - self.assertEqual(SVTYPE.INV, bpps[0].event_type) - self.assertEqual(13143396, bpps[0].break1.start) - self.assertEqual(13143396, bpps[0].break1.end) - self.assertEqual(ORIENT.LEFT, bpps[0].break1.orient) - self.assertEqual(13218683, bpps[0].break2.start) - self.assertEqual(13218683, bpps[0].break2.end) - 
self.assertEqual(ORIENT.LEFT, bpps[0].break2.orient) - self.assertEqual(True, bpps[0].opposing_strands) - - self.assertEqual(SVTYPE.INV, bpps[1].event_type) - self.assertEqual(13143396, bpps[1].break1.start) - self.assertEqual(13143396, bpps[1].break1.end) - self.assertEqual(ORIENT.RIGHT, bpps[1].break1.orient) - self.assertEqual(13218683, bpps[1].break2.start) - self.assertEqual(13218683, bpps[1].break2.end) - self.assertEqual(ORIENT.RIGHT, bpps[1].break2.orient) - self.assertEqual(True, bpps[1].opposing_strands) + assert len(bpps) == 2 + assert bpps[0].event_type == SVTYPE.INV + assert bpps[0].break1.start == 13143396 + assert bpps[0].break1.end == 13143396 + assert bpps[0].break1.orient == ORIENT.LEFT + assert bpps[0].break2.start == 13218683 + assert bpps[0].break2.end == 13218683 + assert bpps[0].break2.orient == ORIENT.LEFT + assert bpps[0].opposing_strands == True + + assert bpps[1].event_type == SVTYPE.INV + assert bpps[1].break1.start == 13143396 + assert bpps[1].break1.end == 13143396 + assert bpps[1].break1.orient == ORIENT.RIGHT + assert bpps[1].break2.start == 13218683 + assert bpps[1].break2.end == 13218683 + assert bpps[1].break2.orient == ORIENT.RIGHT + assert bpps[1].opposing_strands == True def test_insertion(self): row = { @@ -798,30 +799,30 @@ def test_insertion(self): 'num_Reads': '3', } bpps = _convert_tool_row(row, SUPPORTED_TOOL.BREAKDANCER, False, True) - self.assertEqual(1, len(bpps)) - self.assertEqual(SVTYPE.INS, bpps[0].event_type) - self.assertEqual(20216146, bpps[0].break1.start) - self.assertEqual(20216146, bpps[0].break1.end) - self.assertEqual(ORIENT.LEFT, bpps[0].break1.orient) - self.assertEqual(20218060, bpps[0].break2.start) - self.assertEqual(20218060, bpps[0].break2.end) - self.assertEqual(ORIENT.RIGHT, bpps[0].break2.orient) - self.assertEqual(False, bpps[0].opposing_strands) - - -class TestStrelka(unittest.TestCase): + assert len(bpps) == 1 + assert bpps[0].event_type == SVTYPE.INS + assert bpps[0].break1.start == 20216146 
+ assert bpps[0].break1.end == 20216146 + assert bpps[0].break1.orient == ORIENT.LEFT + assert bpps[0].break2.start == 20218060 + assert bpps[0].break2.end == 20218060 + assert bpps[0].break2.orient == ORIENT.RIGHT + assert bpps[0].opposing_strands == False + + +class TestStrelka: def testInsertion(self): event = Mock( chrom='1', pos=724986, id=None, info={}, ref='G', stop=724986, alts=('GGAATT',) ) bpp_list = _convert_tool_row(_parse_vcf_record(event)[0], SUPPORTED_TOOL.STRELKA, False) - self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual(724986, bpp.break1.start) - self.assertEqual(724986, bpp.break1.end) - self.assertEqual(724986, bpp.break2.start) - self.assertEqual(724986, bpp.break2.end) - self.assertEqual(SVTYPE.INS, bpp.event_type) + assert bpp.break1.start == 724986 + assert bpp.break1.end == 724986 + assert bpp.break2.start == 724986 + assert bpp.break2.end == 724986 + assert bpp.event_type == SVTYPE.INS def testDeletion(self): event = Mock( @@ -834,13 +835,13 @@ def testDeletion(self): alts=('G',), ) bpp_list = _convert_tool_row(_parse_vcf_record(event)[0], SUPPORTED_TOOL.STRELKA, False) - self.assertEqual(1, len(bpp_list)) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual(1265353, bpp.break1.start) - self.assertEqual(1265353, bpp.break1.end) - self.assertEqual(1265366, bpp.break2.start) - self.assertEqual(1265366, bpp.break2.end) - self.assertEqual(SVTYPE.DEL, bpp.event_type) + assert bpp.break1.start == 1265353 + assert bpp.break1.end == 1265353 + assert bpp.break2.start == 1265366 + assert bpp.break2.end == 1265366 + assert bpp.event_type == SVTYPE.DEL def testMalformated(self): event = Mock( @@ -852,68 +853,82 @@ def testMalformated(self): alts=('CTTTTAAATGTAACATGACATAATATATTTCCTAAATAATTTAAAATAATC.',), stop=53678660, ) - with self.assertRaises(NotImplementedError): + with pytest.raises(NotImplementedError): _convert_tool_row(_parse_vcf_record(event)[0], SUPPORTED_TOOL.STRELKA, False) 
-class TestVCF(unittest.TestCase): - def setUp(self): - self.tra = Mock( - chrom='2', - pos=21673582, - id=None, - info={'SVTYPE': 'TRA', 'CT': '5to5', 'CHR2': '3'}, - stop=58921502, - alts=[], - ) +@pytest.fixture +def vcf_translocation(): + return Mock( + chrom='2', + pos=21673582, + id=None, + info={'SVTYPE': 'TRA', 'CT': '5to5', 'CHR2': '3'}, + stop=58921502, + alts=[], + ) - def test_no_ci(self): - bpp_list = _convert_tool_row(_parse_vcf_record(self.tra)[0], SUPPORTED_TOOL.VCF, False) - self.assertEqual(1, len(bpp_list)) + +class TestVCF: + def test_no_ci(self, vcf_translocation): + bpp_list = _convert_tool_row( + _parse_vcf_record(vcf_translocation)[0], SUPPORTED_TOOL.VCF, False + ) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual(21673582, bpp.break1.start) - self.assertEqual(21673582, bpp.break1.end) - self.assertEqual(58921502, bpp.break2.start) - self.assertEqual(58921502, bpp.break2.end) + assert bpp.break1.start == 21673582 + assert bpp.break1.end == 21673582 + assert bpp.break2.start == 58921502 + assert bpp.break2.end == 58921502 - def test_ci(self): - self.tra.info.update({'CIEND': [-700, 700], 'CIPOS': [-700, 700]}) - bpp_list = _convert_tool_row(_parse_vcf_record(self.tra)[0], SUPPORTED_TOOL.VCF, False) - self.assertEqual(1, len(bpp_list)) + def test_ci(self, vcf_translocation): + vcf_translocation.info.update({'CIEND': [-700, 700], 'CIPOS': [-700, 700]}) + bpp_list = _convert_tool_row( + _parse_vcf_record(vcf_translocation)[0], SUPPORTED_TOOL.VCF, False + ) + assert len(bpp_list) == 1 bpp = bpp_list[0] print(bpp) - self.assertEqual(21673582 - 700, bpp.break1.start) - self.assertEqual(21673582 + 700, bpp.break1.end) - self.assertEqual(58921502 - 700, bpp.break2.start) - self.assertEqual(58921502 + 700, bpp.break2.end) + assert bpp.break1.start == 21673582 - 700 + assert bpp.break1.end == 21673582 + 700 + assert bpp.break2.start == 58921502 - 700 + assert bpp.break2.end == 58921502 + 700 - def test_precise_flag_ignores_ci(self): - 
self.tra.info.update({'CIEND': [-700, 700], 'CIPOS': [-700, 700], 'PRECISE': True}) - bpp_list = _convert_tool_row(_parse_vcf_record(self.tra)[0], SUPPORTED_TOOL.VCF, False) - self.assertEqual(1, len(bpp_list)) + def test_precise_flag_ignores_ci(self, vcf_translocation): + vcf_translocation.info.update({'CIEND': [-700, 700], 'CIPOS': [-700, 700], 'PRECISE': True}) + bpp_list = _convert_tool_row( + _parse_vcf_record(vcf_translocation)[0], SUPPORTED_TOOL.VCF, False + ) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual(21673582, bpp.break1.start) - self.assertEqual(21673582, bpp.break1.end) - self.assertEqual(58921502, bpp.break2.start) - self.assertEqual(58921502, bpp.break2.end) + assert bpp.break1.start == 21673582 + assert bpp.break1.end == 21673582 + assert bpp.break2.start == 58921502 + assert bpp.break2.end == 58921502 - def test_no_id(self): - bpp_list = _convert_tool_row(_parse_vcf_record(self.tra)[0], SUPPORTED_TOOL.VCF, False) - self.assertEqual(1, len(bpp_list)) + def test_no_id(self, vcf_translocation): + bpp_list = _convert_tool_row( + _parse_vcf_record(vcf_translocation)[0], SUPPORTED_TOOL.VCF, False + ) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertTrue(bpp.data[COLUMNS.tracking_id]) + assert bpp.data[COLUMNS.tracking_id] - def test_N_id(self): - self.tra.id = 'N' - bpp_list = _convert_tool_row(_parse_vcf_record(self.tra)[0], SUPPORTED_TOOL.VCF, False) - self.assertEqual(1, len(bpp_list)) + def test_N_id(self, vcf_translocation): + vcf_translocation.id = 'N' + bpp_list = _convert_tool_row( + _parse_vcf_record(vcf_translocation)[0], SUPPORTED_TOOL.VCF, False + ) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertTrue(bpp.data[COLUMNS.tracking_id]) - self.assertNotEqual('N', bpp.data[COLUMNS.tracking_id]) + assert bpp.data[COLUMNS.tracking_id] + assert bpp.data[COLUMNS.tracking_id] != 'N' - def test_id_given(self): - self.tra.id = 'thing-1' - bpp_list = _convert_tool_row(_parse_vcf_record(self.tra)[0], 
SUPPORTED_TOOL.VCF, False) - self.assertEqual(1, len(bpp_list)) + def test_id_given(self, vcf_translocation): + vcf_translocation.id = 'thing-1' + bpp_list = _convert_tool_row( + _parse_vcf_record(vcf_translocation)[0], SUPPORTED_TOOL.VCF, False + ) + assert len(bpp_list) == 1 bpp = bpp_list[0] - self.assertEqual('vcf-thing-1', bpp.data[COLUMNS.tracking_id]) + assert bpp.data[COLUMNS.tracking_id] == 'vcf-thing-1' diff --git a/tests/unit/test_util.py b/tests/unit/test_util.py index 62b6910e..e163349e 100644 --- a/tests/unit/test_util.py +++ b/tests/unit/test_util.py @@ -1,6 +1,6 @@ import os -import unittest +import pytest from mavis.constants import COLUMNS, ORIENT, STRAND from mavis.error import NotSpecifiedError from mavis.util import ( @@ -14,351 +14,360 @@ from .mock import Mock -class MockFileHandle(Mock): - def __init__(self, lines): - Mock.__init__(self, lines=lines) - - def readlines(self): - return self.lines - - -class TestGetConnectedComponents(unittest.TestCase): +class TestGetConnectedComponents: def test_no_nodes(self): - self.assertEqual([], get_connected_components({})) + assert get_connected_components({}) == [] def test_no_connections(self): graph = {1: {}, 2: {}, 3: {}} components = get_connected_components(graph) - self.assertEqual(3, len(components)) + assert len(components) == 3 def test_fully_connected(self): graph = {1: {2, 3, 1}, 2: {1, 2, 2}, 3: {3, 2}} components = get_connected_components(graph) - self.assertEqual(1, len(components)) - self.assertEqual([{1, 2, 3}], components) + assert len(components) == 1 + assert components == [{1, 2, 3}] def test_multiple_components(self): graph = {1: {2}, 2: {3}, 3: {4}, 6: {7, 8}} components = get_connected_components(graph) - self.assertEqual(2, len(components)) - self.assertEqual({1, 2, 3, 4}, components[0]) - self.assertEqual({6, 7, 8}, components[1]) + assert len(components) == 2 + assert components[0] == {1, 2, 3, 4} + assert components[1] == {6, 7, 8} -class TestCast(unittest.TestCase): +class 
TestCast: def test_float(self): - self.assertEqual(type(1.0), type(cast('1', float))) - self.assertNotEqual(type(1.0), type(cast('1', int))) + assert type(cast('1', float)) == type(1.0) + assert type(cast('1', int)) != type(1.0) def test_boolean(self): - self.assertEqual(type(False), type(cast('f', bool))) - self.assertEqual(type(False), type(cast('false', bool))) - self.assertFalse(cast('f', bool)) - self.assertFalse(cast('false', bool)) - self.assertFalse(cast('0', bool)) - self.assertFalse(cast('F', bool)) - - -class TestGetEnvVariable(unittest.TestCase): - def setUp(self): - if 'MAVIS_TEST_ENV' in os.environ: - del os.environ['MAVIS_TEST_ENV'] - - def test_not_set(self): - self.assertEqual(1, get_env_variable('test_env', 1)) - - def test_needs_casting(self): - os.environ['MAVIS_TEST_ENV'] = '15' - self.assertEqual(15, get_env_variable('test_env', 1)) - - -class TestReadBreakpointPairsFromFile(unittest.TestCase): - def build_filehandle(self, row): - header = [c for c in row] - line = [row[c] for c in header] - lines = ['\t'.join(header), '\t'.join([str(v) for v in line])] - return MockFileHandle(lines) - - def test_break1_strand_ns(self): - fh = self.build_filehandle( - { - COLUMNS.break1_chromosome: '1', - COLUMNS.break1_position_start: 1, - COLUMNS.break1_position_end: 1, - COLUMNS.break1_strand: STRAND.NS, - COLUMNS.break1_orientation: ORIENT.LEFT, - COLUMNS.break2_chromosome: '1', - COLUMNS.break2_position_start: 10, - COLUMNS.break2_position_end: 10, - COLUMNS.break2_strand: STRAND.POS, - COLUMNS.break2_orientation: ORIENT.RIGHT, - COLUMNS.stranded: True, - COLUMNS.opposing_strands: False, - } + assert type(cast('f', bool)) == type(False) + assert type(cast('false', bool)) == type(False) + assert not cast('f', bool) + assert not cast('false', bool) + assert not cast('0', bool) + assert not cast('F', bool) + + +def mock_file_content(row): + header = [c for c in row] + line = [row[c] for c in header] + lines = ['\t'.join(header), '\t'.join([str(v) for v in 
line])] + return '\n'.join(lines) + + +class TestReadBreakpointPairsFromFile: + def test_break1_strand_ns(self, tmp_path): + input_file = tmp_path / "inputs.tsv" + + input_file.write_text( + mock_file_content( + { + COLUMNS.break1_chromosome: '1', + COLUMNS.break1_position_start: 1, + COLUMNS.break1_position_end: 1, + COLUMNS.break1_strand: STRAND.NS, + COLUMNS.break1_orientation: ORIENT.LEFT, + COLUMNS.break2_chromosome: '1', + COLUMNS.break2_position_start: 10, + COLUMNS.break2_position_end: 10, + COLUMNS.break2_strand: STRAND.POS, + COLUMNS.break2_orientation: ORIENT.RIGHT, + COLUMNS.stranded: True, + COLUMNS.opposing_strands: False, + } + ) ) - with self.assertRaises(NotSpecifiedError): - bpps = read_bpp_from_input_file(fh, expand_strand=False, expand_orient=True) + with pytest.raises(NotSpecifiedError): + bpps = read_bpp_from_input_file(input_file, expand_strand=False, expand_orient=True) for b in bpps: print(b) - def test_stranded_no_expand_error(self): - fh = self.build_filehandle( - { - COLUMNS.break1_chromosome: '1', - COLUMNS.break1_position_start: 1, - COLUMNS.break1_position_end: 1, - COLUMNS.break1_strand: STRAND.NS, - COLUMNS.break1_orientation: ORIENT.LEFT, - COLUMNS.break2_chromosome: '1', - COLUMNS.break2_position_start: 10, - COLUMNS.break2_position_end: 10, - COLUMNS.break2_strand: STRAND.POS, - COLUMNS.break2_orientation: ORIENT.RIGHT, - COLUMNS.stranded: True, - COLUMNS.opposing_strands: False, - } + def test_stranded_no_expand_error(self, tmp_path): + input_file = tmp_path / "inputs.tsv" + input_file.write_text( + mock_file_content( + { + COLUMNS.break1_chromosome: '1', + COLUMNS.break1_position_start: 1, + COLUMNS.break1_position_end: 1, + COLUMNS.break1_strand: STRAND.NS, + COLUMNS.break1_orientation: ORIENT.LEFT, + COLUMNS.break2_chromosome: '1', + COLUMNS.break2_position_start: 10, + COLUMNS.break2_position_end: 10, + COLUMNS.break2_strand: STRAND.POS, + COLUMNS.break2_orientation: ORIENT.RIGHT, + COLUMNS.stranded: True, + 
COLUMNS.opposing_strands: False, + } + ) ) - bpps = read_bpp_from_input_file(fh, expand_strand=True, expand_orient=True) - self.assertEqual(1, len(bpps)) - self.assertEqual(STRAND.POS, bpps[0].break1.strand) - self.assertEqual(STRAND.POS, bpps[0].break2.strand) - - def test_break2_strand_ns(self): - fh = self.build_filehandle( - { - COLUMNS.break1_chromosome: '1', - COLUMNS.break1_position_start: 1, - COLUMNS.break1_position_end: 1, - COLUMNS.break1_strand: STRAND.POS, - COLUMNS.break1_orientation: ORIENT.LEFT, - COLUMNS.break2_chromosome: '1', - COLUMNS.break2_position_start: 10, - COLUMNS.break2_position_end: 10, - COLUMNS.break2_strand: STRAND.NS, - COLUMNS.break2_orientation: ORIENT.RIGHT, - COLUMNS.stranded: True, - COLUMNS.opposing_strands: False, - } + bpps = read_bpp_from_input_file(input_file, expand_strand=True, expand_orient=True) + assert len(bpps) == 1 + assert bpps[0].break1.strand == STRAND.POS + assert bpps[0].break2.strand == STRAND.POS + + def test_break2_strand_ns(self, tmp_path): + input_file = tmp_path / "inputs.tsv" + input_file.write_text( + mock_file_content( + { + COLUMNS.break1_chromosome: '1', + COLUMNS.break1_position_start: 1, + COLUMNS.break1_position_end: 1, + COLUMNS.break1_strand: STRAND.POS, + COLUMNS.break1_orientation: ORIENT.LEFT, + COLUMNS.break2_chromosome: '1', + COLUMNS.break2_position_start: 10, + COLUMNS.break2_position_end: 10, + COLUMNS.break2_strand: STRAND.NS, + COLUMNS.break2_orientation: ORIENT.RIGHT, + COLUMNS.stranded: True, + COLUMNS.opposing_strands: False, + } + ) ) - with self.assertRaises(NotSpecifiedError) as err: + with pytest.raises(NotSpecifiedError) as err: print(err) - bpps = read_bpp_from_input_file(fh, expand_strand=False, expand_orient=False) - - bpps = read_bpp_from_input_file(fh, expand_strand=True, expand_orient=True) - self.assertEqual(1, len(bpps)) - self.assertEqual(STRAND.POS, bpps[0].break2.strand) - - def test_stranded_expand_strands_and_orient(self): - fh = self.build_filehandle( - { - 
COLUMNS.break1_chromosome: '1', - COLUMNS.break1_position_start: 1, - COLUMNS.break1_position_end: 1, - COLUMNS.break1_strand: STRAND.NS, - COLUMNS.break1_orientation: ORIENT.LEFT, - COLUMNS.break2_chromosome: '1', - COLUMNS.break2_position_start: 10, - COLUMNS.break2_position_end: 10, - COLUMNS.break2_strand: STRAND.NS, - COLUMNS.break2_orientation: ORIENT.RIGHT, - COLUMNS.stranded: True, - COLUMNS.opposing_strands: False, - } + bpps = read_bpp_from_input_file(input_file, expand_strand=False, expand_orient=False) + + bpps = read_bpp_from_input_file(input_file, expand_strand=True, expand_orient=True) + assert len(bpps) == 1 + assert bpps[0].break2.strand == STRAND.POS + + def test_stranded_expand_strands_and_orient(self, tmp_path): + input_file = tmp_path / "inputs.tsv" + input_file.write_text( + mock_file_content( + { + COLUMNS.break1_chromosome: '1', + COLUMNS.break1_position_start: 1, + COLUMNS.break1_position_end: 1, + COLUMNS.break1_strand: STRAND.NS, + COLUMNS.break1_orientation: ORIENT.LEFT, + COLUMNS.break2_chromosome: '1', + COLUMNS.break2_position_start: 10, + COLUMNS.break2_position_end: 10, + COLUMNS.break2_strand: STRAND.NS, + COLUMNS.break2_orientation: ORIENT.RIGHT, + COLUMNS.stranded: True, + COLUMNS.opposing_strands: False, + } + ) ) - bpps = read_bpp_from_input_file(fh, expand_strand=True, expand_orient=False) - self.assertEqual(2, len(bpps)) - self.assertEqual(STRAND.POS, bpps[0].break1.strand) - self.assertEqual(STRAND.POS, bpps[0].break2.strand) - self.assertEqual(STRAND.NEG, bpps[1].break1.strand) - self.assertEqual(STRAND.NEG, bpps[1].break2.strand) - - def test_expand_strands_and_orient(self): - fh = self.build_filehandle( - { - COLUMNS.break1_chromosome: '1', - COLUMNS.break1_position_start: 1, - COLUMNS.break1_position_end: 1, - COLUMNS.break1_strand: STRAND.NS, - COLUMNS.break1_orientation: ORIENT.LEFT, - COLUMNS.break2_chromosome: '1', - COLUMNS.break2_position_start: 10, - COLUMNS.break2_position_end: 10, - COLUMNS.break2_strand: 
STRAND.NS, - COLUMNS.break2_orientation: ORIENT.RIGHT, - COLUMNS.stranded: False, - COLUMNS.opposing_strands: False, - } + bpps = read_bpp_from_input_file(input_file, expand_strand=True, expand_orient=False) + assert len(bpps) == 2 + assert bpps[0].break1.strand == STRAND.POS + assert bpps[0].break2.strand == STRAND.POS + assert bpps[1].break1.strand == STRAND.NEG + assert bpps[1].break2.strand == STRAND.NEG + + def test_expand_strands_and_orient(self, tmp_path): + input_file = tmp_path / "inputs.tsv" + input_file.write_text( + mock_file_content( + { + COLUMNS.break1_chromosome: '1', + COLUMNS.break1_position_start: 1, + COLUMNS.break1_position_end: 1, + COLUMNS.break1_strand: STRAND.NS, + COLUMNS.break1_orientation: ORIENT.LEFT, + COLUMNS.break2_chromosome: '1', + COLUMNS.break2_position_start: 10, + COLUMNS.break2_position_end: 10, + COLUMNS.break2_strand: STRAND.NS, + COLUMNS.break2_orientation: ORIENT.RIGHT, + COLUMNS.stranded: False, + COLUMNS.opposing_strands: False, + } + ) ) - bpps = read_bpp_from_input_file(fh, expand_strand=True, expand_orient=False) - self.assertEqual(1, len(bpps)) - self.assertEqual(STRAND.NS, bpps[0].break1.strand) - self.assertEqual(STRAND.NS, bpps[0].break2.strand) - - def test_stranded_expand_strands_not_orient(self): - fh = self.build_filehandle( - { - COLUMNS.break1_chromosome: '1', - COLUMNS.break1_position_start: 1, - COLUMNS.break1_position_end: 1, - COLUMNS.break1_strand: STRAND.NS, - COLUMNS.break1_orientation: ORIENT.LEFT, - COLUMNS.break2_chromosome: '1', - COLUMNS.break2_position_start: 10, - COLUMNS.break2_position_end: 10, - COLUMNS.break2_strand: STRAND.NS, - COLUMNS.break2_orientation: ORIENT.RIGHT, - COLUMNS.stranded: True, - COLUMNS.opposing_strands: False, - } + bpps = read_bpp_from_input_file(input_file, expand_strand=True, expand_orient=False) + assert len(bpps) == 1 + assert bpps[0].break1.strand == STRAND.NS + assert bpps[0].break2.strand == STRAND.NS + + def test_stranded_expand_strands_not_orient(self, 
tmp_path): + input_file = tmp_path / "inputs.tsv" + input_file.write_text( + mock_file_content( + { + COLUMNS.break1_chromosome: '1', + COLUMNS.break1_position_start: 1, + COLUMNS.break1_position_end: 1, + COLUMNS.break1_strand: STRAND.NS, + COLUMNS.break1_orientation: ORIENT.LEFT, + COLUMNS.break2_chromosome: '1', + COLUMNS.break2_position_start: 10, + COLUMNS.break2_position_end: 10, + COLUMNS.break2_strand: STRAND.NS, + COLUMNS.break2_orientation: ORIENT.RIGHT, + COLUMNS.stranded: True, + COLUMNS.opposing_strands: False, + } + ) ) - bpps = read_bpp_from_input_file(fh, expand_strand=True, expand_orient=True) - self.assertEqual(2, len(bpps)) - self.assertEqual(STRAND.POS, bpps[0].break1.strand) - self.assertEqual(STRAND.POS, bpps[0].break2.strand) - self.assertEqual(STRAND.NEG, bpps[1].break1.strand) - self.assertEqual(STRAND.NEG, bpps[1].break2.strand) - - def test_expand_orient_not_strand(self): - fh = self.build_filehandle( - { - COLUMNS.break1_chromosome: '1', - COLUMNS.break1_position_start: 1, - COLUMNS.break1_position_end: 1, - COLUMNS.break1_strand: STRAND.NS, - COLUMNS.break1_orientation: ORIENT.LEFT, - COLUMNS.break2_chromosome: '1', - COLUMNS.break2_position_start: 10, - COLUMNS.break2_position_end: 10, - COLUMNS.break2_strand: STRAND.NS, - COLUMNS.break2_orientation: ORIENT.RIGHT, - COLUMNS.stranded: False, - COLUMNS.opposing_strands: False, - } + bpps = read_bpp_from_input_file(input_file, expand_strand=True, expand_orient=True) + assert len(bpps) == 2 + assert bpps[0].break1.strand == STRAND.POS + assert bpps[0].break2.strand == STRAND.POS + assert bpps[1].break1.strand == STRAND.NEG + assert bpps[1].break2.strand == STRAND.NEG + + def test_expand_orient_not_strand(self, tmp_path): + input_file = tmp_path / "inputs.tsv" + input_file.write_text( + mock_file_content( + { + COLUMNS.break1_chromosome: '1', + COLUMNS.break1_position_start: 1, + COLUMNS.break1_position_end: 1, + COLUMNS.break1_strand: STRAND.NS, + COLUMNS.break1_orientation: ORIENT.LEFT, + 
COLUMNS.break2_chromosome: '1', + COLUMNS.break2_position_start: 10, + COLUMNS.break2_position_end: 10, + COLUMNS.break2_strand: STRAND.NS, + COLUMNS.break2_orientation: ORIENT.RIGHT, + COLUMNS.stranded: False, + COLUMNS.opposing_strands: False, + } + ) ) - bpps = read_bpp_from_input_file(fh, expand_strand=False, expand_orient=True) - self.assertEqual(1, len(bpps)) - self.assertEqual(STRAND.NS, bpps[0].break1.strand) - self.assertEqual(STRAND.NS, bpps[0].break2.strand) - - def test_break1_orient_ns(self): - fh = self.build_filehandle( - { - COLUMNS.break1_chromosome: '1', - COLUMNS.break1_position_start: 1, - COLUMNS.break1_position_end: 1, - COLUMNS.break1_strand: STRAND.POS, - COLUMNS.break1_orientation: ORIENT.NS, - COLUMNS.break2_chromosome: '1', - COLUMNS.break2_position_start: 10, - COLUMNS.break2_position_end: 10, - COLUMNS.break2_strand: STRAND.POS, - COLUMNS.break2_orientation: ORIENT.RIGHT, - COLUMNS.stranded: False, - COLUMNS.opposing_strands: False, - } + bpps = read_bpp_from_input_file(input_file, expand_strand=False, expand_orient=True) + assert len(bpps) == 1 + assert bpps[0].break1.strand == STRAND.NS + assert bpps[0].break2.strand == STRAND.NS + + def test_break1_orient_ns(self, tmp_path): + input_file = tmp_path / "inputs.tsv" + input_file.write_text( + mock_file_content( + { + COLUMNS.break1_chromosome: '1', + COLUMNS.break1_position_start: 1, + COLUMNS.break1_position_end: 1, + COLUMNS.break1_strand: STRAND.POS, + COLUMNS.break1_orientation: ORIENT.NS, + COLUMNS.break2_chromosome: '1', + COLUMNS.break2_position_start: 10, + COLUMNS.break2_position_end: 10, + COLUMNS.break2_strand: STRAND.POS, + COLUMNS.break2_orientation: ORIENT.RIGHT, + COLUMNS.stranded: False, + COLUMNS.opposing_strands: False, + } + ) ) - bpps = read_bpp_from_input_file(fh, expand_strand=False, expand_orient=True) - self.assertEqual(1, len(bpps)) - self.assertEqual(ORIENT.LEFT, bpps[0].break1.orient) - - def test_break2_orient_ns(self): - fh = self.build_filehandle( - { - 
COLUMNS.break1_chromosome: '1', - COLUMNS.break1_position_start: 1, - COLUMNS.break1_position_end: 1, - COLUMNS.break1_strand: STRAND.POS, - COLUMNS.break1_orientation: ORIENT.NS, - COLUMNS.break2_chromosome: '1', - COLUMNS.break2_position_start: 10, - COLUMNS.break2_position_end: 10, - COLUMNS.break2_strand: STRAND.POS, - COLUMNS.break2_orientation: ORIENT.RIGHT, - COLUMNS.stranded: False, - COLUMNS.opposing_strands: False, - } + bpps = read_bpp_from_input_file(input_file, expand_strand=False, expand_orient=True) + assert len(bpps) == 1 + assert bpps[0].break1.orient == ORIENT.LEFT + + @pytest.mark.skip(reason='TODO') + def test_break2_orient_ns(self, tmp_path): + input_file = tmp_path / "inputs.tsv" + input_file.write_text( + mock_file_content( + { + COLUMNS.break1_chromosome: '1', + COLUMNS.break1_position_start: 1, + COLUMNS.break1_position_end: 1, + COLUMNS.break1_strand: STRAND.POS, + COLUMNS.break1_orientation: ORIENT.NS, + COLUMNS.break2_chromosome: '1', + COLUMNS.break2_position_start: 10, + COLUMNS.break2_position_end: 10, + COLUMNS.break2_strand: STRAND.POS, + COLUMNS.break2_orientation: ORIENT.RIGHT, + COLUMNS.stranded: False, + COLUMNS.opposing_strands: False, + } + ) ) - bpps = read_bpp_from_input_file(fh, expand_strand=False, expand_orient=True) - self.assertEqual(1, len(bpps)) - self.assertEqual(ORIENT.LEFT, bpps[0].break1.orient) - raise unittest.SkipTest('TODO') - - def test_both_break_orient_ns(self): - raise unittest.SkipTest('TODO') - - def test_base_case(self): - fh = self.build_filehandle( - { - COLUMNS.break1_chromosome: '1', - COLUMNS.break1_position_start: 1, - COLUMNS.break1_position_end: 1, - COLUMNS.break1_strand: STRAND.POS, - COLUMNS.break1_orientation: ORIENT.RIGHT, - COLUMNS.break2_chromosome: '1', - COLUMNS.break2_position_start: 10, - COLUMNS.break2_position_end: 10, - COLUMNS.break2_strand: STRAND.NEG, - COLUMNS.break2_orientation: ORIENT.RIGHT, - COLUMNS.stranded: True, - COLUMNS.opposing_strands: True, - } + bpps = 
read_bpp_from_input_file(input_file, expand_strand=False, expand_orient=True) + assert len(bpps) == 1 + assert bpps[0].break1.orient == ORIENT.LEFT + + @pytest.mark.skip(reason='TODO') + def test_both_break_orient_ns(self, tmp_path): + input_file = tmp_path / "inputs.tsv" + + def test_base_case(self, tmp_path): + input_file = tmp_path / "inputs.tsv" + input_file.write_text( + mock_file_content( + { + COLUMNS.break1_chromosome: '1', + COLUMNS.break1_position_start: 1, + COLUMNS.break1_position_end: 1, + COLUMNS.break1_strand: STRAND.POS, + COLUMNS.break1_orientation: ORIENT.RIGHT, + COLUMNS.break2_chromosome: '1', + COLUMNS.break2_position_start: 10, + COLUMNS.break2_position_end: 10, + COLUMNS.break2_strand: STRAND.NEG, + COLUMNS.break2_orientation: ORIENT.RIGHT, + COLUMNS.stranded: True, + COLUMNS.opposing_strands: True, + } + ) ) - bpps = read_bpp_from_input_file(fh, expand_strand=False, expand_orient=False) - self.assertEqual(1, len(bpps)) - self.assertEqual(ORIENT.RIGHT, bpps[0].break1.orient) - self.assertEqual(True, bpps[0].opposing_strands) - - def test_unstranded_with_strand_calls(self): - fh = self.build_filehandle( - { - COLUMNS.break1_chromosome: '1', - COLUMNS.break1_position_start: 1, - COLUMNS.break1_position_end: 1, - COLUMNS.break1_strand: STRAND.POS, - COLUMNS.break1_orientation: ORIENT.RIGHT, - COLUMNS.break2_chromosome: '1', - COLUMNS.break2_position_start: 10, - COLUMNS.break2_position_end: 10, - COLUMNS.break2_strand: STRAND.NEG, - COLUMNS.break2_orientation: ORIENT.RIGHT, - COLUMNS.stranded: False, - COLUMNS.opposing_strands: True, - } + bpps = read_bpp_from_input_file(input_file, expand_strand=False, expand_orient=False) + assert len(bpps) == 1 + assert bpps[0].break1.orient == ORIENT.RIGHT + assert bpps[0].opposing_strands == True + + def test_unstranded_with_strand_calls(self, tmp_path): + input_file = tmp_path / "inputs.tsv" + input_file.write_text( + mock_file_content( + { + COLUMNS.break1_chromosome: '1', + COLUMNS.break1_position_start: 
1, + COLUMNS.break1_position_end: 1, + COLUMNS.break1_strand: STRAND.POS, + COLUMNS.break1_orientation: ORIENT.RIGHT, + COLUMNS.break2_chromosome: '1', + COLUMNS.break2_position_start: 10, + COLUMNS.break2_position_end: 10, + COLUMNS.break2_strand: STRAND.NEG, + COLUMNS.break2_orientation: ORIENT.RIGHT, + COLUMNS.stranded: False, + COLUMNS.opposing_strands: True, + } + ) ) - bpps = read_bpp_from_input_file(fh, expand_strand=False, expand_orient=False) - self.assertEqual(1, len(bpps)) - self.assertEqual(STRAND.NS, bpps[0].break1.strand) - self.assertEqual(STRAND.NS, bpps[0].break2.strand) - - bpps = read_bpp_from_input_file(fh, expand_strand=False, expand_orient=True) - self.assertEqual(1, len(bpps)) - self.assertEqual(STRAND.NS, bpps[0].break1.strand) - self.assertEqual(STRAND.NS, bpps[0].break2.strand) - - fh = self.build_filehandle( - { - COLUMNS.break1_chromosome: '1', - COLUMNS.break1_position_start: 1, - COLUMNS.break1_position_end: 1, - COLUMNS.break1_strand: STRAND.POS, - COLUMNS.break1_orientation: ORIENT.RIGHT, - COLUMNS.break2_chromosome: '1', - COLUMNS.break2_position_start: 10, - COLUMNS.break2_position_end: 10, - COLUMNS.break2_strand: STRAND.NEG, - COLUMNS.break2_orientation: ORIENT.RIGHT, - COLUMNS.stranded: True, - COLUMNS.opposing_strands: True, - } + bpps = read_bpp_from_input_file(input_file, expand_strand=False, expand_orient=False) + assert len(bpps) == 1 + assert bpps[0].break1.strand == STRAND.NS + assert bpps[0].break2.strand == STRAND.NS + + input_file = tmp_path / "inputs2.tsv" + + input_file.write_text( + mock_file_content( + { + COLUMNS.break1_chromosome: '1', + COLUMNS.break1_position_start: 1, + COLUMNS.break1_position_end: 1, + COLUMNS.break1_strand: STRAND.POS, + COLUMNS.break1_orientation: ORIENT.RIGHT, + COLUMNS.break2_chromosome: '1', + COLUMNS.break2_position_start: 10, + COLUMNS.break2_position_end: 10, + COLUMNS.break2_strand: STRAND.NEG, + COLUMNS.break2_orientation: ORIENT.RIGHT, + COLUMNS.stranded: True, + 
COLUMNS.opposing_strands: True, + } + ) ) - bpps = read_bpp_from_input_file(fh, expand_strand=True, expand_orient=False) - self.assertEqual(1, len(bpps)) - self.assertEqual(STRAND.POS, bpps[0].break1.strand) - self.assertEqual(STRAND.NEG, bpps[0].break2.strand) - - bpps = read_bpp_from_input_file(fh, expand_strand=True, expand_orient=True) - self.assertEqual(1, len(bpps)) - self.assertEqual(STRAND.POS, bpps[0].break1.strand) - self.assertEqual(STRAND.NEG, bpps[0].break2.strand) + bpps = read_bpp_from_input_file(input_file, expand_strand=True, expand_orient=False) + assert len(bpps) == 1 + assert bpps[0].break1.strand == STRAND.POS + assert bpps[0].break2.strand == STRAND.NEG From 95d2a335f2230ea6f6c473de335687360c399593 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Fri, 23 Apr 2021 13:38:11 -0700 Subject: [PATCH 022/137] Fix import sort --- src/mavis/align.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/mavis/align.py b/src/mavis/align.py index dffed765..20984ba6 100644 --- a/src/mavis/align.py +++ b/src/mavis/align.py @@ -13,8 +13,16 @@ from .bam import cigar as _cigar from .bam import read as _read from .breakpoint import Breakpoint, BreakpointPair -from .constants import (CIGAR, COLUMNS, NA_MAPPING_QUALITY, ORIENT, STRAND, - SVTYPE, MavisNamespace, reverse_complement) +from .constants import ( + CIGAR, + COLUMNS, + NA_MAPPING_QUALITY, + ORIENT, + STRAND, + SVTYPE, + MavisNamespace, + reverse_complement, +) from .error import InvalidRearrangement from .interval import Interval from .util import DEVNULL From 6a12c3b650f8f1efdc329b4ea7ab61b3cb745230 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Fri, 23 Apr 2021 14:51:47 -0700 Subject: [PATCH 023/137] Fix typo --- docs/migrating.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/migrating.md b/docs/migrating.md index 213ee00c..d56e17c3 100644 --- a/docs/migrating.md +++ b/docs/migrating.md @@ -12,7 +12,7 @@ reference files ### Configuration 
-MAVIS no longer users command line arguments, config files, and environment variables for +MAVIS no longer uses command line arguments, config files, and environment variables for configuration. Instead all configurable settings are controlled via a single input JSON config file From 7c9442971b0278e2585540b9750216a9630c954c Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Fri, 23 Apr 2021 15:09:00 -0700 Subject: [PATCH 024/137] Remove pound from examples in docs as well --- .github/CONTRIBUTING.md | 7 +------ docs/inputs/reference.md | 34 +++++++++++++++++++--------------- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 48817547..d38244d4 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,4 +1,3 @@ - ## Getting Started If you are new to the project a good way to get started is by adding to the documentation, or adding unit tests where @@ -47,7 +46,6 @@ mkdocs build The contents of the user manual can then be viewed by opening the build-docs/index.html in any available web browser (i.e. google-chrome, firefox, etc.) - ## Deploy to PyPi Install deployment dependencies @@ -68,13 +66,11 @@ Use twine to upload twine upload -r pypi dist/* ``` - ## Reporting a Bug Please make sure to search through the issues before reporting a bug to ensure there isn't already an open issue. - ## Conventions ### Linting @@ -82,7 +78,7 @@ already an open issue. 
Use [black](https://github.com/psf/black) with strings off and line length 100 ```bash -black mavis -S -l 100 +black src/mavis -S -l 100 ``` ### Docstrings @@ -112,7 +108,6 @@ def some_function(some_arg: List[str]) -> None: any column name which may appear in any of the intermediate or final output files must be defined in `mavis.constants.COLUMNS` as well as added to the [columns glossary](../outputs/columns) - ### Tests - all new code must have unit tests in the tests subdirectory diff --git a/docs/inputs/reference.md b/docs/inputs/reference.md index cc31d56e..5eff1cbb 100644 --- a/docs/inputs/reference.md +++ b/docs/inputs/reference.md @@ -21,7 +21,6 @@ not available, | [DGV annotations](../../inputs/reference/#dgv-database-of-genomic-variants) (text/tabbed) | `MAVIS_DGV_ANNOTATION` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://www.bcgsc.ca/downloads/mavis/dgv_hg19_variants.tab)
[![](../images/get_app-24px.svg) GRCh38](http://www.bcgsc.ca/downloads/mavis/dgv_hg38_variants.tab) | | [aligner reference](../../inputs/reference/#aligner-reference) | `MAVIS_ALIGNER_REFERENCE` | [![](../images/get_app-24px.svg) GRCh37/Hg19 2bit (blat)](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit)
[![](../images/get_app-24px.svg) GRCh38 2bit (blat)](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit) | - If the environment variables above are set they will be used as the default values when any step of the pipeline script is called (including generating the template config file) @@ -38,11 +37,13 @@ chromosomes. This is only used during visualization. The structure of the file should look something like this - chr1 0 2300000 p36.33 gneg - chr1 2300000 5400000 p36.32 gpos25 - chr1 5400000 7200000 p36.31 gneg - chr1 7200000 9200000 p36.23 gpos25 - chr1 9200000 12700000 p36.22 gneg +```text +chr1 0 2300000 p36.33 gneg +chr1 2300000 5400000 p36.32 gpos25 +chr1 5400000 7200000 p36.31 gneg +chr1 7200000 9200000 p36.23 gpos25 +chr1 9200000 12700000 p36.22 gneg +``` ## Masking File @@ -52,9 +53,11 @@ known false positives, bad mapping, centromeres, telomeres etc. An example of the expected format is shown below. The file should have four columns: chr, start, end and name. - #chr start end name - chr1 0 2300000 centromere - chr1 9200000 12700000 telomere +```text +chr start end name +chr1 0 2300000 centromere +chr1 9200000 12700000 telomere +``` The pre-built masking files in the downloads table above are telomere regions, centromere regions (based on the cytoband file), and nspan @@ -81,7 +84,6 @@ the ensembl annotations file including non-coding transcripts below. [![](../images/get_app-24px.svg) GRCh37/Hg19 + Ensembl69 (includes non-coding genes)](http://www.bcgsc.ca/downloads/mavis/ensembl69_hg19_annotations_with_ncrna.json) - !!! warning the `mavis.annotate.file_io.load_reference_genes`{.interpreted-text role="func"} will only load valid translations. 
If the cds sequence in @@ -98,7 +100,7 @@ be seen below { "name": string, "start": int, - "end": int + "end": int, "aliases": [string, string, ...], "transcripts": [ { @@ -180,10 +182,12 @@ awk '{print $2"\t"$3"\t"$4"\t"$1} GRCh37_hg19_variants_2016-05-15.txt > dgv_hg19 Note in hg19 the column is called "name" and in hg38 the column is called "variantaccession". An example is shown below - #chr start end name - 1 1 2300000 nsv482937 - 1 10001 22118 dgv1n82 - 1 10001 127330 nsv7879 +```text +chr start end name +1 1 2300000 nsv482937 +1 10001 22118 dgv1n82 +1 10001 127330 nsv7879 +``` ## Aligner Reference From 01a2eda28c66567cb85456d95a6ff1d3fcbd0488 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 26 Apr 2021 15:27:36 -0700 Subject: [PATCH 025/137] Convert to pytest syntax --- tests/end_to_end/test_convert.py | 61 +- tests/end_to_end/test_help.py | 50 +- tests/end_to_end/test_overlay.py | 1 - tests/end_to_end/test_ref_alt_count.py | 92 +- tests/integration/test_align.py | 386 +++-- tests/integration/test_annotate.py | 1599 +++++++++++-------- tests/integration/test_annotate_examples.py | 160 +- tests/integration/test_annotate_fileio.py | 32 +- tests/integration/test_args.py | 2 - tests/integration/test_assemble.py | 138 +- tests/integration/test_bam.py | 511 +++--- tests/integration/test_bam_cigar.py | 237 ++- tests/integration/test_blat.py | 151 +- tests/integration/test_breakpoint.py | 118 +- tests/integration/test_cluster.py | 68 +- tests/integration/test_illustrate.py | 192 ++- tests/integration/test_pairing.py | 403 ++--- tests/integration/test_splicing.py | 609 ++++--- tests/integration/test_validate.py | 328 ++-- tests/integration/test_validate_call.py | 699 ++++---- tests/integration/test_validate_evidence.py | 805 +++++----- tests/unit/test_annotate.py | 268 ++-- tests/unit/test_assemble.py | 100 +- tests/unit/test_bam.py | 126 +- tests/unit/test_blat.py | 15 +- tests/unit/test_breakpoint.py | 169 +- tests/unit/test_call_indels.py | 164 +- 
tests/unit/test_cluster.py | 13 +- tests/unit/test_constants.py | 63 +- tests/unit/test_illustrate.py | 7 +- tests/unit/test_interval.py | 278 ++-- tests/unit/test_summary.py | 268 ++-- tests/unit/test_tool.py | 86 +- tests/unit/test_util.py | 28 +- tests/unit/test_validate.py | 33 +- tests/util.py | 13 + 36 files changed, 4236 insertions(+), 4037 deletions(-) diff --git a/tests/end_to_end/test_convert.py b/tests/end_to_end/test_convert.py index a1d33be2..514fae52 100644 --- a/tests/end_to_end/test_convert.py +++ b/tests/end_to_end/test_convert.py @@ -1,4 +1,3 @@ -import glob import os import shutil import sys @@ -23,7 +22,7 @@ def setUpModule(): print('output dir', TEMP_OUTPUT) -class TestConvert(unittest.TestCase): +class TestConvert: def run_main(self, inputfile, file_type, strand_specific=False): outputfile = os.path.join(TEMP_OUTPUT, file_type + '.tab') args = [ @@ -41,7 +40,7 @@ def run_main(self, inputfile, file_type, strand_specific=False): with patch.object(sys, 'argv', args): main() print('output', outputfile) - self.assertTrue(unique_exists(outputfile)) + assert unique_exists(outputfile) result = {} for pair in read_bpp_from_input_file(outputfile): result.setdefault(pair.data['tracking_id'], []).append(pair) @@ -56,44 +55,44 @@ def test_defuse(self): def test_delly(self): result = self.run_main(get_data('delly_events.vcf'), SUPPORTED_TOOL.DELLY, False) # test the contents were converted successfully - self.assertEqual(1, len(result['delly-DUP00000424'])) + assert len(result['delly-DUP00000424']) == 1 bpp = result['delly-DUP00000424'][0] print(bpp.data) print(bpp) - self.assertEqual(SVTYPE.DUP, bpp.event_type) - self.assertEqual('1', bpp.break1.chr) - self.assertEqual('1', bpp.break2.chr) - self.assertEqual(224646569, bpp.break1.start) - self.assertEqual(224646569, bpp.break1.end) - self.assertEqual(224800120, bpp.break2.start) - self.assertEqual(224800120, bpp.break2.end) - self.assertEqual(1, len(result['delly-TRA00020624'])) + assert bpp.event_type == 
SVTYPE.DUP + assert bpp.break1.chr == '1' + assert bpp.break2.chr == '1' + assert bpp.break1.start == 224646569 + assert bpp.break1.end == 224646569 + assert bpp.break2.start == 224800120 + assert bpp.break2.end == 224800120 + assert len(result['delly-TRA00020624']) == 1 bpp = result['delly-TRA00020624'][0] - self.assertEqual(SVTYPE.TRANS, bpp.event_type) - self.assertEqual('10', bpp.break1.chr) - self.assertEqual('19', bpp.break2.chr) - self.assertEqual(7059510 - 670, bpp.break1.start) - self.assertEqual(7059510 + 670, bpp.break1.end) - self.assertEqual(17396810 - 670, bpp.break2.start) - self.assertEqual(17396810 + 670, bpp.break2.end) - self.assertEqual(len(result), 31) + assert bpp.event_type == SVTYPE.TRANS + assert bpp.break1.chr == '10' + assert bpp.break2.chr == '19' + assert bpp.break1.start == 7059510 - 670 + assert bpp.break1.end == 7059510 + 670 + assert bpp.break2.start == 17396810 - 670 + assert bpp.break2.end == 17396810 + 670 + assert 31 == len(result) def test_manta(self): result = self.run_main(get_data('manta_events.vcf'), SUPPORTED_TOOL.MANTA, False) # ensure weird bnd type is converted correctly bnd_id = 'manta-MantaBND:173633:0:1:0:0:0:0' - self.assertEqual(1, len(result[bnd_id])) + assert len(result[bnd_id]) == 1 bpp = result[bnd_id][0] - self.assertEqual(SVTYPE.TRANS, bpp.event_type) - self.assertEqual('10', bpp.break1.chr) - self.assertEqual('19', bpp.break2.chr) - self.assertEqual(7059511 - 0, bpp.break1.start) - self.assertEqual(7059511 + 1, bpp.break1.end) - self.assertEqual(17396810, bpp.break2.start) - self.assertEqual(17396810, bpp.break2.end) - self.assertEqual(ORIENT.LEFT, bpp.break2.orient) + assert bpp.event_type == SVTYPE.TRANS + assert bpp.break1.chr == '10' + assert bpp.break2.chr == '19' + assert bpp.break1.start == 7059511 - 0 + assert bpp.break1.end == 7059511 + 1 + assert bpp.break2.start == 17396810 + assert bpp.break2.end == 17396810 + assert bpp.break2.orient == ORIENT.LEFT somatic_event = 
result['manta-MantaDEL:20644:0:2:0:0:0'][0] - self.assertEqual(True, somatic_event.data.get('SOMATIC', False)) + assert somatic_event.data.get('SOMATIC', False) is True def test_pindel(self): self.run_main(get_data('pindel_events.vcf'), SUPPORTED_TOOL.PINDEL, False) @@ -107,7 +106,7 @@ def test_vcf(self): print(results.keys()) record = results['vcf-460818'][0] print(record, record.data) - self.assertEqual('Pathogenic', record.data['CLNSIG']) + assert record.data['CLNSIG'] == 'Pathogenic' def test_breakseq2(self): self.run_main(get_data('breakseq.vcf'), SUPPORTED_TOOL.BREAKSEQ, False) diff --git a/tests/end_to_end/test_help.py b/tests/end_to_end/test_help.py index 4ff3172a..6d3cdd24 100644 --- a/tests/end_to_end/test_help.py +++ b/tests/end_to_end/test_help.py @@ -1,110 +1,106 @@ -import os -import subprocess import sys -import unittest from unittest.mock import patch - from mavis.constants import SUBCOMMAND from mavis.main import main -class TestHelpMenu(unittest.TestCase): +class TestHelpMenu: def test_main(self): with patch.object(sys, 'argv', ['mavis', '-h']): try: returncode = main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - self.assertEqual(0, returncode) + assert returncode == 0 def test_pipeline(self): with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.SETUP, '-h']): try: returncode = main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - self.assertEqual(0, returncode) + assert returncode == 0 def test_cluster(self): with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.CLUSTER, '-h']): try: returncode = main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - self.assertEqual(0, returncode) + assert returncode == 0 def test_validate(self): with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.VALIDATE, '-h']): try: returncode = main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - 
self.assertEqual(0, returncode) + assert returncode == 0 def test_annotate(self): with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.ANNOTATE, '-h']): try: returncode = main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - self.assertEqual(0, returncode) + assert returncode == 0 def test_pairing(self): with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.PAIR, '-h']): try: returncode = main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - self.assertEqual(0, returncode) + assert returncode == 0 def test_summary(self): with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.SUMMARY, '-h']): try: returncode = main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - self.assertEqual(0, returncode) + assert returncode == 0 def test_convert(self): with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.CONVERT, '-h']): try: returncode = main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - self.assertEqual(0, returncode) + assert returncode == 0 def test_overlay(self): with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.OVERLAY, '-h']): try: returncode = main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - self.assertEqual(0, returncode) + assert returncode == 0 def test_bad_option(self): with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.SETUP, '--blargh']): try: returncode = main() except SystemExit as err: - self.assertNotEqual(0, err.code) + assert err.code != 0 else: - self.assertNotEqual(0, returncode) + assert returncode != 0 def test_ref_alt_count(self): with patch.object(sys, 'argv', ['calculate_ref_alt_counts', '-h']): try: returncode = main() except SystemExit as err: - self.assertEqual(0, err.code) + assert err.code == 0 else: - self.assertEqual(0, returncode) + assert returncode == 0 diff --git a/tests/end_to_end/test_overlay.py 
b/tests/end_to_end/test_overlay.py index db664c55..5950701d 100644 --- a/tests/end_to_end/test_overlay.py +++ b/tests/end_to_end/test_overlay.py @@ -1,7 +1,6 @@ import json import os import shutil -import subprocess import sys import tempfile from unittest.mock import patch diff --git a/tests/end_to_end/test_ref_alt_count.py b/tests/end_to_end/test_ref_alt_count.py index 4c30fb81..c1afb816 100644 --- a/tests/end_to_end/test_ref_alt_count.py +++ b/tests/end_to_end/test_ref_alt_count.py @@ -1,8 +1,8 @@ import os import shutil import tempfile -import unittest +import pytest from mavis.annotate.file_io import load_reference_genome from mavis.breakpoint import Breakpoint, BreakpointPair from mavis.constants import ORIENT, SVTYPE @@ -31,68 +31,60 @@ def print_file_tree(dirname): print('{}{}'.format(subindent, f)) -class TestFullCalculator(unittest.TestCase): - def setUp(self): - # create the temp output directory to store file outputs - self.temp_output = tempfile.mkdtemp() - print('output dir', self.temp_output) +@pytest.fixture +def calculator(): + return RefAltCalculator( + [("TEST", get_data('mock_reads_for_events.sorted.bam'))], + REFERENCE_GENOME, + max_event_size=100, + buffer=20, + ) - self.calculator = RefAltCalculator( - [("TEST", get_data('mock_reads_for_events.sorted.bam'))], - REFERENCE_GENOME, - max_event_size=100, - buffer=20, - ) - - def test_calculate_all_counts(self): - self.calculator.calculate_all_counts( - [get_data("mavis_summary_all_mock-A36971_mock-A47933.tab")], - os.path.join(self.temp_output, "ref_alt_output.tab"), - ) - self.assertTrue(glob_exists(self.temp_output, "ref_alt_output.tab")) - def tearDown(self): - # remove the temp directory and outputs - print_file_tree(self.temp_output) - shutil.rmtree(self.temp_output) +@pytest.fixture +def temp_output(): + d = tempfile.mkdtemp() + yield d + shutil.rmtree(d) -class TestRefAltCalulator(unittest.TestCase): - def setUp(self): - self.calculator = RefAltCalculator( - [("TEST", 
get_data('mock_reads_for_events.sorted.bam'))], - REFERENCE_GENOME, - max_event_size=100, - buffer=20, +class TestFullCalculator: + def test_calculate_all_counts(self, calculator, temp_output): + calculator.calculate_all_counts( + [get_data("mavis_summary_all_mock-A36971_mock-A47933.tab")], + os.path.join(temp_output, "ref_alt_output.tab"), ) + assert glob_exists(temp_output, "ref_alt_output.tab") + - def test_calculate_count(self): +class TestRefAltCalulator: + def test_calculate_count(self, calculator): ev1 = BreakpointPair( Breakpoint('reference11', 5999, orient=ORIENT.LEFT), Breakpoint('reference11', 6003, orient=ORIENT.RIGHT), opposing_strands=False, event_type=SVTYPE.DEL, ) - bpp = self.calculator.calculate_ref_counts(ev1) + bpp = calculator.calculate_ref_counts(ev1) print(bpp.data) - self.assertEqual(27, bpp.data["TEST_ref_count"]) - self.assertEqual(14, bpp.data["TEST_alt_count"]) - self.assertEqual(188, bpp.data['TEST_ignored_count']) + assert bpp.data["TEST_ref_count"] == 27 + assert bpp.data["TEST_alt_count"] == 14 + assert bpp.data['TEST_ignored_count'] == 188 - def test_calculate_count2(self): + def test_calculate_count2(self, calculator): ev1 = BreakpointPair( Breakpoint('reference11', 9999, orient=ORIENT.LEFT), Breakpoint('reference11', 10030, orient=ORIENT.RIGHT), opposing_strands=False, event_type=SVTYPE.DEL, ) - bpp = self.calculator.calculate_ref_counts(ev1) + bpp = calculator.calculate_ref_counts(ev1) print(bpp.data) - self.assertEqual(0, bpp.data["TEST_ref_count"]) - self.assertEqual(63, bpp.data["TEST_alt_count"]) - self.assertEqual(197, bpp.data['TEST_ignored_count']) + assert bpp.data["TEST_ref_count"] == 0 + assert bpp.data["TEST_alt_count"] == 63 + assert bpp.data['TEST_ignored_count'] == 197 - def test_calculate_count3(self): + def test_calculate_count3(self, calculator): ev1 = BreakpointPair( Breakpoint('reference1', 2002, orient=ORIENT.LEFT), Breakpoint('reference1', 2003, orient=ORIENT.RIGHT), @@ -100,21 +92,21 @@ def 
test_calculate_count3(self): event_type=SVTYPE.INS, untemplated_seq='TT', ) - bpp = self.calculator.calculate_ref_counts(ev1) + bpp = calculator.calculate_ref_counts(ev1) print(bpp.data) - self.assertEqual(0, bpp.data["TEST_ref_count"]) - self.assertEqual(23, bpp.data["TEST_alt_count"]) - self.assertEqual(145, bpp.data['TEST_ignored_count']) + assert bpp.data["TEST_ref_count"] == 0 + assert bpp.data["TEST_alt_count"] == 23 + assert bpp.data['TEST_ignored_count'] == 145 - def test_calculate_count4(self): + def test_calculate_count4(self, calculator): ev1 = BreakpointPair( Breakpoint('reference11', 1999, orient=ORIENT.LEFT), Breakpoint('reference11', 2001, orient=ORIENT.RIGHT), opposing_strands=False, event_type=SVTYPE.DEL, ) - bpp = self.calculator.calculate_ref_counts(ev1) + bpp = calculator.calculate_ref_counts(ev1) print(bpp.data) - self.assertEqual(0, bpp.data["TEST_ref_count"]) - self.assertEqual(50, bpp.data["TEST_alt_count"]) - self.assertEqual(191, bpp.data['TEST_ignored_count']) + assert bpp.data["TEST_ref_count"] == 0 + assert bpp.data["TEST_alt_count"] == 50 + assert bpp.data['TEST_ignored_count'] == 191 diff --git a/tests/integration/test_align.py b/tests/integration/test_align.py index 45c9cb1a..4effd774 100644 --- a/tests/integration/test_align.py +++ b/tests/integration/test_align.py @@ -1,21 +1,21 @@ import shutil -import unittest from unittest import mock import mavis.bam.cigar as _cigar +import pytest from mavis import align from mavis.annotate.file_io import load_reference_genome from mavis.assemble import Contig from mavis.bam.cache import BamCache from mavis.bam.read import SamRead from mavis.breakpoint import Breakpoint, BreakpointPair -from mavis.constants import CIGAR, ORIENT, STRAND, SVTYPE, reverse_complement +from mavis.constants import CIGAR, ORIENT, STRAND, reverse_complement from mavis.interval import Interval from mavis.schemas import DEFAULTS from mavis.validate.evidence import GenomeEvidence from ..util import get_data -from . 
import MockBamFileHandle, MockLongString, MockObject, MockRead +from . import MockLongString, MockObject, MockRead REFERENCE_GENOME = None @@ -32,7 +32,7 @@ def setUpModule(): BAM_CACHE = BamCache(get_data('mini_mock_reads_for_events.sorted.bam')) -class TestCallReadEvents(unittest.TestCase): +class TestCallReadEvents: def test_hardclipping(self): read = SamRead(reference_name='15') read.reference_start = 71491944 @@ -46,16 +46,13 @@ def test_hardclipping(self): untemplated_seq='', ) events = align.call_read_events(read, is_stranded=True) - self.assertEqual(1, len(events)) - self.assertEqual(expected_bpp.break1, events[0].break1) - self.assertEqual(expected_bpp.break2, events[0].break2) + assert len(events) == 1 + assert events[0].break1 == expected_bpp.break1 + assert events[0].break2 == expected_bpp.break2 -class TestAlign(unittest.TestCase): - def setUp(self): - self.cache = BamCache(MockBamFileHandle({'Y': 23, 'fake': 0, 'reference3': 3})) - - @unittest.skipIf(not shutil.which('blat'), 'missing the blat command') +class TestAlign: + @pytest.mark.skipif(not shutil.which('blat'), reason='missing the blat command') def test_blat_contigs(self): ev = GenomeEvidence( Breakpoint('reference3', 1114, orient=ORIENT.RIGHT), @@ -92,16 +89,16 @@ def test_blat_contigs(self): align.select_contig_alignments(ev, seq) print(ev.contigs[0].alignments) alignment = list(ev.contigs[0].alignments)[0] - self.assertEqual(1, alignment.read1.reference_id) - self.assertEqual(1, alignment.read2.reference_id) - self.assertEqual(Interval(125, 244), align.query_coverage_interval(alignment.read1)) - self.assertEqual(Interval(117, 244), align.query_coverage_interval(alignment.read2)) - self.assertEqual(1114, alignment.read1.reference_start) - self.assertEqual(2187, alignment.read2.reference_start) - self.assertEqual([(CIGAR.S, 125), (CIGAR.EQ, 120)], alignment.read1.cigar) - self.assertEqual([(CIGAR.S, 117), (CIGAR.EQ, 128)], alignment.read2.cigar) - - @unittest.skipIf(not shutil.which('bwa'), 
'missing the command') + assert alignment.read1.reference_id == 1 + assert alignment.read2.reference_id == 1 + assert align.query_coverage_interval(alignment.read1) == Interval(125, 244) + assert align.query_coverage_interval(alignment.read2) == Interval(117, 244) + assert alignment.read1.reference_start == 1114 + assert alignment.read2.reference_start == 2187 + assert alignment.read1.cigar == [(CIGAR.S, 125), (CIGAR.EQ, 120)] + assert alignment.read2.cigar == [(CIGAR.S, 117), (CIGAR.EQ, 128)] + + @pytest.mark.skipif(not shutil.which('bwa'), reason='missing the command') def test_bwa_contigs(self): ev = GenomeEvidence( Breakpoint('reference3', 1114, orient=ORIENT.RIGHT), @@ -139,21 +136,19 @@ def test_bwa_contigs(self): align.select_contig_alignments(ev, seq) print(ev.contigs[0].alignments) alignment = list(ev.contigs[0].alignments)[0] - self.assertEqual( - reverse_complement(alignment.read1.query_sequence), alignment.read2.query_sequence - ) - self.assertEqual('reference3', alignment.read1.reference_name) - self.assertEqual('reference3', alignment.read2.reference_name) - self.assertEqual(1, alignment.read1.reference_id) - self.assertEqual(1, alignment.read2.reference_id) - self.assertEqual(Interval(125, 244), align.query_coverage_interval(alignment.read1)) - self.assertEqual(Interval(117, 244), align.query_coverage_interval(alignment.read2)) - self.assertEqual(1114, alignment.read1.reference_start) - self.assertEqual(2187, alignment.read2.reference_start) - self.assertEqual([(CIGAR.S, 125), (CIGAR.EQ, 120)], alignment.read1.cigar) - self.assertEqual([(CIGAR.S, 117), (CIGAR.EQ, 128)], alignment.read2.cigar) - - @unittest.skipIf(not shutil.which('blat'), 'missing the blat command') + assert alignment.read2.query_sequence == reverse_complement(alignment.read1.query_sequence) + assert alignment.read1.reference_name == 'reference3' + assert alignment.read2.reference_name == 'reference3' + assert alignment.read1.reference_id == 1 + assert alignment.read2.reference_id == 
1 + assert align.query_coverage_interval(alignment.read1) == Interval(125, 244) + assert align.query_coverage_interval(alignment.read2) == Interval(117, 244) + assert alignment.read1.reference_start == 1114 + assert alignment.read2.reference_start == 2187 + assert alignment.read1.cigar == [(CIGAR.S, 125), (CIGAR.EQ, 120)] + assert alignment.read2.cigar == [(CIGAR.S, 117), (CIGAR.EQ, 128)] + + @pytest.mark.skipif(not shutil.which('blat'), reason='missing the blat command') def test_blat_contigs_deletion(self): ev = GenomeEvidence( Breakpoint('fake', 1714, orient=ORIENT.LEFT), @@ -188,20 +183,16 @@ def test_blat_contigs_deletion(self): print('alignments:') for aln in alignments: print(aln, repr(aln.read1), repr(aln.read2)) - self.assertEqual(1, len(alignments)) + assert len(alignments) == 1 alignment = alignments[0] - self.assertTrue(alignment.read2 is None) - self.assertEqual(0, alignment.read1.reference_id) - self.assertTrue(not alignment.read1.is_reverse) - self.assertEqual(Interval(0, 175), align.query_coverage_interval(alignment.read1)) - self.assertEqual(1612, alignment.read1.reference_start) - self.assertEqual([(CIGAR.EQ, 102), (CIGAR.D, 1253), (CIGAR.EQ, 74)], alignment.read1.cigar) - - @unittest.skipIf(not shutil.which('blat'), 'missing the blat command') - def test_blat_contigs_inversion(self): - raise unittest.SkipTest('TODO') - - @unittest.skipIf(not shutil.which('blat'), 'missing the blat command') + assert alignment.read2 is None + assert alignment.read1.reference_id == 0 + assert not alignment.read1.is_reverse + assert align.query_coverage_interval(alignment.read1) == Interval(0, 175) + assert alignment.read1.reference_start == 1612 + assert alignment.read1.cigar == [(CIGAR.EQ, 102), (CIGAR.D, 1253), (CIGAR.EQ, 74)] + + @pytest.mark.skipif(not shutil.which('blat'), reason='missing the blat command') def test_blat_contigs_deletion_revcomp(self): ev = GenomeEvidence( Breakpoint('fake', 1714, orient=ORIENT.LEFT), @@ -231,33 +222,32 @@ def 
test_blat_contigs_deletion_revcomp(self): print('alignments:', ev.contigs[0].alignments) alignment = list(ev.contigs[0].alignments)[0] print(alignment) - self.assertTrue(alignment.read2 is None) - self.assertEqual(0, alignment.read1.reference_id) - self.assertTrue(alignment.read1.is_reverse) - self.assertEqual(seq, alignment.read1.query_sequence) - self.assertEqual(Interval(0, 175), align.query_coverage_interval(alignment.read1)) - self.assertEqual(1612, alignment.read1.reference_start) - self.assertEqual([(CIGAR.EQ, 102), (CIGAR.D, 1253), (CIGAR.EQ, 74)], alignment.read1.cigar) - - -class TestBreakpointContigRemappedDepth(unittest.TestCase): - def setUp(self): - self.contig = Contig(' ' * 60, None) - self.contig.add_mapped_sequence(MockObject(reference_start=0, reference_end=10)) - self.contig.add_mapped_sequence(MockObject(reference_start=0, reference_end=20)) - self.contig.add_mapped_sequence(MockObject(reference_start=50, reference_end=60)) + assert alignment.read2 is None + assert alignment.read1.reference_id == 0 + assert alignment.read1.is_reverse + assert alignment.read1.query_sequence == seq + assert align.query_coverage_interval(alignment.read1) == Interval(0, 175) + assert alignment.read1.reference_start == 1612 + assert alignment.read1.cigar == [(CIGAR.EQ, 102), (CIGAR.D, 1253), (CIGAR.EQ, 74)] + +class TestBreakpointContigRemappedDepth: def test_break_left_deletion(self): + contig = Contig(' ' * 60, None) + contig.add_mapped_sequence(MockObject(reference_start=0, reference_end=10)) + contig.add_mapped_sequence(MockObject(reference_start=0, reference_end=20)) + contig.add_mapped_sequence(MockObject(reference_start=50, reference_end=60)) + b = Breakpoint('10', 1030, 1030, orient=ORIENT.LEFT) read = MockRead( cigar=_cigar.convert_string_to_cigar('35M10D5I20M'), reference_start=999, reference_name='10', ) - align.SplitAlignment.breakpoint_contig_remapped_depth(b, self.contig, read) + align.SplitAlignment.breakpoint_contig_remapped_depth(b, contig, read) 
-class TestSplitEvents(unittest.TestCase): +class TestSplitEvents: def test_read_with_exons(self): contig = MockRead( query_sequence='CTTGAAGGAAACTGAATTCAAAAAGATCAAAGTGCTGGGCTCCGGTGCGTTCGGCACGGTGTATAAGGGACTCTGGATCCCAGAAGGTGAGAAAGTTAAAATTCCCGTCGCTATCAAGACATCTCCGAAAGCCAACAAGGAAATCCTCGATGAAGCCTACGTGATGGCCAGCGTGGACAACCCCCACGTGTGCCGCCTGCTGGGCATCTGCCTCACCTCCACCGTGCAGCTCATCATGCAGCTCATGCCCTTCGGCTGCCTCCTGGACTATGTCCGGGAACACAAAGACAATATTGGCTCCCAGTACCTGCTCAACTGGTGTGTGCAGATCGCAAAGGGCATGAACTACTTGGAGGACCGTCGCTTGGTGCACCGCGACCTGGCAGCCAGGAACGTACTGGTGAAAACACCGCAGCATGTCAAGATCACAGATTTTGGGCTGGCCAAACTGCTGGGTGCGGAAGAGAAAGAATACCATGCAGAAGGAGGCAAAGTGCCTATCAAGTGGATGGCATTGGAATCAATTTTACACAGAATCTATACCCACCAGAGTGATGTCTGGAGCTACGGGGTGACCGTTTGGGAGTTGATGACCTTTGGATCCAA', @@ -268,10 +258,10 @@ def test_read_with_exons(self): reference_id=6, reference_start=55241669, ) - self.assertEqual(6, len(align.call_read_events(contig))) + assert len(align.call_read_events(contig)) == 6 -class TestCallBreakpointPair(unittest.TestCase): +class TestCallBreakpointPair: def test_single_one_event(self): r = MockRead( reference_id=0, @@ -281,14 +271,14 @@ def test_single_one_event(self): query_sequence='ACTGAATCGTGGGTAGCTGCTAG', ) bpps = align.call_read_events(r) - self.assertEqual(1, len(bpps)) + assert len(bpps) == 1 bpp = bpps[0] - self.assertEqual(False, bpp.opposing_strands) - self.assertEqual(10, bpp.break1.start) - self.assertEqual(10, bpp.break1.end) - self.assertEqual(18, bpp.break2.start) - self.assertEqual(18, bpp.break2.end) - self.assertEqual('GGG', bpp.untemplated_seq) + assert bpp.opposing_strands is False + assert bpp.break1.start == 10 + assert bpp.break1.end == 10 + assert bpp.break2.start == 18 + assert bpp.break2.end == 18 + assert bpp.untemplated_seq == 'GGG' def test_ins_and_del(self): r = MockRead( @@ -300,20 +290,20 @@ def test_ins_and_del(self): ) # only report the major del event for now bpps = align.call_read_events(r) - self.assertEqual(2, len(bpps)) + assert len(bpps) == 2 bpp = bpps[0] - 
self.assertEqual(False, bpp.opposing_strands) - self.assertEqual(10, bpp.break1.start) - self.assertEqual(10, bpp.break1.end) - self.assertEqual(11, bpp.break2.start) - self.assertEqual(11, bpp.break2.end) - self.assertEqual('GGG', bpp.untemplated_seq) + assert bpp.opposing_strands is False + assert bpp.break1.start == 10 + assert bpp.break1.end == 10 + assert bpp.break2.start == 11 + assert bpp.break2.end == 11 + assert bpp.untemplated_seq == 'GGG' bpp = bpps[1] - self.assertEqual(False, bpp.opposing_strands) - self.assertEqual(15, bpp.break1.start) - self.assertEqual(15, bpp.break1.end) - self.assertEqual(23, bpp.break2.start) - self.assertEqual(23, bpp.break2.end) + assert bpp.opposing_strands is False + assert bpp.break1.start == 15 + assert bpp.break1.end == 15 + assert bpp.break2.start == 23 + assert bpp.break2.end == 23 def test_single_insertion(self): r = MockRead( @@ -324,12 +314,12 @@ def test_single_insertion(self): query_sequence='ACTGAATCGTGGGTAGCTGCTAG', ) bpp = align.call_read_events(r)[0] - self.assertEqual(False, bpp.opposing_strands) - self.assertEqual(10, bpp.break1.start) - self.assertEqual(10, bpp.break1.end) - self.assertEqual(11, bpp.break2.start) - self.assertEqual(11, bpp.break2.end) - self.assertEqual('GGGTAGCT', bpp.untemplated_seq) + assert bpp.opposing_strands is False + assert bpp.break1.start == 10 + assert bpp.break1.end == 10 + assert bpp.break2.start == 11 + assert bpp.break2.end == 11 + assert bpp.untemplated_seq == 'GGGTAGCT' def test_single_duplication(self): r = MockRead( @@ -341,9 +331,9 @@ def test_single_duplication(self): 'GACAGACTCTAGTAGTGTC', ) bpp = align.call_read_events(r)[0] - self.assertEqual(27220, bpp.break1.start) - self.assertEqual(27316, bpp.break2.start) - self.assertEqual('AGACTT', bpp.untemplated_seq) + assert bpp.break1.start == 27220 + assert bpp.break2.start == 27316 + assert bpp.untemplated_seq == 'AGACTT' def test_single_duplication_with_leading_untemp(self): r = MockRead( @@ -360,11 +350,9 @@ def 
test_single_duplication_with_leading_untemp(self): is_reverse=False, ) bpp = align.call_read_events(r)[0] - self.assertEqual( - 'AGGTTCCATGGGCTCCGTAGGTTCCATGGGCTCCGTAGGTTCCATCGGCTCCGT', bpp.untemplated_seq - ) - self.assertEqual(ORIENT.LEFT, bpp.break1.orient) - self.assertEqual(ORIENT.RIGHT, bpp.break2.orient) + assert bpp.untemplated_seq == 'AGGTTCCATGGGCTCCGTAGGTTCCATGGGCTCCGTAGGTTCCATCGGCTCCGT' + assert bpp.break1.orient == ORIENT.LEFT + assert bpp.break2.orient == ORIENT.RIGHT def test_single_duplication_with_no_untemp(self): r = MockRead( @@ -381,11 +369,11 @@ def test_single_duplication_with_no_untemp(self): ) # repeat: GATTTTGCTGTTGTTTTTGTTC bpp = align.convert_to_duplication(align.call_read_events(r)[0], REFERENCE_GENOME) - self.assertEqual('', bpp.untemplated_seq) - self.assertEqual(ORIENT.RIGHT, bpp.break1.orient) - self.assertEqual(ORIENT.LEFT, bpp.break2.orient) - self.assertEqual(bpp.break2.start, 1548) - self.assertEqual(bpp.break1.start, 1527) + assert bpp.untemplated_seq == '' + assert bpp.break1.orient == ORIENT.RIGHT + assert bpp.break2.orient == ORIENT.LEFT + assert 1548 == bpp.break2.start + assert 1527 == bpp.break1.start def test_single_duplication_with_trailing_untemp(self): r = MockRead( @@ -411,11 +399,11 @@ def test_single_duplication_with_trailing_untemp(self): print(bpp) bpp = align.convert_to_duplication(bpp, REFERENCE_GENOME) print(bpp) - self.assertEqual('GTCAA', bpp.untemplated_seq) - self.assertEqual(ORIENT.RIGHT, bpp.break1.orient) - self.assertEqual(ORIENT.LEFT, bpp.break2.orient) - self.assertEqual(bpp.break2.start, 1548) - self.assertEqual(bpp.break1.start, 1527) + assert bpp.untemplated_seq == 'GTCAA' + assert bpp.break1.orient == ORIENT.RIGHT + assert bpp.break2.orient == ORIENT.LEFT + assert 1548 == bpp.break2.start + assert 1527 == bpp.break1.start def test_read_pair_indel(self): # seq AAATTTCCCGGGAATTCCGGATCGATCGAT 1-30 1-? 
@@ -441,15 +429,15 @@ def test_read_pair_indel(self): is_reverse=False, ) bpp = align.call_paired_read_event(r1, r2, is_stranded=True) - self.assertEqual(STRAND.POS, bpp.break1.strand) - self.assertEqual(STRAND.POS, bpp.break2.strand) - self.assertEqual(ORIENT.LEFT, bpp.break1.orient) - self.assertEqual(ORIENT.RIGHT, bpp.break2.orient) - self.assertEqual('GGGAATTCCGGA', bpp.untemplated_seq) - self.assertEqual(9, bpp.break1.start) - self.assertEqual(100, bpp.break2.start) - self.assertEqual('AAATTTCCC', bpp.break1.seq) - self.assertEqual('TCGATCGAT', bpp.break2.seq) + assert bpp.break1.strand == STRAND.POS + assert bpp.break2.strand == STRAND.POS + assert bpp.break1.orient == ORIENT.LEFT + assert bpp.break2.orient == ORIENT.RIGHT + assert bpp.untemplated_seq == 'GGGAATTCCGGA' + assert bpp.break1.start == 9 + assert bpp.break2.start == 100 + assert bpp.break1.seq == 'AAATTTCCC' + assert bpp.break2.seq == 'TCGATCGAT' def test_read_pair_deletion(self): # seq AAATTTCCCGGGAATTCCGGATCGATCGAT @@ -474,13 +462,13 @@ def test_read_pair_deletion(self): is_reverse=False, ) bpp = align.call_paired_read_event(r1, r2, is_stranded=True) - self.assertEqual(STRAND.POS, bpp.break1.strand) - self.assertEqual(STRAND.POS, bpp.break2.strand) - self.assertEqual(ORIENT.LEFT, bpp.break1.orient) - self.assertEqual(ORIENT.RIGHT, bpp.break2.orient) - self.assertEqual('', bpp.untemplated_seq) - self.assertEqual(21, bpp.break1.start) - self.assertEqual(100, bpp.break2.start) + assert bpp.break1.strand == STRAND.POS + assert bpp.break2.strand == STRAND.POS + assert bpp.break1.orient == ORIENT.LEFT + assert bpp.break2.orient == ORIENT.RIGHT + assert bpp.untemplated_seq == '' + assert bpp.break1.start == 21 + assert bpp.break2.start == 100 def test_read_pair_translocation(self): # seq AAATTTCCCGGGAATTCCGGATCGATCGAT @@ -505,13 +493,13 @@ def test_read_pair_translocation(self): is_reverse=False, ) bpp = align.call_paired_read_event(r1, r2, is_stranded=True) - self.assertEqual(STRAND.POS, 
bpp.break1.strand) - self.assertEqual(STRAND.POS, bpp.break2.strand) - self.assertEqual(ORIENT.RIGHT, bpp.break1.orient) - self.assertEqual(ORIENT.LEFT, bpp.break2.orient) - self.assertEqual('1', bpp.break1.chr) - self.assertEqual('2', bpp.break2.chr) - self.assertEqual('', bpp.untemplated_seq) + assert bpp.break1.strand == STRAND.POS + assert bpp.break2.strand == STRAND.POS + assert bpp.break1.orient == ORIENT.RIGHT + assert bpp.break2.orient == ORIENT.LEFT + assert bpp.break1.chr == '1' + assert bpp.break2.chr == '2' + assert bpp.untemplated_seq == '' def test_read_pair_deletion_overlapping_query_coverage(self): # seq AAATTTCCCGGGAATTCCGGATCGATCGAT @@ -536,17 +524,17 @@ def test_read_pair_deletion_overlapping_query_coverage(self): query_sequence=seq, is_reverse=False, ) - self.assertEqual(21, r1.reference_end) + assert r1.reference_end == 21 bpp = align.call_paired_read_event(r1, r2, is_stranded=True) - self.assertEqual(STRAND.POS, bpp.break1.strand) - self.assertEqual(STRAND.POS, bpp.break2.strand) - self.assertEqual(ORIENT.LEFT, bpp.break1.orient) - self.assertEqual(ORIENT.RIGHT, bpp.break2.orient) - self.assertEqual('', bpp.untemplated_seq) - self.assertEqual(21, bpp.break1.start) - self.assertEqual(103, bpp.break2.start) - self.assertEqual('AAATTTCCCGGGAATTCCGGA', bpp.break1.seq) - self.assertEqual('TCGATCGAT', bpp.break2.seq) + assert bpp.break1.strand == STRAND.POS + assert bpp.break2.strand == STRAND.POS + assert bpp.break1.orient == ORIENT.LEFT + assert bpp.break2.orient == ORIENT.RIGHT + assert bpp.untemplated_seq == '' + assert bpp.break1.start == 21 + assert bpp.break2.start == 103 + assert bpp.break1.seq == 'AAATTTCCCGGGAATTCCGGA' + assert bpp.break2.seq == 'TCGATCGAT' def test_read_pair_inversion_overlapping_query_coverage(self): # seq AAATTTCCCGGGAATTCCGGATCGATCGAT @@ -573,15 +561,15 @@ def test_read_pair_inversion_overlapping_query_coverage(self): is_reverse=True, ) bpp = align.call_paired_read_event(r1, r2, is_stranded=True) - 
self.assertEqual(STRAND.POS, bpp.break1.strand) - self.assertEqual(STRAND.NEG, bpp.break2.strand) - self.assertEqual(ORIENT.LEFT, bpp.break1.orient) - self.assertEqual(ORIENT.LEFT, bpp.break2.orient) - self.assertEqual('', bpp.untemplated_seq) - self.assertEqual(21, bpp.break1.start) - self.assertEqual(108, bpp.break2.start) - self.assertEqual('AAATTTCCCGGGAATTCCGGA', bpp.break1.seq) - self.assertEqual(reverse_complement('TCGATCGAT'), bpp.break2.seq) + assert bpp.break1.strand == STRAND.POS + assert bpp.break2.strand == STRAND.NEG + assert bpp.break1.orient == ORIENT.LEFT + assert bpp.break2.orient == ORIENT.LEFT + assert bpp.untemplated_seq == '' + assert bpp.break1.start == 21 + assert bpp.break2.start == 108 + assert bpp.break1.seq == 'AAATTTCCCGGGAATTCCGGA' + assert bpp.break2.seq == reverse_complement('TCGATCGAT') def test_read_pair_large_inversion_overlapping_query_coverage(self): s = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT' @@ -601,24 +589,22 @@ def test_read_pair_large_inversion_overlapping_query_coverage(self): is_reverse=True, ) bpp = align.call_paired_read_event(read1, read2, is_stranded=True) - self.assertEqual(STRAND.POS, bpp.break1.strand) - self.assertEqual(STRAND.NEG, bpp.break2.strand) - self.assertEqual(ORIENT.RIGHT, bpp.break1.orient) - self.assertEqual(ORIENT.RIGHT, bpp.break2.orient) - self.assertEqual('', bpp.untemplated_seq) - self.assertEqual(1115, bpp.break1.start) - self.assertEqual(2188 + 3, bpp.break2.start) + assert bpp.break1.strand == STRAND.POS + assert bpp.break2.strand == STRAND.NEG + assert bpp.break1.orient == ORIENT.RIGHT + assert bpp.break2.orient == ORIENT.RIGHT + assert bpp.untemplated_seq == '' + assert bpp.break1.start == 1115 + assert bpp.break2.start == 2188 + 3 print(bpp.break1.seq) print(bpp.break2.seq) - 
self.assertEqual( - 'TCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAG' - 'GGTTTTCATTTCTGTATGTTAAT', - bpp.break1.seq, + assert ( + bpp.break1.seq + == 'TCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT' ) - self.assertEqual( - 'GCAGAGCTATATATTTAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCACGACCAGGAACCAAACAGGAAAGAATCCA' - 'AATTCTGTGTTTACAGGGCTTTCATGCTCAG', - bpp.break2.seq, + assert ( + bpp.break2.seq + == 'GCAGAGCTATATATTTAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCACGACCAGGAACCAAACAGGAAAGAATCCAAATTCTGTGTTTACAGGGCTTTCATGCTCAG' ) def test_read_pair_inversion_gap_in_query_coverage(self): @@ -646,18 +632,18 @@ def test_read_pair_inversion_gap_in_query_coverage(self): is_reverse=True, ) bpp = align.call_paired_read_event(r1, r2, is_stranded=True) - self.assertEqual(STRAND.POS, bpp.break1.strand) - self.assertEqual(STRAND.NEG, bpp.break2.strand) - self.assertEqual(ORIENT.LEFT, bpp.break1.orient) - self.assertEqual(ORIENT.LEFT, bpp.break2.orient) - self.assertEqual('CC', bpp.untemplated_seq) - self.assertEqual(16, bpp.break1.start) - self.assertEqual(111, bpp.break2.start) - self.assertEqual('AAATTTCCCGGGAATT', bpp.break1.seq) - self.assertEqual(reverse_complement('GGATCGATCGAT'), bpp.break2.seq) - - -class TestConvertToDuplication(unittest.TestCase): + assert bpp.break1.strand == STRAND.POS + assert bpp.break2.strand == STRAND.NEG + assert bpp.break1.orient == ORIENT.LEFT + assert bpp.break2.orient == ORIENT.LEFT + assert bpp.untemplated_seq == 'CC' + assert bpp.break1.start == 16 + assert bpp.break2.start == 111 + assert bpp.break1.seq == 'AAATTTCCCGGGAATT' + assert bpp.break2.seq == reverse_complement('GGATCGATCGAT') + + +class TestConvertToDuplication: def test_insertion_to_duplication(self): # BPP(Breakpoint(3:60204611L), Breakpoint(3:60204612R), opposing=False, seq='CATACATACATACATACATACATACATACATA') # insertion contig [seq2] 
contig_alignment_score: 0.99, contig_alignment_mq: Interval(255, 255) @@ -681,13 +667,13 @@ def test_insertion_to_duplication(self): setattr(bpp, 'read2', None) event = align.convert_to_duplication(bpp, reference_genome) print(event) - self.assertEqual(ORIENT.RIGHT, event.break1.orient) - self.assertEqual(60204588, event.break1.start) - self.assertEqual(ORIENT.LEFT, event.break2.orient) - self.assertEqual(60204611, event.break2.start) + assert event.break1.orient == ORIENT.RIGHT + assert event.break1.start == 60204588 + assert event.break2.orient == ORIENT.LEFT + assert event.break2.start == 60204611 # CATACATACATACATACATACATACATACATA # ........................******** - self.assertEqual('CATACATA', event.untemplated_seq) + assert event.untemplated_seq == 'CATACATA' def test_single_bp_insertion(self): bpp = BreakpointPair( @@ -704,14 +690,14 @@ def test_single_bp_insertion(self): setattr(bpp, 'read2', None) event = align.convert_to_duplication(bpp, reference_genome) print(event) - self.assertEqual(ORIENT.RIGHT, event.break1.orient) - self.assertEqual(121, event.break1.start) - self.assertEqual(ORIENT.LEFT, event.break2.orient) - self.assertEqual(121, event.break2.start) - self.assertEqual('', event.untemplated_seq) + assert event.break1.orient == ORIENT.RIGHT + assert event.break1.start == 121 + assert event.break2.orient == ORIENT.LEFT + assert event.break2.start == 121 + assert event.untemplated_seq == '' -class TestSelectContigAlignments(unittest.TestCase): +class TestSelectContigAlignments: def test_inversion_and_deletion(self): s = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT' evidence = MockObject( @@ -756,34 +742,30 @@ def test_inversion_and_deletion(self): raw_alignments = {s: [read1, read2]} align.select_contig_alignments(evidence, raw_alignments) alignments 
= list(evidence.contigs[0].alignments) - self.assertEqual(2, len(alignments)) + assert len(alignments) == 2 -class TestGetAlignerVersion(unittest.TestCase): +class TestGetAlignerVersion: def test_get_blat_36x2(self): content = 'blat - Standalone BLAT v. 36x2 fast sequence search command line tool\n' with mock.patch('subprocess.getoutput', mock.Mock(return_value=content)): - self.assertEqual('36x2', align.get_aligner_version(align.SUPPORTED_ALIGNER.BLAT)) + assert align.get_aligner_version(align.SUPPORTED_ALIGNER.BLAT) == '36x2' def test_get_blat_36(self): content = "blat - Standalone BLAT v. 36 fast sequence search command line tool" with mock.patch('subprocess.getoutput', mock.Mock(return_value=content)): - self.assertEqual('36', align.get_aligner_version(align.SUPPORTED_ALIGNER.BLAT)) + assert align.get_aligner_version(align.SUPPORTED_ALIGNER.BLAT) == '36' def test_get_bwa_0_7_15(self): content = ( "\nProgram: bwa (alignment via Burrows-Wheeler transformation)\nVersion: 0.7.15-r1140" ) with mock.patch('subprocess.getoutput', mock.Mock(return_value=content)): - self.assertEqual( - '0.7.15-r1140', align.get_aligner_version(align.SUPPORTED_ALIGNER.BWA_MEM) - ) + assert align.get_aligner_version(align.SUPPORTED_ALIGNER.BWA_MEM) == '0.7.15-r1140' def test_get_bwa_0_7_12(self): content = ( "\nProgram: bwa (alignment via Burrows-Wheeler transformation)\nVersion: 0.7.12-r1039" ) with mock.patch('subprocess.getoutput', mock.Mock(return_value=content)): - self.assertEqual( - '0.7.12-r1039', align.get_aligner_version(align.SUPPORTED_ALIGNER.BWA_MEM) - ) + assert align.get_aligner_version(align.SUPPORTED_ALIGNER.BWA_MEM) == '0.7.12-r1039' diff --git a/tests/integration/test_annotate.py b/tests/integration/test_annotate.py index 9532665c..bf816b70 100644 --- a/tests/integration/test_annotate.py +++ b/tests/integration/test_annotate.py @@ -1,6 +1,7 @@ -import os +import argparse import unittest +import pytest from mavis.annotate.base import BioInterval, ReferenceName from 
mavis.annotate.file_io import load_annotations, load_reference_genome from mavis.annotate.fusion import FusionTranscript, determine_prime @@ -11,16 +12,15 @@ _gather_annotations, _gather_breakpoint_annotations, annotate_events, - flatten_fusion_transcript, overlapping_transcripts, ) from mavis.breakpoint import Breakpoint, BreakpointPair -from mavis.constants import ORIENT, PRIME, PROTOCOL, SPLICE_TYPE, STRAND, SVTYPE, reverse_complement +from mavis.constants import ORIENT, PRIME, PROTOCOL, STRAND, SVTYPE, reverse_complement from mavis.error import NotSpecifiedError from mavis.interval import Interval from ..util import get_data -from . import MockLongString, MockObject, get_example_genes +from . import MockObject, get_example_genes REFERENCE_ANNOTATIONS = None REFERENCE_GENOME = None @@ -40,129 +40,156 @@ def setUpModule(): print('loaded the reference genome', get_data('mock_reference_genome.fa')) -class TestTemplate(unittest.TestCase): +class TestTemplate: def test_template_hashing(self): t = Template('1', 1, 10) d = {'1': 1, '2': 2, 1: '5'} - self.assertEqual('1', t.name) - self.assertEqual(1, d[t.name]) - self.assertEqual(1, d[t]) - - -class TestFusionTranscript(unittest.TestCase): - def setUp(self): - self.x = Interval(100, 199) # C - self.y = Interval(500, 599) # G - self.z = Interval(1200, 1299) # T - self.w = Interval(1500, 1599) # C - self.s = Interval(1700, 1799) # G - # introns: 99, 300, 600, 200, 100, ... 
- reference_sequence = 'A' * 99 + 'C' * 100 + 'A' * 300 + 'G' * 100 - reference_sequence += 'A' * 600 + 'T' * 100 + 'A' * 200 + 'C' * 100 - reference_sequence += 'A' * 100 + 'G' * 100 + 'A' * 200 + 'T' * 100 - - self.a = Interval(2000, 2099) # T - self.b = Interval(2600, 2699) # C - self.c = Interval(3000, 3099) # G - self.d = Interval(3300, 3399) # T - reference_sequence += 'A' * 500 + 'C' * 100 + 'A' * 300 + 'G' * 100 - reference_sequence += 'A' * 200 + 'T' * 100 + 'A' * 200 - self.reference_sequence = reference_sequence - - self.b1 = Interval(600, 699) # A - self.b2 = Interval(800, 899) # G - self.b3 = Interval(1100, 1199) # T - self.b4 = Interval(1400, 1499) # A - self.b5 = Interval(1700, 1799) # G - self.b6 = Interval(2100, 2199) # A - alternate_sequence = 'C' * 599 + 'A' * 100 + 'C' * 100 + 'G' * 100 - alternate_sequence += 'C' * 200 + 'T' * 100 + 'C' * 200 + 'A' * 100 - alternate_sequence += 'C' * 200 + 'G' * 100 + 'C' * 300 + 'A' * 100 - alternate_sequence += 'C' * 200 - self.alternate_sequence = alternate_sequence - - def test__pull_exons_left_pos_intronic(self): + assert t.name == '1' + assert d[t.name] == 1 + assert d[t] == 1 + + +@pytest.fixture +def intervals(): + n = argparse.Namespace() + n.x = Interval(100, 199) # C + n.y = Interval(500, 599) # G + n.z = Interval(1200, 1299) # T + n.w = Interval(1500, 1599) # C + n.s = Interval(1700, 1799) # G + # introns: 99, 300, 600, 200, 100, ... 
+ reference_sequence = 'A' * 99 + 'C' * 100 + 'A' * 300 + 'G' * 100 + reference_sequence += 'A' * 600 + 'T' * 100 + 'A' * 200 + 'C' * 100 + reference_sequence += 'A' * 100 + 'G' * 100 + 'A' * 200 + 'T' * 100 + + n.a = Interval(2000, 2099) # T + n.b = Interval(2600, 2699) # C + n.c = Interval(3000, 3099) # G + n.d = Interval(3300, 3399) # T + reference_sequence += 'A' * 500 + 'C' * 100 + 'A' * 300 + 'G' * 100 + reference_sequence += 'A' * 200 + 'T' * 100 + 'A' * 200 + n.reference_sequence = reference_sequence + + n.b1 = Interval(600, 699) # A + n.b2 = Interval(800, 899) # G + n.b3 = Interval(1100, 1199) # T + n.b4 = Interval(1400, 1499) # A + n.b5 = Interval(1700, 1799) # G + n.b6 = Interval(2100, 2199) # A + alternate_sequence = 'C' * 599 + 'A' * 100 + 'C' * 100 + 'G' * 100 + alternate_sequence += 'C' * 200 + 'T' * 100 + 'C' * 200 + 'A' * 100 + alternate_sequence += 'C' * 200 + 'G' * 100 + 'C' * 300 + 'A' * 100 + alternate_sequence += 'C' * 200 + n.alternate_sequence = alternate_sequence + return n + + +class TestFusionTranscript: + def test__pull_exons_left_pos_intronic(self, intervals): # 100-199, 500-599, 1200-1299, 1500-1599, 1700-1799 - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) b = Breakpoint(REF_CHR, 700, orient=ORIENT.LEFT) - seq, new_exons = FusionTranscript._pull_exons(t, b, self.reference_sequence) - expt = 'C' * len(self.x) + 'A' * (499 - 200 + 1) + 'G' * len(self.y) + 'A' * (700 - 600 + 1) - self.assertEqual(expt, seq) - self.assertEqual(2, len(new_exons)) + seq, new_exons = FusionTranscript._pull_exons(t, b, intervals.reference_sequence) + expt = ( + 'C' * len(intervals.x) + + 'A' * (499 - 200 + 1) + + 'G' * len(intervals.y) + + 'A' * (700 - 600 + 1) + ) + assert seq == expt + assert len(new_exons) == 2 e = new_exons[0][0] - self.assertEqual(1, e.start) - self.assertEqual(100, e.end) - 
self.assertEqual(True, e.start_splice_site.intact) - self.assertEqual(True, e.end_splice_site.intact) + assert e.start == 1 + assert e.end == 100 + assert e.start_splice_site.intact is True + assert e.end_splice_site.intact is True - def test__pull_exons_left_pos_intronic_splice(self): - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) + def test__pull_exons_left_pos_intronic_splice(self, intervals): + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) b = Breakpoint(REF_CHR, 201, orient=ORIENT.LEFT) - seq, new_exons = FusionTranscript._pull_exons(t, b, self.reference_sequence) + seq, new_exons = FusionTranscript._pull_exons(t, b, intervals.reference_sequence) expt = 'C' * 100 + 'A' * 2 - self.assertEqual(expt, seq) - self.assertEqual(1, len(new_exons)) + assert seq == expt + assert len(new_exons) == 1 e = new_exons[0][0] - self.assertEqual(1, e.start) - self.assertEqual(100, e.end) - self.assertEqual(True, e.start_splice_site.intact) - self.assertEqual(False, e.end_splice_site.intact) + assert e.start == 1 + assert e.end == 100 + assert e.start_splice_site.intact is True + assert e.end_splice_site.intact is False - def test__pull_exons_left_pos_exonic(self): - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) + def test__pull_exons_left_pos_exonic(self, intervals): + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) print('transcriptt exons:', t.exons) b = Breakpoint(REF_CHR, 199, orient=ORIENT.LEFT) - seq, new_exons = FusionTranscript._pull_exons(t, b, self.reference_sequence) + seq, new_exons = FusionTranscript._pull_exons(t, b, intervals.reference_sequence) expt = 'C' * 100 - self.assertEqual(expt, seq) - self.assertEqual(1, len(new_exons)) + assert seq == expt + assert len(new_exons) == 1 e = new_exons[0][0] - self.assertEqual(1, e.start) - 
self.assertEqual(100, e.end) - self.assertEqual(True, e.start_splice_site.intact) - self.assertEqual(False, e.end_splice_site.intact) + assert e.start == 1 + assert e.end == 100 + assert e.start_splice_site.intact is True + assert e.end_splice_site.intact is False - def test__pull_exons_left_pos_exonic_splice(self): + def test__pull_exons_left_pos_exonic_splice(self, intervals): # 100-199, 500-599, 1200-1299, 1500-1599, 1700-1799 - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) b = Breakpoint(REF_CHR, 101, orient=ORIENT.LEFT) - seq, new_exons = FusionTranscript._pull_exons(t, b, self.reference_sequence) + seq, new_exons = FusionTranscript._pull_exons(t, b, intervals.reference_sequence) expt = 'C' * 2 - self.assertEqual(expt, seq) - self.assertEqual(1, len(new_exons)) + assert seq == expt + assert len(new_exons) == 1 e = new_exons[0][0] - self.assertEqual(1, e.start) - self.assertEqual(2, e.end) - self.assertEqual(False, e.start_splice_site.intact) - self.assertEqual(False, e.end_splice_site.intact) + assert e.start == 1 + assert e.end == 2 + assert e.start_splice_site.intact is False + assert e.end_splice_site.intact is False - def test__pull_exons_right_pos_intronic(self): + def test__pull_exons_right_pos_intronic(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) b = Breakpoint(REF_CHR, 1600, orient=ORIENT.RIGHT) - seq, new_exons = FusionTranscript._pull_exons(t, b, self.reference_sequence) - expt = 'A' * (1699 - 1600 + 1) + 'G' * len(self.s) - self.assertEqual(expt, seq) - self.assertEqual(1, len(new_exons)) + seq, new_exons = FusionTranscript._pull_exons(t, b, 
intervals.reference_sequence) + expt = 'A' * (1699 - 1600 + 1) + 'G' * len(intervals.s) + assert seq == expt + assert len(new_exons) == 1 b = Breakpoint(REF_CHR, 300, orient=ORIENT.RIGHT) - seq, new_exons = FusionTranscript._pull_exons(t, b, self.reference_sequence) + seq, new_exons = FusionTranscript._pull_exons(t, b, intervals.reference_sequence) expt = 'A' * (499 - 300 + 1) + 'G' * 100 + 'A' * (1199 - 600 + 1) + 'T' * 100 expt += 'A' * (1499 - 1300 + 1) + 'C' * 100 + 'A' * (1699 - 1600 + 1) + 'G' * 100 - self.assertEqual(expt, seq) - self.assertEqual(4, len(new_exons)) + assert seq == expt + assert len(new_exons) == 4 e = new_exons[0][0] - self.assertEqual(201, e.start) - self.assertEqual(300, e.end) - self.assertEqual(True, e.start_splice_site.intact) - self.assertEqual(True, e.end_splice_site.intact) + assert e.start == 201 + assert e.end == 300 + assert e.start_splice_site.intact is True + assert e.end_splice_site.intact is True - def test__pull_exons_right_pos_intronic_splice(self): + def test__pull_exons_right_pos_intronic_splice(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) b = Breakpoint(REF_CHR, 1198, orient=ORIENT.RIGHT) - seq, new_exons = FusionTranscript._pull_exons(t, b, self.reference_sequence) + seq, new_exons = FusionTranscript._pull_exons(t, b, intervals.reference_sequence) expt = ( 'AA' + 'T' * 100 @@ -171,64 +198,76 @@ def test__pull_exons_right_pos_intronic_splice(self): + 'A' * (1699 - 1600 + 1) + 'G' * 100 ) - self.assertEqual(expt, seq) - self.assertEqual(3, len(new_exons)) + assert seq == expt + assert len(new_exons) == 3 e = new_exons[0][0] - self.assertEqual(False, e.start_splice_site.intact) - self.assertEqual(True, e.end_splice_site.intact) + assert e.start_splice_site.intact is False + assert 
e.end_splice_site.intact is True - def test__pull_exons_right_pos_exonic(self): + def test__pull_exons_right_pos_exonic(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) b = Breakpoint(REF_CHR, 1201, orient=ORIENT.RIGHT) - seq, new_exons = FusionTranscript._pull_exons(t, b, self.reference_sequence) + seq, new_exons = FusionTranscript._pull_exons(t, b, intervals.reference_sequence) expt = 'T' * 99 + 'A' * (1499 - 1300 + 1) + 'C' * 100 + 'A' * (1699 - 1600 + 1) + 'G' * 100 - self.assertEqual(expt, seq) - self.assertEqual(3, len(new_exons)) + assert seq == expt + assert len(new_exons) == 3 e = new_exons[0][0] - self.assertEqual(False, e.start_splice_site.intact) - self.assertEqual(True, e.end_splice_site.intact) + assert e.start_splice_site.intact is False + assert e.end_splice_site.intact is True - def test__pull_exons_right_pos_exonic_splice(self): + def test__pull_exons_right_pos_exonic_splice(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) b = Breakpoint(REF_CHR, 1298, orient=ORIENT.RIGHT) - seq, new_exons = FusionTranscript._pull_exons(t, b, self.reference_sequence) + seq, new_exons = FusionTranscript._pull_exons(t, b, intervals.reference_sequence) expt = 'TT' + 'A' * (1499 - 1300 + 1) + 'C' * 100 + 'A' * (1699 - 1600 + 1) + 'G' * 100 - self.assertEqual(expt, seq) - self.assertEqual(3, len(new_exons)) + assert seq == expt + assert len(new_exons) == 3 e = new_exons[0][0] - self.assertEqual(False, e.start_splice_site.intact) - self.assertEqual(False, e.end_splice_site.intact) + assert 
e.start_splice_site.intact is False + assert e.end_splice_site.intact is False - def test__pull_exons_right_neg_intronic(self): + def test__pull_exons_right_neg_intronic(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.NEG) + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.NEG, + ) b = Breakpoint(REF_CHR, 700, orient=ORIENT.RIGHT) - seq, new_exons = FusionTranscript._pull_exons(t, b, self.reference_sequence) + seq, new_exons = FusionTranscript._pull_exons(t, b, intervals.reference_sequence) expt = 'A' * (1199 - 700 + 1) + 'T' * 100 + 'A' * (1499 - 1300 + 1) + 'C' * 100 expt += 'A' * (1699 - 1600 + 1) + 'G' * 100 expt = reverse_complement(expt) - self.assertEqual(expt, seq) - self.assertEqual(3, len(new_exons)) + assert seq == expt + assert len(new_exons) == 3 e = new_exons[0][0] - self.assertEqual(True, e.start_splice_site.intact) - self.assertEqual(True, e.end_splice_site.intact) - self.assertEqual(1, e.start) - self.assertEqual(100, e.end) - self.assertEqual('C' * 100, seq[e.start - 1 : e.end]) + assert e.start_splice_site.intact is True + assert e.end_splice_site.intact is True + assert e.start == 1 + assert e.end == 100 + assert seq[e.start - 1 : e.end] == 'C' * 100 e = new_exons[1][0] - self.assertEqual(True, e.start_splice_site.intact) - self.assertEqual(True, e.end_splice_site.intact) - self.assertEqual(201, e.start) - self.assertEqual(300, e.end) - self.assertEqual('G' * 100, seq[e.start - 1 : e.end]) + assert e.start_splice_site.intact is True + assert e.end_splice_site.intact is True + assert e.start == 201 + assert e.end == 300 + assert seq[e.start - 1 : e.end] == 'G' * 100 - def test__pull_exons_right_neg_intronic_splice(self): + def test__pull_exons_right_neg_intronic_splice(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 - t = 
PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.NEG) + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.NEG, + ) b = Breakpoint(REF_CHR, 1198, orient=ORIENT.RIGHT) - seq, new_exons = FusionTranscript._pull_exons(t, b, self.reference_sequence) + seq, new_exons = FusionTranscript._pull_exons(t, b, intervals.reference_sequence) expt = ( 'AA' + 'T' * 100 @@ -238,63 +277,66 @@ def test__pull_exons_right_neg_intronic_splice(self): + 'G' * 100 ) expt = reverse_complement(expt) - self.assertEqual(expt, seq) - self.assertEqual(3, len(new_exons)) + assert seq == expt + assert len(new_exons) == 3 e = new_exons[0][0] - self.assertEqual(True, e.start_splice_site.intact) - self.assertEqual(True, e.end_splice_site.intact) - self.assertEqual(1, e.start) - self.assertEqual(100, e.end) - self.assertEqual('C' * 100, seq[e.start - 1 : e.end]) + assert e.start_splice_site.intact is True + assert e.end_splice_site.intact is True + assert e.start == 1 + assert e.end == 100 + assert seq[e.start - 1 : e.end] == 'C' * 100 e = new_exons[1][0] - self.assertEqual(True, e.start_splice_site.intact) - self.assertEqual(True, e.end_splice_site.intact) - self.assertEqual(201, e.start) - self.assertEqual(300, e.end) - self.assertEqual('G' * 100, seq[e.start - 1 : e.end]) + assert e.start_splice_site.intact is True + assert e.end_splice_site.intact is True + assert e.start == 201 + assert e.end == 300 + assert seq[e.start - 1 : e.end] == 'G' * 100 e = new_exons[2][0] - self.assertEqual(True, e.start_splice_site.intact) - self.assertEqual(False, e.end_splice_site.intact) - self.assertEqual(501, e.start) - self.assertEqual(600, e.end) - self.assertEqual('A' * 100, seq[e.start - 1 : e.end]) + assert e.start_splice_site.intact is True + assert e.end_splice_site.intact is False + assert e.start == 501 + assert e.end == 600 + assert seq[e.start - 1 : e.end] == 'A' * 100 - def test_build_single_transcript_indel(self): + 
def test_build_single_transcript_indel(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) b1 = Breakpoint(REF_CHR, 599, orient=ORIENT.LEFT) b2 = Breakpoint(REF_CHR, 1200, orient=ORIENT.RIGHT) bpp = BreakpointPair(b1, b2, opposing_strands=False, untemplated_seq='ATCGATCG') - ref = {REF_CHR: MockObject(seq=self.reference_sequence)} + ref = {REF_CHR: MockObject(seq=intervals.reference_sequence)} ann = Annotation( bpp, transcript1=t, transcript2=t, event_type=SVTYPE.DEL, protocol=PROTOCOL.GENOME ) ft = FusionTranscript.build(ann, ref) expt = ( - 'C' * len(self.x) + 'C' * len(intervals.x) + 'A' * (499 - 200 + 1) - + 'G' * len(self.y) + + 'G' * len(intervals.y) + 'ATCGATCG' - + 'T' * len(self.z) + + 'T' * len(intervals.z) ) expt += ( 'A' * (1499 - 1300 + 1) - + 'C' * len(self.w) + + 'C' * len(intervals.w) + 'A' * (1699 - 1600 + 1) - + 'G' * len(self.s) + + 'G' * len(intervals.s) ) - self.assertEqual(expt, ft.seq) - self.assertEqual(5, len(ft.exons)) + assert ft.seq == expt + assert len(ft.exons) == 5 for i, ex in enumerate(t.exons): n = ft.exons[i] - self.assertEqual(ex, ft.exon_mapping[n.position]) + assert ft.exon_mapping[n.position] == ex - self.assertEqual(1, ft.exons[0].start) - self.assertEqual(100, ft.exons[0].end) + assert ft.exons[0].start == 1 + assert ft.exons[0].end == 100 splice_pattern = [(True, True), (True, False), (False, True), (True, True), (True, True)] char_pattern = [x * 100 for x in ['C', 'G', 'T', 'C', 'G']] @@ -302,60 +344,72 @@ def test_build_single_transcript_indel(self): for i in range(0, len(splice_pattern)): s, t = splice_pattern[i] ex = ft.exons[i] - self.assertEqual(s, ex.start_splice_site.intact) - self.assertEqual(t, ex.end_splice_site.intact) - 
self.assertEqual(char_pattern[i], ft.seq[ex.start - 1 : ex.end]) + assert ex.start_splice_site.intact == s + assert ex.end_splice_site.intact == t + assert ft.seq[ex.start - 1 : ex.end] == char_pattern[i] - def test_build_single_transcript_inversion(self): + def test_build_single_transcript_inversion(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) b1 = Breakpoint(REF_CHR, 1199, orient=ORIENT.LEFT) b2 = Breakpoint(REF_CHR, 1299, orient=ORIENT.LEFT) bpp = BreakpointPair(b1, b2, opposing_strands=True, untemplated_seq='ATCGTC') - ref = {REF_CHR: MockObject(seq=self.reference_sequence)} + ref = {REF_CHR: MockObject(seq=intervals.reference_sequence)} ann = Annotation( bpp, transcript1=t, transcript2=t, event_type=SVTYPE.INV, protocol=PROTOCOL.GENOME ) ft = FusionTranscript.build(ann, ref) expt = ( - 'C' * len(self.x) + 'A' * (499 - 200 + 1) + 'G' * len(self.y) + 'A' * (1199 - 600 + 1) + 'C' * len(intervals.x) + + 'A' * (499 - 200 + 1) + + 'G' * len(intervals.y) + + 'A' * (1199 - 600 + 1) ) - expt += 'ATCGTC' + 'A' * len(self.z) + expt += 'ATCGTC' + 'A' * len(intervals.z) expt += ( 'A' * (1499 - 1300 + 1) - + 'C' * len(self.w) + + 'C' * len(intervals.w) + 'A' * (1699 - 1600 + 1) - + 'G' * len(self.s) + + 'G' * len(intervals.s) ) exons = [(1, 100), (401, 500), (1407, 1506), (1607, 1706)] for i in range(len(exons)): - self.assertEqual(exons[i][0], ft.exons[i].start) - self.assertEqual(exons[i][1], ft.exons[i].end) - self.assertEqual(expt, ft.seq) - self.assertEqual(4, len(ft.exons)) + assert ft.exons[i].start == exons[i][0] + assert ft.exons[i].end == exons[i][1] + assert ft.seq == expt + assert len(ft.exons) == 4 - def test_build_single_transcript_inversion_transcriptome(self): + def 
test_build_single_transcript_inversion_transcriptome(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) b1 = Breakpoint(REF_CHR, 1199, orient=ORIENT.LEFT) b2 = Breakpoint(REF_CHR, 1299, orient=ORIENT.LEFT) bpp = BreakpointPair(b1, b2, opposing_strands=True, untemplated_seq='ATCGTC') - ref = {REF_CHR: MockObject(seq=self.reference_sequence)} + ref = {REF_CHR: MockObject(seq=intervals.reference_sequence)} ann = Annotation( bpp, transcript1=t, transcript2=t, event_type=SVTYPE.INV, protocol=PROTOCOL.TRANS ) ft = FusionTranscript.build(ann, ref) expt = ( - 'C' * len(self.x) + 'A' * (499 - 200 + 1) + 'G' * len(self.y) + 'A' * (1199 - 600 + 1) + 'C' * len(intervals.x) + + 'A' * (499 - 200 + 1) + + 'G' * len(intervals.y) + + 'A' * (1199 - 600 + 1) ) - expt += 'ATCGTC' + 'A' * len(self.z) + expt += 'ATCGTC' + 'A' * len(intervals.z) expt += ( 'A' * (1499 - 1300 + 1) - + 'C' * len(self.w) + + 'C' * len(intervals.w) + 'A' * (1699 - 1600 + 1) - + 'G' * len(self.s) + + 'G' * len(intervals.s) ) exons = [ Exon(1, 100, strand=STRAND.POS), @@ -366,106 +420,119 @@ def test_build_single_transcript_inversion_transcriptome(self): ] print(ft.exons) for i in range(len(exons)): - self.assertEqual(exons[i].start, ft.exons[i].start) - self.assertEqual(exons[i].end, ft.exons[i].end) - self.assertEqual( - exons[i].start_splice_site.intact, ft.exons[i].start_splice_site.intact - ) - self.assertEqual(exons[i].end_splice_site.intact, ft.exons[i].end_splice_site.intact) - self.assertEqual(expt, ft.seq) - self.assertEqual(5, len(ft.exons)) - - def test_build_single_transcript_inversion_neg(self): + assert ft.exons[i].start == exons[i].start + assert ft.exons[i].end == exons[i].end + assert 
ft.exons[i].start_splice_site.intact == exons[i].start_splice_site.intact + assert ft.exons[i].end_splice_site.intact == exons[i].end_splice_site.intact + assert ft.seq == expt + assert len(ft.exons) == 5 + + def test_build_single_transcript_inversion_neg(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.NEG) + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.NEG, + ) b1 = Breakpoint(REF_CHR, 1300, orient=ORIENT.RIGHT) b2 = Breakpoint(REF_CHR, 1200, orient=ORIENT.RIGHT) bpp = BreakpointPair(b1, b2, opposing_strands=True, untemplated_seq='ATCGTC') - ref = {REF_CHR: MockObject(seq=self.reference_sequence)} + ref = {REF_CHR: MockObject(seq=intervals.reference_sequence)} ann = Annotation( bpp, transcript1=t, transcript2=t, event_type=SVTYPE.INV, protocol=PROTOCOL.GENOME ) ft = FusionTranscript.build(ann, ref) expt = ( - 'C' * len(self.s) + 'C' * len(intervals.s) + 'T' * (1699 - 1600 + 1) - + 'G' * len(self.w) + + 'G' * len(intervals.w) + 'T' * (1499 - 1300 + 1) ) - expt += 'T' * len(self.z) + 'GACGAT' + 'T' * (1199 - 600 + 1) + 'C' * len(self.y) - expt += 'T' * (499 - 200 + 1) + 'G' * len(self.x) + expt += 'T' * len(intervals.z) + 'GACGAT' + 'T' * (1199 - 600 + 1) + 'C' * len(intervals.y) + expt += 'T' * (499 - 200 + 1) + 'G' * len(intervals.x) exons = [(1, 100), (201, 300), (1207, 1306), (1607, 1706)] for i in range(len(exons)): - self.assertEqual(exons[i][0], ft.exons[i].start) - self.assertEqual(exons[i][1], ft.exons[i].end) - self.assertEqual(expt, ft.seq) - self.assertEqual(4, len(ft.exons)) + assert ft.exons[i].start == exons[i][0] + assert ft.exons[i].end == exons[i][1] + assert ft.seq == expt + assert len(ft.exons) == 4 - def test_build_single_transcript_duplication_pos(self): + def test_build_single_transcript_duplication_pos(self, intervals): 
# x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) b1 = Breakpoint(REF_CHR, 1200, orient=ORIENT.RIGHT) b2 = Breakpoint(REF_CHR, 1299, orient=ORIENT.LEFT) bpp = BreakpointPair(b1, b2, opposing_strands=False, untemplated_seq='ATCGATCG') - ref = {REF_CHR: MockObject(seq=self.reference_sequence)} + ref = {REF_CHR: MockObject(seq=intervals.reference_sequence)} ann = Annotation( bpp, transcript1=t, transcript2=t, event_type=SVTYPE.DUP, protocol=PROTOCOL.GENOME ) ft = FusionTranscript.build(ann, ref) - self.assertEqual(STRAND.POS, ft.get_strand()) + assert ft.get_strand() == STRAND.POS expt = ( - 'C' * len(self.x) + 'A' * (499 - 200 + 1) + 'G' * len(self.y) + 'A' * (1199 - 600 + 1) + 'C' * len(intervals.x) + + 'A' * (499 - 200 + 1) + + 'G' * len(intervals.y) + + 'A' * (1199 - 600 + 1) ) - expt += 'T' * len(self.z) + 'ATCGATCG' + 'T' * len(self.z) + expt += 'T' * len(intervals.z) + 'ATCGATCG' + 'T' * len(intervals.z) expt += ( 'A' * (1499 - 1300 + 1) - + 'C' * len(self.w) + + 'C' * len(intervals.w) + 'A' * (1699 - 1600 + 1) - + 'G' * len(self.s) + + 'G' * len(intervals.s) ) - self.assertEqual(expt, ft.seq) + assert ft.seq == expt exons = [(1, 100), (401, 500), (1101, 1200), (1209, 1308), (1509, 1608), (1709, 1808)] for i in range(len(exons)): - self.assertEqual(exons[i][0], ft.exons[i].start) - self.assertEqual(exons[i][1], ft.exons[i].end) + assert ft.exons[i].start == exons[i][0] + assert ft.exons[i].end == exons[i][1] - self.assertEqual(6, len(ft.exons)) - self.assertTrue(ft.exons[2].start_splice_site.intact) - self.assertTrue(ft.exons[3].end_splice_site.intact) - self.assertFalse(ft.exons[2].end_splice_site.intact) - self.assertFalse(ft.exons[3].start_splice_site.intact) + assert len(ft.exons) == 6 + assert 
ft.exons[2].start_splice_site.intact + assert ft.exons[3].end_splice_site.intact + assert not ft.exons[2].end_splice_site.intact + assert not ft.exons[3].start_splice_site.intact - def test_build_single_transcript_duplication_pos_transcriptome(self): + def test_build_single_transcript_duplication_pos_transcriptome(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) b1 = Breakpoint(REF_CHR, 1200, orient=ORIENT.RIGHT) b2 = Breakpoint(REF_CHR, 1299, orient=ORIENT.LEFT) bpp = BreakpointPair(b1, b2, opposing_strands=False, untemplated_seq='ATCGATCG') - ref = {REF_CHR: MockObject(seq=self.reference_sequence)} + ref = {REF_CHR: MockObject(seq=intervals.reference_sequence)} ann = Annotation( bpp, transcript1=t, transcript2=t, event_type=SVTYPE.DUP, protocol=PROTOCOL.TRANS ) ft = FusionTranscript.build(ann, ref) - self.assertEqual(STRAND.POS, ft.get_strand()) + assert ft.get_strand() == STRAND.POS expt = ( - 'C' * len(self.x) + 'A' * (499 - 200 + 1) + 'G' * len(self.y) + 'A' * (1199 - 600 + 1) + 'C' * len(intervals.x) + + 'A' * (499 - 200 + 1) + + 'G' * len(intervals.y) + + 'A' * (1199 - 600 + 1) ) - expt += 'T' * len(self.z) + 'ATCGATCG' + 'T' * len(self.z) + expt += 'T' * len(intervals.z) + 'ATCGATCG' + 'T' * len(intervals.z) expt += ( 'A' * (1499 - 1300 + 1) - + 'C' * len(self.w) + + 'C' * len(intervals.w) + 'A' * (1699 - 1600 + 1) - + 'G' * len(self.s) + + 'G' * len(intervals.s) ) - self.assertEqual(expt, ft.seq) + assert ft.seq == expt exons = [ Exon(1, 100, strand=STRAND.POS), Exon(401, 500, strand=STRAND.POS), @@ -477,241 +544,290 @@ def test_build_single_transcript_duplication_pos_transcriptome(self): ] print(ft.exons) for i in range(len(exons)): - self.assertEqual(exons[i].start, 
ft.exons[i].start) - self.assertEqual(exons[i].end, ft.exons[i].end) - self.assertEqual( - exons[i].start_splice_site.intact, ft.exons[i].start_splice_site.intact - ) - self.assertEqual(exons[i].end_splice_site.intact, ft.exons[i].end_splice_site.intact) + assert ft.exons[i].start == exons[i].start + assert ft.exons[i].end == exons[i].end + assert ft.exons[i].start_splice_site.intact == exons[i].start_splice_site.intact + assert ft.exons[i].end_splice_site.intact == exons[i].end_splice_site.intact - self.assertEqual(7, len(ft.exons)) + assert len(ft.exons) == 7 - def test_build_single_transcript_duplication_neg(self): + def test_build_single_transcript_duplication_neg(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG - t = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.NEG) + t = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.NEG, + ) b1 = Breakpoint(REF_CHR, 1200, orient=ORIENT.RIGHT) b2 = Breakpoint(REF_CHR, 1299, orient=ORIENT.LEFT) bpp = BreakpointPair(b1, b2, opposing_strands=False, untemplated_seq='ATCGATCG') - ref = {REF_CHR: MockObject(seq=self.reference_sequence)} + ref = {REF_CHR: MockObject(seq=intervals.reference_sequence)} ann = Annotation( bpp, transcript1=t, transcript2=t, event_type=SVTYPE.DUP, protocol=PROTOCOL.GENOME ) ft = FusionTranscript.build(ann, ref) expt = ( - 'C' * len(self.x) + 'A' * (499 - 200 + 1) + 'G' * len(self.y) + 'A' * (1199 - 600 + 1) + 'C' * len(intervals.x) + + 'A' * (499 - 200 + 1) + + 'G' * len(intervals.y) + + 'A' * (1199 - 600 + 1) ) - expt += 'T' * len(self.z) + 'ATCGATCG' + 'T' * len(self.z) + expt += 'T' * len(intervals.z) + 'ATCGATCG' + 'T' * len(intervals.z) expt += ( 'A' * (1499 - 1300 + 1) - + 'C' * len(self.w) + + 'C' * len(intervals.w) + 'A' * (1699 - 1600 + 1) - + 'G' * len(self.s) + + 'G' * len(intervals.s) ) expt = reverse_complement(expt) - 
self.assertEqual(expt, ft.seq) + assert ft.seq == expt exons = [(1, 100), (201, 300), (501, 600), (609, 708), (1309, 1408), (1709, 1808)] for i in range(len(exons)): - self.assertEqual(exons[i][0], ft.exons[i].start) - self.assertEqual(exons[i][1], ft.exons[i].end) - - self.assertEqual(6, len(ft.exons)) - self.assertTrue(ft.exons[2].start_splice_site.intact) - self.assertTrue(ft.exons[3].end_splice_site.intact) - self.assertFalse(ft.exons[2].end_splice_site.intact) - self.assertFalse(ft.exons[3].start_splice_site.intact) - self.assertEqual(3, ft.exon_number(ft.exons[2])) - self.assertEqual(3, ft.exon_number(ft.exons[3])) - - def test_build_two_transcript_inversion_5prime_pos(self): + assert ft.exons[i].start == exons[i][0] + assert ft.exons[i].end == exons[i][1] + + assert len(ft.exons) == 6 + assert ft.exons[2].start_splice_site.intact + assert ft.exons[3].end_splice_site.intact + assert not ft.exons[2].end_splice_site.intact + assert not ft.exons[3].start_splice_site.intact + assert ft.exon_number(ft.exons[2]) == 3 + assert ft.exon_number(ft.exons[3]) == 3 + + def test_build_two_transcript_inversion_5prime_pos(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG # a:2000-2099, b:2600-2699, c:3000-3099, d:3300-3399 # TTTTTTTTT CCCCCCCCC GGGGGGGGG TTTTTTTTT - t1 = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) - t2 = PreTranscript(exons=[self.a, self.b, self.c, self.d], strand=STRAND.NEG) + t1 = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) + t2 = PreTranscript( + exons=[intervals.a, intervals.b, intervals.c, intervals.d], strand=STRAND.NEG + ) b1 = Breakpoint(REF_CHR, 1199, orient=ORIENT.LEFT) b2 = Breakpoint(REF_CHR, 2699, orient=ORIENT.LEFT) bpp = BreakpointPair(b1, b2, opposing_strands=True, untemplated_seq='ATCGACTC') - ref = {REF_CHR: MockObject(seq=self.reference_sequence)} + ref = 
{REF_CHR: MockObject(seq=intervals.reference_sequence)} ann = Annotation( bpp, transcript1=t1, transcript2=t2, event_type=SVTYPE.INV, protocol=PROTOCOL.GENOME ) ft = FusionTranscript.build(ann, ref) expt = ( - 'C' * len(self.x) + 'A' * (499 - 200 + 1) + 'G' * len(self.y) + 'A' * (1199 - 600 + 1) - ) - expt += 'ATCGACTC' + 'G' * len(self.b) + 'T' * (2599 - 2100 + 1) + 'A' * len(self.a) - self.assertEqual(expt, ft.seq) - self.assertEqual(4, len(ft.exons)) - self.assertTrue(ft.exons[3].end_splice_site.intact) - self.assertFalse(ft.exons[2].start_splice_site.intact) - self.assertTrue(ft.exons[2].end_splice_site.intact) - self.assertEqual(2, ft.exon_number(ft.exons[1])) - self.assertEqual(3, ft.exon_number(ft.exons[2])) - - def test_build_two_transcript_inversion_5prime_neg(self): + 'C' * len(intervals.x) + + 'A' * (499 - 200 + 1) + + 'G' * len(intervals.y) + + 'A' * (1199 - 600 + 1) + ) + expt += ( + 'ATCGACTC' + 'G' * len(intervals.b) + 'T' * (2599 - 2100 + 1) + 'A' * len(intervals.a) + ) + assert ft.seq == expt + assert len(ft.exons) == 4 + assert ft.exons[3].end_splice_site.intact + assert not ft.exons[2].start_splice_site.intact + assert ft.exons[2].end_splice_site.intact + assert ft.exon_number(ft.exons[1]) == 2 + assert ft.exon_number(ft.exons[2]) == 3 + + def test_build_two_transcript_inversion_5prime_neg(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG # a:2000-2099, b:2600-2699, c:3000-3099, d:3300-3399 # TTTTTTTTT CCCCCCCCC GGGGGGGGG TTTTTTTTT - t1 = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.NEG) - t2 = PreTranscript(exons=[self.a, self.b, self.c, self.d], strand=STRAND.POS) + t1 = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.NEG, + ) + t2 = PreTranscript( + exons=[intervals.a, intervals.b, intervals.c, intervals.d], strand=STRAND.POS + ) b1 = Breakpoint(REF_CHR, 1199, orient=ORIENT.LEFT) 
b2 = Breakpoint(REF_CHR, 2699, orient=ORIENT.LEFT) bpp = BreakpointPair(b1, b2, opposing_strands=True, untemplated_seq='ATCGACTC') - ref = {REF_CHR: MockObject(seq=self.reference_sequence)} + ref = {REF_CHR: MockObject(seq=intervals.reference_sequence)} ann = Annotation( bpp, transcript1=t1, transcript2=t2, event_type=SVTYPE.INV, protocol=PROTOCOL.GENOME ) ft = FusionTranscript.build(ann, ref) - expt = 'T' * len(self.a) + 'A' * (2599 - 2100 + 1) + 'C' * len(self.b) + 'ATCGACTC' + expt = ( + 'T' * len(intervals.a) + 'A' * (2599 - 2100 + 1) + 'C' * len(intervals.b) + 'ATCGACTC' + ) expt += ( - 'T' * (1199 - 600 + 1) + 'C' * len(self.y) + 'T' * (499 - 200 + 1) + 'G' * len(self.x) + 'T' * (1199 - 600 + 1) + + 'C' * len(intervals.y) + + 'T' * (499 - 200 + 1) + + 'G' * len(intervals.x) ) - self.assertEqual(4, len(ft.exons)) - self.assertEqual(2, ft.exon_number(ft.exons[1])) - self.assertEqual(4, ft.exon_number(ft.exons[2])) - self.assertEqual(expt, ft.seq) + assert len(ft.exons) == 4 + assert ft.exon_number(ft.exons[1]) == 2 + assert ft.exon_number(ft.exons[2]) == 4 + assert ft.seq == expt - def test_build_two_transcript_duplication_pos(self): + def test_build_two_transcript_duplication_pos(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG # a:2000-2099, b:2600-2699, c:3000-3099, d:3300-3399 # TTTTTTTTT CCCCCCCCC GGGGGGGGG TTTTTTTTT - t1 = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) - t2 = PreTranscript(exons=[self.a, self.b, self.c, self.d], strand=STRAND.POS) + t1 = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) + t2 = PreTranscript( + exons=[intervals.a, intervals.b, intervals.c, intervals.d], strand=STRAND.POS + ) b1 = Breakpoint(REF_CHR, 1200, orient=ORIENT.RIGHT) b2 = Breakpoint(REF_CHR, 2699, orient=ORIENT.LEFT) bpp = BreakpointPair(b1, b2, opposing_strands=False, 
untemplated_seq='ATCGAC') - ref = {REF_CHR: MockObject(seq=self.reference_sequence)} + ref = {REF_CHR: MockObject(seq=intervals.reference_sequence)} ann = Annotation( bpp, transcript1=t1, transcript2=t2, event_type=SVTYPE.DUP, protocol=PROTOCOL.GENOME ) ft = FusionTranscript.build(ann, ref) - expt = 'T' * len(self.a) + 'A' * (2599 - 2100 + 1) + 'C' * len(self.b) + 'ATCGAC' - expt += 'T' * len(self.z) + 'A' * (1499 - 1300 + 1) + 'C' * len(self.w) - expt += 'A' * (1699 - 1600 + 1) + 'G' * len(self.s) + expt = 'T' * len(intervals.a) + 'A' * (2599 - 2100 + 1) + 'C' * len(intervals.b) + 'ATCGAC' + expt += 'T' * len(intervals.z) + 'A' * (1499 - 1300 + 1) + 'C' * len(intervals.w) + expt += 'A' * (1699 - 1600 + 1) + 'G' * len(intervals.s) - self.assertEqual(5, len(ft.exons)) - self.assertEqual(2, ft.exon_number(ft.exons[1])) - self.assertEqual(3, ft.exon_number(ft.exons[2])) - self.assertEqual(expt, ft.seq) + assert len(ft.exons) == 5 + assert ft.exon_number(ft.exons[1]) == 2 + assert ft.exon_number(ft.exons[2]) == 3 + assert ft.seq == expt - def test_build_two_transcript_duplication_neg(self): + def test_build_two_transcript_duplication_neg(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG # a:2000-2099, b:2600-2699, c:3000-3099, d:3300-3399 # TTTTTTTTT CCCCCCCCC GGGGGGGGG TTTTTTTTT - t1 = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.NEG) - t2 = PreTranscript(exons=[self.a, self.b, self.c, self.d], strand=STRAND.NEG) + t1 = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.NEG, + ) + t2 = PreTranscript( + exons=[intervals.a, intervals.b, intervals.c, intervals.d], strand=STRAND.NEG + ) b1 = Breakpoint(REF_CHR, 1200, orient=ORIENT.RIGHT) b2 = Breakpoint(REF_CHR, 2699, orient=ORIENT.LEFT) bpp = BreakpointPair(b1, b2, opposing_strands=False, untemplated_seq='ATCGAC') - ref = {REF_CHR: 
MockObject(seq=self.reference_sequence)} + ref = {REF_CHR: MockObject(seq=intervals.reference_sequence)} ann = Annotation( bpp, transcript1=t1, transcript2=t2, event_type=SVTYPE.DUP, protocol=PROTOCOL.GENOME ) ft = FusionTranscript.build(ann, ref) expt = ( - 'C' * len(self.s) + 'C' * len(intervals.s) + 'T' * (1699 - 1600 + 1) - + 'G' * len(self.w) + + 'G' * len(intervals.w) + 'T' * (1499 - 1300 + 1) ) - expt += 'A' * len(self.z) + 'GTCGAT' + 'G' * len(self.b) + 'T' * (2599 - 2100 + 1) - expt += 'A' * len(self.a) + expt += 'A' * len(intervals.z) + 'GTCGAT' + 'G' * len(intervals.b) + 'T' * (2599 - 2100 + 1) + expt += 'A' * len(intervals.a) - self.assertEqual(5, len(ft.exons)) - self.assertEqual(2, ft.exon_number(ft.exons[1])) - self.assertEqual(3, ft.exon_number(ft.exons[2])) - self.assertEqual(expt, ft.seq) + assert len(ft.exons) == 5 + assert ft.exon_number(ft.exons[1]) == 2 + assert ft.exon_number(ft.exons[2]) == 3 + assert ft.seq == expt - def test_build_two_transcript_deletion_pos(self): + def test_build_two_transcript_deletion_pos(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG # a:2000-2099, b:2600-2699, c:3000-3099, d:3300-3399 # TTTTTTTTT CCCCCCCCC GGGGGGGGG TTTTTTTTT - t1 = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) - t2 = PreTranscript(exons=[self.a, self.b, self.c, self.d], strand=STRAND.POS) + t1 = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) + t2 = PreTranscript( + exons=[intervals.a, intervals.b, intervals.c, intervals.d], strand=STRAND.POS + ) b1 = Breakpoint(REF_CHR, 1199, orient=ORIENT.LEFT) b2 = Breakpoint(REF_CHR, 2700, orient=ORIENT.RIGHT) bpp = BreakpointPair(b1, b2, opposing_strands=False, untemplated_seq='AACGTGT') - ref = {REF_CHR: MockObject(seq=self.reference_sequence)} + ref = {REF_CHR: MockObject(seq=intervals.reference_sequence)} ann = Annotation( bpp, 
transcript1=t1, transcript2=t2, event_type=SVTYPE.DEL, protocol=PROTOCOL.GENOME ) ft = FusionTranscript.build(ann, ref) expt = ( - 'C' * len(self.x) + 'C' * len(intervals.x) + 'A' * (499 - 200 + 1) - + 'G' * len(self.y) + + 'G' * len(intervals.y) + 'A' * (1199 - 600 + 1) + 'AACGTGT' ) expt += ( 'A' * (2999 - 2700 + 1) - + 'G' * len(self.c) + + 'G' * len(intervals.c) + 'A' * (3299 - 3100 + 1) - + 'T' * len(self.d) + + 'T' * len(intervals.d) ) - self.assertEqual(expt, ft.seq) - self.assertTrue(4, len(ft.exons)) + assert ft.seq == expt + assert 4, len(ft.exons) - def test_build_two_transcript_deletion_pos_transcriptome(self): + def test_build_two_transcript_deletion_pos_transcriptome(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG # a:2000-2099, b:2600-2699, c:3000-3099, d:3300-3399 # TTTTTTTTT CCCCCCCCC GGGGGGGGG TTTTTTTTT - t1 = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) - t2 = PreTranscript(exons=[self.a, self.b, self.c, self.d], strand=STRAND.POS) + t1 = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) + t2 = PreTranscript( + exons=[intervals.a, intervals.b, intervals.c, intervals.d], strand=STRAND.POS + ) b1 = Breakpoint(REF_CHR, 1199, orient=ORIENT.LEFT) b2 = Breakpoint(REF_CHR, 2700, orient=ORIENT.RIGHT) bpp = BreakpointPair(b1, b2, opposing_strands=False, untemplated_seq='AACGTGT') - ref = {REF_CHR: MockObject(seq=self.reference_sequence)} + ref = {REF_CHR: MockObject(seq=intervals.reference_sequence)} ann = Annotation( bpp, transcript1=t1, transcript2=t2, event_type=SVTYPE.DEL, protocol=PROTOCOL.TRANS ) ft = FusionTranscript.build(ann, ref) expt = ( - 'C' * len(self.x) + 'C' * len(intervals.x) + 'A' * (499 - 200 + 1) - + 'G' * len(self.y) + + 'G' * len(intervals.y) + 'A' * (1199 - 600 + 1) + 'AACGTGT' ) expt += ( 'A' * (2999 - 2700 + 1) - + 'G' * len(self.c) + + 'G' * 
len(intervals.c) + 'A' * (3299 - 3100 + 1) - + 'T' * len(self.d) + + 'T' * len(intervals.d) ) - self.assertEqual(expt, ft.seq) - self.assertTrue(5, len(ft.exons)) + assert ft.seq == expt + assert 5, len(ft.exons) - def test_build_two_transcript_deletion_neg(self): + def test_build_two_transcript_deletion_neg(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG # a:2000-2099, b:2600-2699, c:3000-3099, d:3300-3399 # TTTTTTTTT CCCCCCCCC GGGGGGGGG TTTTTTTTT - t1 = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.NEG) + t1 = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.NEG, + ) - t2 = PreTranscript(exons=[self.a, self.b, self.c, self.d], strand=STRAND.NEG) + t2 = PreTranscript( + exons=[intervals.a, intervals.b, intervals.c, intervals.d], strand=STRAND.NEG + ) print('t1 exons', t1.exons) print('t2 exons', t2.exons) b1 = Breakpoint(REF_CHR, 1200, orient=ORIENT.RIGHT) b2 = Breakpoint(REF_CHR, 2699, orient=ORIENT.LEFT) bpp = BreakpointPair(b1, b2, opposing_strands=False, untemplated_seq='AACGAGTGT') - ref = {REF_CHR: MockObject(seq=self.reference_sequence)} + ref = {REF_CHR: MockObject(seq=intervals.reference_sequence)} ann = Annotation( bpp, transcript1=t1, transcript2=t2, event_type=SVTYPE.DEL, protocol=PROTOCOL.GENOME ) @@ -719,456 +835,521 @@ def test_build_two_transcript_deletion_neg(self): ft = FusionTranscript.build(ann, ref) expt = ( - 'C' * len(self.s) + 'C' * len(intervals.s) + 'T' * (1699 - 1600 + 1) - + 'G' * len(self.w) + + 'G' * len(intervals.w) + 'T' * (1499 - 1300 + 1) ) - expt += 'A' * len(self.z) + 'ACACTCGTT' + 'G' * len(self.b) + 'T' * (2599 - 2100 + 1) - expt += 'A' * len(self.a) + expt += ( + 'A' * len(intervals.z) + 'ACACTCGTT' + 'G' * len(intervals.b) + 'T' * (2599 - 2100 + 1) + ) + expt += 'A' * len(intervals.a) - self.assertEqual(expt, ft.seq) - self.assertTrue(5, len(ft.exons)) - 
self.assertEqual(3, ft.exon_number(ft.exons[2])) - self.assertEqual(3, ft.exon_number(ft.exons[3])) + assert ft.seq == expt + assert 5, len(ft.exons) + assert ft.exon_number(ft.exons[2]) == 3 + assert ft.exon_number(ft.exons[3]) == 3 - def test_build_two_transcript_translocation(self): + def test_build_two_transcript_translocation(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG # 1:600-699, 2:800-899, 3:1100-1199, 4:1400-1499, 5:1700-1799 6:2100-2199 # AAAAAAA GGGGGGG, TTTTTTTTT, AAAAAAAAA, GGGGGGGGG AAAAAAAAA - t1 = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.POS) + t1 = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.POS, + ) t2 = PreTranscript( - exons=[self.b1, self.b2, self.b3, self.b4, self.b5, self.b6], strand=STRAND.POS + exons=[ + intervals.b1, + intervals.b2, + intervals.b3, + intervals.b4, + intervals.b5, + intervals.b6, + ], + strand=STRAND.POS, ) b1 = Breakpoint(REF_CHR, 1199, orient=ORIENT.LEFT) b2 = Breakpoint('ref2', 1200, orient=ORIENT.RIGHT) bpp = BreakpointPair(b1, b2, opposing_strands=False, untemplated_seq='GCAACATAT') ref = { - REF_CHR: MockObject(seq=self.reference_sequence), - 'ref2': MockObject(seq=self.alternate_sequence), + REF_CHR: MockObject(seq=intervals.reference_sequence), + 'ref2': MockObject(seq=intervals.alternate_sequence), } ann = Annotation( bpp, transcript1=t1, transcript2=t2, event_type=SVTYPE.TRANS, protocol=PROTOCOL.GENOME ) - self.assertEqual(b1, ann.break1) + assert ann.break1 == b1 ft = FusionTranscript.build(ann, ref) expt = ( - 'C' * len(self.x) + 'A' * (499 - 200 + 1) + 'G' * len(self.y) + 'A' * (1199 - 600 + 1) + 'C' * len(intervals.x) + + 'A' * (499 - 200 + 1) + + 'G' * len(intervals.y) + + 'A' * (1199 - 600 + 1) ) - expt += 'GCAACATAT' + 'C' * (1399 - 1200 + 1) + 'A' * len(self.b4) + 'C' * (1699 - 1500 + 1) - expt += 'G' * len(self.b5) + 'C' * 
(2099 - 1800 + 1) + 'A' * len(self.b6) + expt += ( + 'GCAACATAT' + + 'C' * (1399 - 1200 + 1) + + 'A' * len(intervals.b4) + + 'C' * (1699 - 1500 + 1) + ) + expt += 'G' * len(intervals.b5) + 'C' * (2099 - 1800 + 1) + 'A' * len(intervals.b6) - self.assertEqual(expt, ft.seq) - self.assertTrue(5, len(ft.exons)) - self.assertTrue(2, ft.exon_number(ft.exons[1])) - self.assertTrue(4, ft.exon_number(ft.exons[2])) + assert ft.seq == expt + assert 5, len(ft.exons) + assert 2, ft.exon_number(ft.exons[1]) + assert 4, ft.exon_number(ft.exons[2]) - def test_build_two_transcript_translocation_neg(self): + def test_build_two_transcript_translocation_neg(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG # 1:600-699, 2:800-899, 3:1100-1199, 4:1400-1499, 5:1700-1799 6:2100-2199 # AAAAAAA GGGGGGG, TTTTTTTTT, AAAAAAAAA, GGGGGGGGG AAAAAAAAA - t1 = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.NEG) + t1 = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.NEG, + ) t2 = PreTranscript( - exons=[self.b1, self.b2, self.b3, self.b4, self.b5, self.b6], strand=STRAND.NEG + exons=[ + intervals.b1, + intervals.b2, + intervals.b3, + intervals.b4, + intervals.b5, + intervals.b6, + ], + strand=STRAND.NEG, ) b1 = Breakpoint(REF_CHR, 1200, orient=ORIENT.RIGHT) b2 = Breakpoint(ALT_REF_CHR, 1199, orient=ORIENT.LEFT) bpp = BreakpointPair(b1, b2, opposing_strands=False, untemplated_seq='TCTACATAT') ref = { - REF_CHR: MockObject(seq=self.reference_sequence), - ALT_REF_CHR: MockObject(seq=self.alternate_sequence), + REF_CHR: MockObject(seq=intervals.reference_sequence), + ALT_REF_CHR: MockObject(seq=intervals.alternate_sequence), } ann = Annotation( bpp, transcript1=t1, transcript2=t2, event_type=SVTYPE.TRANS, protocol=PROTOCOL.GENOME ) - self.assertEqual(b1, ann.break1) - self.assertEqual(b2, ann.break2) + assert ann.break1 == b1 + assert 
ann.break2 == b2 ft = FusionTranscript.build(ann, ref) expt = ( - 'C' * len(self.s) + 'C' * len(intervals.s) + 'T' * (1699 - 1600 + 1) - + 'G' * len(self.w) + + 'G' * len(intervals.w) + 'T' * (1499 - 1300 + 1) ) - expt += 'A' * len(self.z) + 'ATATGTAGA' + 'A' * len(self.b3) + 'G' * (1099 - 900 + 1) - expt += 'C' * len(self.b2) + 'G' * (799 - 700 + 1) + 'T' * len(self.b1) + expt += ( + 'A' * len(intervals.z) + 'ATATGTAGA' + 'A' * len(intervals.b3) + 'G' * (1099 - 900 + 1) + ) + expt += 'C' * len(intervals.b2) + 'G' * (799 - 700 + 1) + 'T' * len(intervals.b1) - self.assertEqual(expt, ft.seq) - self.assertEqual(6, len(ft.exons)) - self.assertTrue(3, ft.exon_number(ft.exons[2])) - self.assertTrue(3, ft.exon_number(ft.exons[3])) + assert ft.seq == expt + assert len(ft.exons) == 6 + assert 3, ft.exon_number(ft.exons[2]) + assert 3, ft.exon_number(ft.exons[3]) - def test_build_two_transcript_inverted_translocation(self): + def test_build_two_transcript_inverted_translocation(self, intervals): # x:100-199, y:500-599, z:1200-1299, w:1500-1599, s:1700-1799 # CCCCCCC GGGGGGG TTTTTTTTT CCCCCCCCC GGGGGGGGG # 1:600-699, 2:800-899, 3:1100-1199, 4:1400-1499, 5:1700-1799 6:2100-2199 # AAAAAAA GGGGGGG, TTTTTTTTT, AAAAAAAAA, GGGGGGGGG AAAAAAAAA - t1 = PreTranscript(exons=[self.x, self.y, self.z, self.w, self.s], strand=STRAND.NEG) + t1 = PreTranscript( + exons=[intervals.x, intervals.y, intervals.z, intervals.w, intervals.s], + strand=STRAND.NEG, + ) t2 = PreTranscript( - exons=[self.b1, self.b2, self.b3, self.b4, self.b5, self.b6], strand=STRAND.POS + exons=[ + intervals.b1, + intervals.b2, + intervals.b3, + intervals.b4, + intervals.b5, + intervals.b6, + ], + strand=STRAND.POS, ) b1 = Breakpoint(REF_CHR, 1200, orient=ORIENT.RIGHT) b2 = Breakpoint(ALT_REF_CHR, 1200, orient=ORIENT.RIGHT) bpp = BreakpointPair(b1, b2, opposing_strands=True, untemplated_seq='GATACATAT') ref = { - REF_CHR: MockObject(seq=self.reference_sequence), - ALT_REF_CHR: MockObject(seq=self.alternate_sequence), + 
REF_CHR: MockObject(seq=intervals.reference_sequence), + ALT_REF_CHR: MockObject(seq=intervals.alternate_sequence), } ann = Annotation( bpp, transcript1=t1, transcript2=t2, event_type=SVTYPE.TRANS, protocol=PROTOCOL.GENOME ) - self.assertEqual(b1, ann.break1) - self.assertEqual(b2, ann.break2) + assert ann.break1 == b1 + assert ann.break2 == b2 ft = FusionTranscript.build(ann, ref) expt = ( - 'C' * len(self.s) + 'C' * len(intervals.s) + 'T' * (1699 - 1600 + 1) - + 'G' * len(self.w) + + 'G' * len(intervals.w) + 'T' * (1499 - 1300 + 1) ) - expt += 'A' * len(self.z) + 'ATATGTATC' + 'C' * (1399 - 1200 + 1) + 'A' * len(self.b4) + expt += ( + 'A' * len(intervals.z) + 'ATATGTATC' + 'C' * (1399 - 1200 + 1) + 'A' * len(intervals.b4) + ) expt += ( 'C' * (1699 - 1500 + 1) - + 'G' * len(self.b5) + + 'G' * len(intervals.b5) + 'C' * (2099 - 1800 + 1) - + 'A' * len(self.b6) + + 'A' * len(intervals.b6) ) - self.assertEqual(expt, ft.seq) - self.assertEqual(6, len(ft.exons)) - self.assertTrue(3, ft.exon_number(ft.exons[2])) - self.assertTrue(4, ft.exon_number(ft.exons[3])) + assert ft.seq == expt + assert len(ft.exons) == 6 + assert 3, ft.exon_number(ft.exons[2]) + assert 4, ft.exon_number(ft.exons[3]) -class TestSequenceFetching(unittest.TestCase): - def setUp(self): - self.gene = Gene(REF_CHR, 1, 900, strand=STRAND.POS) +@pytest.fixture +def mock_ann_obj(): + n = argparse.Namespace() + n.gene = Gene(REF_CHR, 1, 900, strand=STRAND.POS) - self.pre_transcript = PreTranscript( - exons=[(101, 200), (301, 400), (501, 600), (701, 800)], gene=self.gene - ) - self.gene.transcripts.append(self.pre_transcript) + n.pre_transcript = PreTranscript( + exons=[(101, 200), (301, 400), (501, 600), (701, 800)], gene=n.gene + ) + n.gene.transcripts.append(n.pre_transcript) - self.transcript = Transcript( - self.pre_transcript, self.pre_transcript.generate_splicing_patterns()[0] - ) - self.pre_transcript.transcripts.append(self.transcript) + n.transcript = Transcript(n.pre_transcript, 
n.pre_transcript.generate_splicing_patterns()[0]) + n.pre_transcript.transcripts.append(n.transcript) - self.translation = Translation(51, 350, self.transcript) - self.transcript.translations.append(self.translation) + n.translation = Translation(51, 350, n.transcript) + n.transcript.translations.append(n.translation) - self.spliced_seq = ( - 'GGTGAATTTCTAGTTTGCCTTTTCAGCTAGGGATTAGCTTTTTAGGGGTCCCAATG' - 'CCTAGGGAGATTTCTAGGTCCTCTGTTCCTTGCTGACCTCCAATAATCAGAAAATGCTGTGAAGGAAAAAC' - 'AAAATGAAATTGCATTGTTTCTACCGGCCCTTTATCAAGCCCTGGCCACCATGATAGTCATGAATTCCAAT' - 'TGTGTTGAAATCACTTCAATGTGTTTCTCTTCTTTCTGGGAGCTTACACACTCAAGTTCTGGATGCTTTGA' - 'TTGCTATCAGAAGCCGTTAAATAGCTACTTATAAATAGCATTGAGTTATCAGTACTTTCATGTCTTGATAC' - 'ATTTCTTCTTGAAAATGTTCATGCTTGCTGATTTGTCTGTTTGTTGAGAGGAGAATGTTC' - ) + n.spliced_seq = ( + 'GGTGAATTTCTAGTTTGCCTTTTCAGCTAGGGATTAGCTTTTTAGGGGTCCCAATG' + 'CCTAGGGAGATTTCTAGGTCCTCTGTTCCTTGCTGACCTCCAATAATCAGAAAATGCTGTGAAGGAAAAAC' + 'AAAATGAAATTGCATTGTTTCTACCGGCCCTTTATCAAGCCCTGGCCACCATGATAGTCATGAATTCCAAT' + 'TGTGTTGAAATCACTTCAATGTGTTTCTCTTCTTTCTGGGAGCTTACACACTCAAGTTCTGGATGCTTTGA' + 'TTGCTATCAGAAGCCGTTAAATAGCTACTTATAAATAGCATTGAGTTATCAGTACTTTCATGTCTTGATAC' + 'ATTTCTTCTTGAAAATGTTCATGCTTGCTGATTTGTCTGTTTGTTGAGAGGAGAATGTTC' + ) - self.domain = Domain( - name=REF_CHR, regions=[(11, 20), (51, 60)], translation=self.translation - ) - self.translation.domains.append(self.domain) + n.domain = Domain(name=REF_CHR, regions=[(11, 20), (51, 60)], translation=n.translation) + n.translation.domains.append(n.domain) + return n - def test_fetch_gene_seq_from_ref(self): + +class TestSequenceFetching: + def test_fetch_gene_seq_from_ref(self, mock_ann_obj): expt = str(REFERENCE_GENOME[REF_CHR][0:900].seq).upper() - self.assertEqual(expt, self.gene.get_seq(REFERENCE_GENOME)) + assert mock_ann_obj.gene.get_seq(REFERENCE_GENOME) == expt # gene seq should be the same if gene in on reverse strand b/c gene seq always given on pos - self.gene.strand = STRAND.NEG - self.assertEqual(expt, 
self.gene.get_seq(REFERENCE_GENOME)) + mock_ann_obj.gene.strand = STRAND.NEG + assert mock_ann_obj.gene.get_seq(REFERENCE_GENOME) == expt - def test_fetch_gene_seq_from_stored(self): + def test_fetch_gene_seq_from_stored(self, mock_ann_obj): expt = 'AAA' - self.gene.seq = expt - self.assertEqual(expt, self.gene.get_seq(REFERENCE_GENOME)) + mock_ann_obj.gene.seq = expt + assert mock_ann_obj.gene.get_seq(REFERENCE_GENOME) == expt - def test_fetch_gene_seq_force_uncached(self): + def test_fetch_gene_seq_force_uncached(self, mock_ann_obj): expt = str(REFERENCE_GENOME[REF_CHR][0:900].seq).upper() - self.gene.seq = 'AAA' - self.assertEqual(expt, self.gene.get_seq(REFERENCE_GENOME, ignore_cache=True)) + mock_ann_obj.gene.seq = 'AAA' + assert mock_ann_obj.gene.get_seq(REFERENCE_GENOME, ignore_cache=True) == expt - def test_fetch_us_transcript_seq_from_ref(self): + def test_fetch_us_transcript_seq_from_ref(self, mock_ann_obj): expt = str(REFERENCE_GENOME[REF_CHR][100:800].seq).upper() - self.assertEqual(expt, self.pre_transcript.get_seq(REFERENCE_GENOME)) + assert mock_ann_obj.pre_transcript.get_seq(REFERENCE_GENOME) == expt - def test_fetch_us_transcript_seq_from_ref_revcomp(self): - self.gene.strand = STRAND.NEG + def test_fetch_us_transcript_seq_from_ref_revcomp(self, mock_ann_obj): + mock_ann_obj.gene.strand = STRAND.NEG expt = reverse_complement(str(REFERENCE_GENOME[REF_CHR][100:800].seq).upper()) - self.assertEqual(expt, self.pre_transcript.get_seq(REFERENCE_GENOME)) + assert mock_ann_obj.pre_transcript.get_seq(REFERENCE_GENOME) == expt - def test_fetch_us_transcript_seq_from_stored(self): + def test_fetch_us_transcript_seq_from_stored(self, mock_ann_obj): expt = 'AAA' - self.pre_transcript.seq = expt - self.assertEqual(expt, self.pre_transcript.get_seq(REFERENCE_GENOME)) + mock_ann_obj.pre_transcript.seq = expt + assert mock_ann_obj.pre_transcript.get_seq(REFERENCE_GENOME) == expt - def test_fetch_us_transcript_seq_from_parent_gene(self): - self.gene.seq = 'A' * 
len(self.gene) - self.assertEqual('A' * len(self.pre_transcript), self.pre_transcript.get_seq()) + def test_fetch_us_transcript_seq_from_parent_gene(self, mock_ann_obj): + mock_ann_obj.gene.seq = 'A' * len(mock_ann_obj.gene) + assert mock_ann_obj.pre_transcript.get_seq() == 'A' * len(mock_ann_obj.pre_transcript) - def test_fetch_us_transcript_seq_from_parent_gene_revcomp(self): - self.gene.seq = 'A' * len(self.gene) - self.gene.strand = STRAND.NEG - self.assertEqual('T' * len(self.pre_transcript), self.pre_transcript.get_seq()) + def test_fetch_us_transcript_seq_from_parent_gene_revcomp(self, mock_ann_obj): + mock_ann_obj.gene.seq = 'A' * len(mock_ann_obj.gene) + mock_ann_obj.gene.strand = STRAND.NEG + assert mock_ann_obj.pre_transcript.get_seq() == 'T' * len(mock_ann_obj.pre_transcript) - def test_fetch_us_transcript_seq_force_uncached(self): + def test_fetch_us_transcript_seq_force_uncached(self, mock_ann_obj): expt = str(REFERENCE_GENOME[REF_CHR][100:800].seq).upper() - self.pre_transcript.seq = 'AAA' - self.assertEqual(expt, self.pre_transcript.get_seq(REFERENCE_GENOME, ignore_cache=True)) + mock_ann_obj.pre_transcript.seq = 'AAA' + assert mock_ann_obj.pre_transcript.get_seq(REFERENCE_GENOME, ignore_cache=True) == expt - def test_fetch_transcript_seq_from_ref(self): - self.assertEqual(self.spliced_seq, self.transcript.get_seq(REFERENCE_GENOME)) + def test_fetch_transcript_seq_from_ref(self, mock_ann_obj): + assert mock_ann_obj.transcript.get_seq(REFERENCE_GENOME) == mock_ann_obj.spliced_seq - def test_fetch_transcript_seq_from_ref_revcomp(self): - self.gene.strand = STRAND.NEG - self.assertEqual( - reverse_complement(self.spliced_seq), self.transcript.get_seq(REFERENCE_GENOME) + def test_fetch_transcript_seq_from_ref_revcomp(self, mock_ann_obj): + mock_ann_obj.gene.strand = STRAND.NEG + assert mock_ann_obj.transcript.get_seq(REFERENCE_GENOME) == reverse_complement( + mock_ann_obj.spliced_seq ) - def test_fetch_transcript_seq_from_stored(self): + def 
test_fetch_transcript_seq_from_stored(self, mock_ann_obj): expt = 'AAA' - self.transcript.seq = expt - self.assertEqual(expt, self.transcript.get_seq(REFERENCE_GENOME)) + mock_ann_obj.transcript.seq = expt + assert mock_ann_obj.transcript.get_seq(REFERENCE_GENOME) == expt - def test_fetch_transcript_seq_from_parent_ust(self): - self.pre_transcript.seq = 'A' * len(self.pre_transcript) - self.assertEqual('A' * len(self.transcript), self.transcript.get_seq()) + def test_fetch_transcript_seq_from_parent_ust(self, mock_ann_obj): + mock_ann_obj.pre_transcript.seq = 'A' * len(mock_ann_obj.pre_transcript) + assert mock_ann_obj.transcript.get_seq() == 'A' * len(mock_ann_obj.transcript) - def test_fetch_transcript_seq_from_parent_gene(self): - self.gene.seq = 'A' * len(self.gene) - self.assertEqual('A' * len(self.transcript), self.transcript.get_seq()) + def test_fetch_transcript_seq_from_parent_gene(self, mock_ann_obj): + mock_ann_obj.gene.seq = 'A' * len(mock_ann_obj.gene) + assert mock_ann_obj.transcript.get_seq() == 'A' * len(mock_ann_obj.transcript) - def test_fetch_transcript_seq_force_uncached(self): - self.transcript.seq = 'AAA' - self.assertEqual( - self.spliced_seq, self.transcript.get_seq(REFERENCE_GENOME, ignore_cache=True) + def test_fetch_transcript_seq_force_uncached(self, mock_ann_obj): + mock_ann_obj.transcript.seq = 'AAA' + assert ( + mock_ann_obj.transcript.get_seq(REFERENCE_GENOME, ignore_cache=True) + == mock_ann_obj.spliced_seq ) - def test_fetch_translation_aa_seq_from_ref(self): - cds = self.spliced_seq[self.translation.start - 1 : self.translation.end] - self.assertEqual(translate(cds), self.translation.get_aa_seq(REFERENCE_GENOME)) + def test_fetch_translation_aa_seq_from_ref(self, mock_ann_obj): + cds = mock_ann_obj.spliced_seq[ + mock_ann_obj.translation.start - 1 : mock_ann_obj.translation.end + ] + assert mock_ann_obj.translation.get_aa_seq(REFERENCE_GENOME) == translate(cds) - def test_fetch_translation_cds_seq_from_ref(self): - cds = 
self.spliced_seq[self.translation.start - 1 : self.translation.end] - self.assertEqual(cds, self.translation.get_seq(REFERENCE_GENOME)) + def test_fetch_translation_cds_seq_from_ref(self, mock_ann_obj): + cds = mock_ann_obj.spliced_seq[ + mock_ann_obj.translation.start - 1 : mock_ann_obj.translation.end + ] + assert mock_ann_obj.translation.get_seq(REFERENCE_GENOME) == cds - def test_fetch_translation_cds_seq_from_ref_revcomp(self): - self.gene.strand = STRAND.NEG - cdna = reverse_complement(self.spliced_seq) - cds = cdna[self.translation.start - 1 : self.translation.end] - self.assertEqual(cds, self.translation.get_seq(REFERENCE_GENOME)) + def test_fetch_translation_cds_seq_from_ref_revcomp(self, mock_ann_obj): + mock_ann_obj.gene.strand = STRAND.NEG + cdna = reverse_complement(mock_ann_obj.spliced_seq) + cds = cdna[mock_ann_obj.translation.start - 1 : mock_ann_obj.translation.end] + assert mock_ann_obj.translation.get_seq(REFERENCE_GENOME) == cds - def test_fetch_translation_cds_seq_from_stored(self): + def test_fetch_translation_cds_seq_from_stored(self, mock_ann_obj): expt = 'AAA' - self.translation.seq = expt - self.assertEqual(expt, self.translation.get_seq(REFERENCE_GENOME)) + mock_ann_obj.translation.seq = expt + assert mock_ann_obj.translation.get_seq(REFERENCE_GENOME) == expt - def test_fetch_translation_cds_seq_from_parent_transcript(self): - self.transcript.seq = 'A' * len(self.transcript) - self.assertEqual('A' * len(self.translation), self.translation.get_seq(REFERENCE_GENOME)) + def test_fetch_translation_cds_seq_from_parent_transcript(self, mock_ann_obj): + mock_ann_obj.transcript.seq = 'A' * len(mock_ann_obj.transcript) + assert mock_ann_obj.translation.get_seq(REFERENCE_GENOME) == 'A' * len( + mock_ann_obj.translation + ) - def test_fetch_translation_cds_seq_from_parent_ust(self): - self.pre_transcript.seq = 'A' * len(self.pre_transcript) - self.assertEqual('A' * len(self.translation), self.translation.get_seq(REFERENCE_GENOME)) + def 
test_fetch_translation_cds_seq_from_parent_ust(self, mock_ann_obj): + mock_ann_obj.pre_transcript.seq = 'A' * len(mock_ann_obj.pre_transcript) + assert mock_ann_obj.translation.get_seq(REFERENCE_GENOME) == 'A' * len( + mock_ann_obj.translation + ) - def test_fetch_translation_cds_seq_from_parent_gene(self): - self.gene.seq = 'A' * len(self.gene) - self.assertEqual('A' * len(self.translation), self.translation.get_seq(REFERENCE_GENOME)) + def test_fetch_translation_cds_seq_from_parent_gene(self, mock_ann_obj): + mock_ann_obj.gene.seq = 'A' * len(mock_ann_obj.gene) + assert mock_ann_obj.translation.get_seq(REFERENCE_GENOME) == 'A' * len( + mock_ann_obj.translation + ) - def test_fetch_translation_cds_seq_force_uncached(self): - self.translation.seq = 'AAA' - cds = self.spliced_seq[self.translation.start - 1 : self.translation.end] - self.assertEqual(cds, self.translation.get_seq(REFERENCE_GENOME, ignore_cache=True)) + def test_fetch_translation_cds_seq_force_uncached(self, mock_ann_obj): + mock_ann_obj.translation.seq = 'AAA' + cds = mock_ann_obj.spliced_seq[ + mock_ann_obj.translation.start - 1 : mock_ann_obj.translation.end + ] + assert mock_ann_obj.translation.get_seq(REFERENCE_GENOME, ignore_cache=True) == cds - def test_fetch_domain_seq_from_ref(self): + def test_fetch_domain_seq_from_ref(self, mock_ann_obj): seqs = ['VPC*PPIIRK', 'C*NHFNVFLF'] - self.assertEqual(seqs, self.domain.get_seqs(REFERENCE_GENOME)) + assert mock_ann_obj.domain.get_seqs(REFERENCE_GENOME) == seqs -class TestStrandInheritance(unittest.TestCase): - def setUp(self): - self.gene = Gene('1', 1, 500, strand=STRAND.POS) - pre_transcript = PreTranscript(gene=self.gene, exons=[(1, 100), (200, 300), (400, 500)]) - self.gene.unspliced_transcripts.append(pre_transcript) - for spl in pre_transcript.generate_splicing_patterns(): - t = Transcript(pre_transcript, spl) - pre_transcript.spliced_transcripts.append(t) - tl = Translation(51, 250, t) - t.translations.append(tl) +@pytest.fixture +def 
unstranded_gene(): + gene = Gene('1', 1, 500, strand=STRAND.POS) + pre_transcript = PreTranscript(gene=gene, exons=[(1, 100), (200, 300), (400, 500)]) + gene.unspliced_transcripts.append(pre_transcript) + for spl in pre_transcript.generate_splicing_patterns(): + t = Transcript(pre_transcript, spl) + pre_transcript.spliced_transcripts.append(t) + tl = Translation(51, 250, t) + t.translations.append(tl) + return gene - def test_strand_gene(self): - self.assertEqual(STRAND.POS, self.gene.get_strand()) - def test_strand_us_transcript(self): - self.assertEqual(STRAND.POS, self.gene.unspliced_transcripts[0].get_strand()) +class TestStrandInheritance: + def test_strand_gene(self, unstranded_gene): + assert unstranded_gene.get_strand() == STRAND.POS - def test_strand_spl_transcript(self): - self.assertEqual(STRAND.POS, self.gene.spliced_transcripts[0].get_strand()) + def test_strand_us_transcript(self, unstranded_gene): + assert unstranded_gene.unspliced_transcripts[0].get_strand() == STRAND.POS - def test_strand_translation(self): - self.assertEqual(STRAND.POS, self.gene.spliced_transcripts[0].translations[0].get_strand()) + def test_strand_spl_transcript(self, unstranded_gene): + assert unstranded_gene.spliced_transcripts[0].get_strand() == STRAND.POS + def test_strand_translation(self, unstranded_gene): + assert unstranded_gene.spliced_transcripts[0].translations[0].get_strand() == STRAND.POS -class TestCoordinateCoversion(unittest.TestCase): - def setUp(self): - self.gene = Gene('1', 15, 700, strand=STRAND.POS) - self.pre_transcript = PreTranscript( - gene=self.gene, exons=[(101, 200), (301, 400), (501, 600)] - ) - self.gene.unspliced_transcripts.append(self.pre_transcript) - assert 1 == len(self.pre_transcript.generate_splicing_patterns()) +@pytest.fixture +def coord_conv_setup(): + n = argparse.Namespace() + n.gene = Gene('1', 15, 700, strand=STRAND.POS) + + n.pre_transcript = PreTranscript(gene=n.gene, exons=[(101, 200), (301, 400), (501, 600)]) + 
n.gene.unspliced_transcripts.append(n.pre_transcript) + assert 1 == len(n.pre_transcript.generate_splicing_patterns()) + + spl = n.pre_transcript.generate_splicing_patterns()[0] + n.transcript = Transcript(n.pre_transcript, spl) + n.pre_transcript.spliced_transcripts.append(n.transcript) - spl = self.pre_transcript.generate_splicing_patterns()[0] - self.transcript = Transcript(self.pre_transcript, spl) - self.pre_transcript.spliced_transcripts.append(self.transcript) + n.translation = Translation(51, 251, n.transcript) + n.transcript.translations.append(n.translation) - self.translation = Translation(51, 251, self.transcript) - self.transcript.translations.append(self.translation) + n.rev_gene = Gene('1', 15, 700, strand=STRAND.NEG) + n.rev_ust = PreTranscript(gene=n.rev_gene, exons=[(101, 200), (301, 400), (501, 600)]) + n.gene.unspliced_transcripts.append(n.rev_ust) + assert 1 == len(n.rev_ust.generate_splicing_patterns()) - self.rev_gene = Gene('1', 15, 700, strand=STRAND.NEG) - self.rev_ust = PreTranscript(gene=self.rev_gene, exons=[(101, 200), (301, 400), (501, 600)]) - self.gene.unspliced_transcripts.append(self.rev_ust) - assert 1 == len(self.rev_ust.generate_splicing_patterns()) + spl = n.rev_ust.generate_splicing_patterns()[0] + n.rev_transcript = Transcript(n.rev_ust, spl) + n.rev_ust.spliced_transcripts.append(n.rev_transcript) - spl = self.rev_ust.generate_splicing_patterns()[0] - self.rev_transcript = Transcript(self.rev_ust, spl) - self.rev_ust.spliced_transcripts.append(self.rev_transcript) + n.rev_translation = Translation(51, 251, n.rev_transcript) + n.rev_transcript.translations.append(n.rev_translation) + return n - self.rev_translation = Translation(51, 251, self.rev_transcript) - self.rev_transcript.translations.append(self.rev_translation) - def test_cdna_to_genomic(self): - self.assertEqual(150, self.transcript.convert_cdna_to_genomic(50)) - self.assertEqual(550, self.transcript.convert_cdna_to_genomic(250)) +class TestCoordinateCoversion: + 
def test_cdna_to_genomic(self, coord_conv_setup): + assert coord_conv_setup.transcript.convert_cdna_to_genomic(50) == 150 + assert coord_conv_setup.transcript.convert_cdna_to_genomic(250) == 550 - def test_cdna_to_genomic_before(self): - self.assertEqual(100, self.transcript.convert_cdna_to_genomic(-1)) - self.assertEqual(51, self.transcript.convert_cdna_to_genomic(-50)) + def test_cdna_to_genomic_before(self, coord_conv_setup): + assert coord_conv_setup.transcript.convert_cdna_to_genomic(-1) == 100 + assert coord_conv_setup.transcript.convert_cdna_to_genomic(-50) == 51 - def test_cdna_to_genomic_after(self): - self.assertEqual(650, self.transcript.convert_cdna_to_genomic(350)) + def test_cdna_to_genomic_after(self, coord_conv_setup): + assert coord_conv_setup.transcript.convert_cdna_to_genomic(350) == 650 - def test_cdna_to_genomic_revcomp(self): - self.assertEqual(551, self.rev_transcript.convert_cdna_to_genomic(50)) - self.assertEqual(151, self.rev_transcript.convert_cdna_to_genomic(250)) + def test_cdna_to_genomic_revcomp(self, coord_conv_setup): + assert coord_conv_setup.rev_transcript.convert_cdna_to_genomic(50) == 551 + assert coord_conv_setup.rev_transcript.convert_cdna_to_genomic(250) == 151 - def test_genomic_to_cdna(self): - self.assertEqual(50, self.transcript.convert_genomic_to_cdna(150)) - self.assertEqual(249, self.transcript.convert_genomic_to_cdna(549)) + def test_genomic_to_cdna(self, coord_conv_setup): + assert coord_conv_setup.transcript.convert_genomic_to_cdna(150) == 50 + assert coord_conv_setup.transcript.convert_genomic_to_cdna(549) == 249 - def test_genomic_to_cdna_before(self): - self.assertEqual((1, -1), self.transcript.convert_genomic_to_nearest_cdna(100)) + def test_genomic_to_cdna_before(self, coord_conv_setup): + assert coord_conv_setup.transcript.convert_genomic_to_nearest_cdna(100) == (1, -1) - def test_genomic_to_cdna_after(self): - self.assertEqual((300, 1), self.transcript.convert_genomic_to_nearest_cdna(601)) + def 
test_genomic_to_cdna_after(self, coord_conv_setup): + assert coord_conv_setup.transcript.convert_genomic_to_nearest_cdna(601) == (300, 1) - def test_genomic_to_cdna_revcomp(self): - self.assertEqual(50, self.rev_transcript.convert_genomic_to_cdna(551)) - self.assertEqual(250, self.rev_transcript.convert_genomic_to_cdna(151)) + def test_genomic_to_cdna_revcomp(self, coord_conv_setup): + assert coord_conv_setup.rev_transcript.convert_genomic_to_cdna(551) == 50 + assert coord_conv_setup.rev_transcript.convert_genomic_to_cdna(151) == 250 - def test_aa_to_cdna(self): - self.assertEqual(Interval(51, 53), self.translation.convert_aa_to_cdna(1)) - self.assertEqual(Interval(249, 251), self.translation.convert_aa_to_cdna(67)) + def test_aa_to_cdna(self, coord_conv_setup): + assert coord_conv_setup.translation.convert_aa_to_cdna(1) == Interval(51, 53) + assert coord_conv_setup.translation.convert_aa_to_cdna(67) == Interval(249, 251) - def test_cdna_to_aa(self): - self.assertEqual(1, self.translation.convert_cdna_to_aa(51)) - self.assertEqual(67, self.translation.convert_cdna_to_aa(251)) - with self.assertRaises(IndexError): - self.translation.convert_cdna_to_aa(50) - with self.assertRaises(IndexError): - self.translation.convert_cdna_to_aa(252) + def test_cdna_to_aa(self, coord_conv_setup): + assert coord_conv_setup.translation.convert_cdna_to_aa(51) == 1 + assert coord_conv_setup.translation.convert_cdna_to_aa(251) == 67 + with pytest.raises(IndexError): + coord_conv_setup.translation.convert_cdna_to_aa(50) + with pytest.raises(IndexError): + coord_conv_setup.translation.convert_cdna_to_aa(252) - def test_genomic_to_cds(self): - self.assertEqual(1, self.translation.convert_genomic_to_cds(151)) - self.assertEqual(201, self.translation.convert_genomic_to_cds(551)) + def test_genomic_to_cds(self, coord_conv_setup): + assert coord_conv_setup.translation.convert_genomic_to_cds(151) == 1 + assert coord_conv_setup.translation.convert_genomic_to_cds(551) == 201 - def 
test_genomic_to_cds_3prime_utr(self): - self.assertEqual(-1, self.translation.convert_genomic_to_cds(150)) + def test_genomic_to_cds_3prime_utr(self, coord_conv_setup): + assert coord_conv_setup.translation.convert_genomic_to_cds(150) == -1 - def test_genomic_to_cds_5prime_utr(self): - self.assertEqual(202, self.translation.convert_genomic_to_cds(552)) + def test_genomic_to_cds_5prime_utr(self, coord_conv_setup): + assert coord_conv_setup.translation.convert_genomic_to_cds(552) == 202 - def test_genomic_to_cds_notation(self): - self.assertEqual('1', self.translation.convert_genomic_to_cds_notation(151)) - self.assertEqual('201', self.translation.convert_genomic_to_cds_notation(551)) + def test_genomic_to_cds_notation(self, coord_conv_setup): + assert coord_conv_setup.translation.convert_genomic_to_cds_notation(151) == '1' + assert coord_conv_setup.translation.convert_genomic_to_cds_notation(551) == '201' - def test_genomic_to_cds_notation_3prime_utr(self): - self.assertEqual('-1', self.translation.convert_genomic_to_cds_notation(150)) + def test_genomic_to_cds_notation_3prime_utr(self, coord_conv_setup): + assert coord_conv_setup.translation.convert_genomic_to_cds_notation(150) == '-1' - def test_genomic_to_cds_notation_5prime_utr(self): - self.assertEqual('*1', self.translation.convert_genomic_to_cds_notation(552)) + def test_genomic_to_cds_notation_5prime_utr(self, coord_conv_setup): + assert coord_conv_setup.translation.convert_genomic_to_cds_notation(552) == '*1' - def test_genomic_to_cds_notation_intronic_pos(self): - self.assertEqual('50+2', self.translation.convert_genomic_to_cds_notation(202)) + def test_genomic_to_cds_notation_intronic_pos(self, coord_conv_setup): + assert coord_conv_setup.translation.convert_genomic_to_cds_notation(202) == '50+2' - def test_genomic_to_cds_notation_intronic_neg(self): - self.assertEqual('51-2', self.translation.convert_genomic_to_cds_notation(299)) + def test_genomic_to_cds_notation_intronic_neg(self, coord_conv_setup): + 
assert coord_conv_setup.translation.convert_genomic_to_cds_notation(299) == '51-2' - def test_genomic_to_nearest_cdna_exonic(self): - self.assertEqual((1, 0), self.transcript.convert_genomic_to_nearest_cdna(101)) - self.assertEqual((300, 0), self.transcript.convert_genomic_to_nearest_cdna(600)) - self.assertEqual((101, 0), self.transcript.convert_genomic_to_nearest_cdna(301)) + def test_genomic_to_nearest_cdna_exonic(self, coord_conv_setup): + assert coord_conv_setup.transcript.convert_genomic_to_nearest_cdna(101) == (1, 0) + assert coord_conv_setup.transcript.convert_genomic_to_nearest_cdna(600) == (300, 0) + assert coord_conv_setup.transcript.convert_genomic_to_nearest_cdna(301) == (101, 0) - def test_genomic_to_nearest_cdna_intronic_pos(self): - self.assertEqual((100, 10), self.transcript.convert_genomic_to_nearest_cdna(210)) + def test_genomic_to_nearest_cdna_intronic_pos(self, coord_conv_setup): + assert coord_conv_setup.transcript.convert_genomic_to_nearest_cdna(210) == (100, 10) - def test_genomic_to_nearest_cdna_intronic_neg(self): - self.assertEqual((101, -2), self.transcript.convert_genomic_to_nearest_cdna(299)) + def test_genomic_to_nearest_cdna_intronic_neg(self, coord_conv_setup): + assert coord_conv_setup.transcript.convert_genomic_to_nearest_cdna(299) == (101, -2) - def test_genomic_to_nearest_cdna_rev_exonic(self): - self.assertEqual((300, 0), self.rev_transcript.convert_genomic_to_nearest_cdna(101)) - self.assertEqual((1, 0), self.rev_transcript.convert_genomic_to_nearest_cdna(600)) - self.assertEqual((101, 0), self.rev_transcript.convert_genomic_to_nearest_cdna(400)) + def test_genomic_to_nearest_cdna_rev_exonic(self, coord_conv_setup): + assert coord_conv_setup.rev_transcript.convert_genomic_to_nearest_cdna(101) == (300, 0) + assert coord_conv_setup.rev_transcript.convert_genomic_to_nearest_cdna(600) == (1, 0) + assert coord_conv_setup.rev_transcript.convert_genomic_to_nearest_cdna(400) == (101, 0) - def 
test_genomic_to_nearest_cdna_rev_intronic_pos(self): - self.assertEqual((201, -10), self.rev_transcript.convert_genomic_to_nearest_cdna(210)) + def test_genomic_to_nearest_cdna_rev_intronic_pos(self, coord_conv_setup): + assert coord_conv_setup.rev_transcript.convert_genomic_to_nearest_cdna(210) == (201, -10) - def test_genomic_to_nearest_cdna_rev_intronic_neg(self): - self.assertEqual((200, 2), self.rev_transcript.convert_genomic_to_nearest_cdna(299)) + def test_genomic_to_nearest_cdna_rev_intronic_neg(self, coord_conv_setup): + assert coord_conv_setup.rev_transcript.convert_genomic_to_nearest_cdna(299) == (200, 2) -class TestUSTranscript(unittest.TestCase): +class TestUSTranscript: def test___init__implicit_start(self): t = PreTranscript(gene=None, exons=[(1, 100), (200, 300), (400, 500)], strand=STRAND.POS) - self.assertEqual(1, t.start) - self.assertEqual(t.start, t.start) - self.assertEqual(500, t.end) - self.assertEqual(t.end, t.end) - self.assertEqual(1, t[0]) - self.assertEqual(500, t[1]) - self.assertFalse(Interval.overlaps((0, 0), t)) - self.assertTrue(Interval.overlaps((1, 1), t)) - self.assertTrue(Interval.overlaps((1, 50), t)) + assert t.start == 1 + assert t.start == t.start + assert t.end == 500 + assert t.end == t.end + assert t[0] == 1 + assert t[1] == 500 + assert not Interval.overlaps((0, 0), t) + assert Interval.overlaps((1, 1), t) + assert Interval.overlaps((1, 50), t) def test___init__strand_mismatch(self): g = Gene('1', 1, 9999, name='KRAS', strand=STRAND.POS) - with self.assertRaises(AssertionError): + with pytest.raises(AssertionError): PreTranscript([(1, 100)], gene=g, strand=STRAND.NEG) def test___init__overlapping_exon_error(self): - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): PreTranscript(exons=[Exon(1, 15), Exon(10, 20)]) def test_exon_number(self): t = PreTranscript(gene=None, exons=[(1, 99), (200, 299), (400, 499)], strand=STRAND.POS) for i, e in enumerate(t.exons): - self.assertEqual(i + 1, 
t.exon_number(e)) + assert t.exon_number(e) == i + 1 t = PreTranscript(gene=None, exons=[(1, 99), (200, 299), (400, 499)], strand=STRAND.NEG) for i, e in enumerate(sorted(t.exons, key=lambda x: x.start, reverse=True)): - self.assertEqual(i + 1, t.exon_number(e)) + assert t.exon_number(e) == i + 1 -class TestDomain(unittest.TestCase): +class TestDomain: def test___init__region_error(self): - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): Domain('name', [(1, 3), (4, 3)]) def test_get_seq_from_ref(self): @@ -1177,13 +1358,13 @@ def test_get_seq_from_ref(self): t = PreTranscript(exons=[(2, 5), (7, 15)], gene=g) tl = Translation(4, 11, t, []) d = Domain('name', [(1, 2)], translation=tl) - self.assertEqual([translate('GGGGAT')], d.get_seqs(ref)) + assert d.get_seqs(ref) == [translate('GGGGAT')] def test_get_seq_from_translation_seq(self): t = PreTranscript(exons=[(2, 5), (7, 15)], seq='CCCTAATCCCCTTT', strand=STRAND.NEG) tl = Translation(4, 11, t, []) d = Domain('name', [(1, 2)], translation=tl) - self.assertEqual([translate('TAATCC')], d.get_seqs()) + assert d.get_seqs() == [translate('TAATCC')] def test_align_seq(self): regions = [ @@ -1213,15 +1394,15 @@ def test_align_seq(self): ) d = Domain('name', regions) - self.assertTrue(len(refseq) >= 578) + assert len(refseq) >= 578 match, total, temp = d.align_seq(refseq) - self.assertEqual(sum([len(d.seq) for d in regions]), total) - self.assertEqual(total, match) - self.assertEqual(len(regions), len(temp)) + assert total == sum([len(d.seq) for d in regions]) + assert match == total + assert len(temp) == len(regions) for dr1, dr2 in zip(temp, regions): - self.assertEqual(dr1.start, dr2.start) - self.assertEqual(dr1.end, dr2.end) - self.assertEqual(dr1.seq, dr2.seq) + assert dr2.start == dr1.start + assert dr2.end == dr1.end + assert dr2.seq == dr1.seq refseq = ( 'MHRPPRHMGNKAMEPMDSPLMSAIPRLRPLQPMGRPPMQLLMDSLPLVILLQLPPRHTASLSRGMALVLMIPPL' @@ -1236,7 +1417,7 @@ def test_align_seq(self): ) dom = 
Domain('name', [DomainRegion(1, len(d), d)]) - with self.assertRaises(UserWarning): + with pytest.raises(UserWarning): dom.align_seq(refseq) seq = ( @@ -1251,42 +1432,42 @@ def test_align_seq(self): d = 'IYVQGLNDSVTLDDLADFFKQCGVVKMNKRTGQPMIHIYLDKETGKPKGDATVSYEDPPTAKAAVEWFDGKDFQGSKLK' dom = Domain('name', [DomainRegion(1, len(d), d)]) - with self.assertRaises(UserWarning): + with pytest.raises(UserWarning): m, t, regions = dom.align_seq(seq) -class TestBioInterval(unittest.TestCase): +class TestBioInterval: def test___eq__(self): a = BioInterval(REF_CHR, 1, 2) b = BioInterval(REF_CHR, 1, 2) c = BioInterval('test2', 1, 2) - self.assertEqual(a, a) - self.assertEqual(a, b) - self.assertNotEqual(a, None) - self.assertNotEqual(a, c) + assert a == a + assert b == a + assert a is not None + assert c != a -class TestGene(unittest.TestCase): +class TestGene: def test___hash__(self): g1 = Gene(REF_CHR, 1, 2, 'name1', STRAND.POS) g2 = Gene(REF_CHR, 1, 2, 'name2', STRAND.POS) h = set([g1, g2]) - self.assertEqual(2, len(h)) + assert len(h) == 2 def test___eq__(self): g1 = Gene(REF_CHR, 1, 2, 'name1', STRAND.POS) g2 = Gene(REF_CHR, 1, 2, 'name2', STRAND.POS) - self.assertNotEqual(g1, g2) + assert g2 != g1 g3 = Gene('test2', 1, 2, 'name1', STRAND.POS) - self.assertNotEqual(g1, g3) - self.assertNotEqual(g3, g1) - self.assertNotEqual(g1, None) - self.assertNotEqual(None, g1) + assert g3 != g1 + assert g1 != g3 + assert None != g1 # noqa: E711 + assert g1 != None # noqa: E711 def test_get_seq(self): ref = {'1': MockObject(seq='AACCCTTTGGG')} g = Gene('1', 3, 8, strand=STRAND.POS) - self.assertEqual('CCCTTT', g.get_seq(ref)) + assert g.get_seq(ref) == 'CCCTTT' g = Gene(REF_CHR, 2836, 4144, strand=STRAND.POS) seq = ( 'GCAACTATATAATCTGTGGGAATATCTCCTTTTACACCTAGCCCTACTTCTGTCTGGCTACAGTCATTTATCTGGCTTTGGGAAATGTGACCACAGAATCAGATAT' @@ -1303,16 +1484,16 @@ def test_get_seq(self): 'CATCGATAAACATCACAAAATGACTACTGGTAACCACTATGAAACTCTTTAAGCGGTAGGTCCTGTATGAATTTTACTCCTCATGATTTGAAGATTATGCATAAAT' 
'TCCTTCTTCCTGTTATTTTGTTTCCAATTTAGTCTTT' ).upper() - self.assertEqual(seq, g.get_seq(REFERENCE_GENOME)) + assert g.get_seq(REFERENCE_GENOME) == seq -class TestAnnotationGathering(unittest.TestCase): +class TestAnnotationGathering: def test_overlapping_transcripts(self): b = Breakpoint('C', 1000, strand=STRAND.POS) g = Gene('C', 1, 9999, 'gene1', STRAND.POS) t = PreTranscript(exons=[(100, 199), (500, 699), (1200, 1300)], gene=g) g.transcripts.append(t) - self.assertTrue(Interval.overlaps(b, t)) + assert Interval.overlaps(b, t) t = PreTranscript(exons=[(100, 199), (500, 699), (800, 900)], gene=g) g.transcripts.append(t) h = Gene('C', 1, 9999, 'gene1', STRAND.NEG) @@ -1320,73 +1501,73 @@ def test_overlapping_transcripts(self): h.transcripts.append(t) d = {'C': [g, h]} tlist = overlapping_transcripts(d, b) - self.assertEqual(1, len(tlist)) + assert len(tlist) == 1 def test_breakpoint_within_gene(self): b = Breakpoint(REF_CHR, 150, 150) pos, neg = _gather_breakpoint_annotations(REFERENCE_ANNOTATIONS, b) - self.assertEqual(1, len(pos)) - self.assertEqual(1, len(neg)) - self.assertEqual(STRAND.POS, pos[0].get_strand()) - self.assertEqual(b.start, neg[0].start) - self.assertEqual(b.end, neg[0].end) - self.assertEqual(STRAND.NEG, neg[0].get_strand()) + assert len(pos) == 1 + assert len(neg) == 1 + assert pos[0].get_strand() == STRAND.POS + assert neg[0].start == b.start + assert neg[0].end == b.end + assert neg[0].get_strand() == STRAND.NEG def test_breakpoint_overlapping_gene(self): b = Breakpoint(REF_CHR, 150, 230) pos, neg = _gather_breakpoint_annotations(REFERENCE_ANNOTATIONS, b) - self.assertEqual(2, len(pos)) - self.assertEqual(201, pos[1].start) - self.assertEqual(b.end, pos[1].end) - self.assertEqual(1, len(neg)) - self.assertEqual(b.start, neg[0].start) - self.assertEqual(b.end, neg[0].end) + assert len(pos) == 2 + assert pos[1].start == 201 + assert pos[1].end == b.end + assert len(neg) == 1 + assert neg[0].start == b.start + assert neg[0].end == b.end b = 
Breakpoint(REF_CHR, 150, 225, strand=STRAND.POS) pos, neg = _gather_breakpoint_annotations(REFERENCE_ANNOTATIONS, b) - self.assertEqual(2, len(pos)) - self.assertEqual(100, pos[0].start) - self.assertEqual(200, pos[0].end) - self.assertEqual(201, pos[1].start) - self.assertEqual(b.end, pos[1].end) - self.assertEqual(1, len(neg)) - self.assertEqual(b.start, neg[0].start) - self.assertEqual(b.end, neg[0].end) + assert len(pos) == 2 + assert pos[0].start == 100 + assert pos[0].end == 200 + assert pos[1].start == 201 + assert pos[1].end == b.end + assert len(neg) == 1 + assert neg[0].start == b.start + assert neg[0].end == b.end b = Breakpoint(REF_CHR, 375, 425, strand=STRAND.POS) pos, neg = _gather_breakpoint_annotations(REFERENCE_ANNOTATIONS, b) - self.assertEqual(2, len(pos)) - self.assertEqual(300, pos[0].start) - self.assertEqual(400, pos[0].end) - self.assertEqual(401, pos[1].start) - self.assertEqual(b.end, pos[1].end) - self.assertEqual(1, len(neg)) - self.assertEqual(b.start, neg[0].start) - self.assertEqual(b.end, neg[0].end) + assert len(pos) == 2 + assert pos[0].start == 300 + assert pos[0].end == 400 + assert pos[1].start == 401 + assert pos[1].end == b.end + assert len(neg) == 1 + assert neg[0].start == b.start + assert neg[0].end == b.end def test_breakpoint_overlapping_mutliple_genes_and_intergenic(self): b = Breakpoint(REF_CHR, 150, 275) pos, neg = _gather_breakpoint_annotations(REFERENCE_ANNOTATIONS, b) - self.assertEqual(2, len(pos)) - self.assertEqual(201, pos[1].start) - self.assertEqual(b.end, pos[1].end) - self.assertEqual(2, len(neg)) - self.assertEqual(b.start, neg[0].start) - self.assertEqual(249, neg[0].end) + assert len(pos) == 2 + assert pos[1].start == 201 + assert pos[1].end == b.end + assert len(neg) == 2 + assert neg[0].start == b.start + assert neg[0].end == 249 def test_breakpoint_overlapping_mutliple_pos_genes(self): b = Breakpoint(REF_CHR, 575, 625) pos, neg = _gather_breakpoint_annotations(REFERENCE_ANNOTATIONS, b) - 
self.assertEqual(2, len(pos)) - self.assertEqual(1, len(neg)) - self.assertEqual(b.start, neg[0].start) - self.assertEqual(b.end, neg[0].end) + assert len(pos) == 2 + assert len(neg) == 1 + assert neg[0].start == b.start + assert neg[0].end == b.end def test_breakpoint_overlapping_mutliple_genes(self): b = Breakpoint(REF_CHR, 300, 350) pos, neg = _gather_breakpoint_annotations(REFERENCE_ANNOTATIONS, b) - self.assertEqual(1, len(pos)) - self.assertEqual(1, len(neg)) + assert len(pos) == 1 + assert len(neg) == 1 def test_intrachromosomal(self): b1 = Breakpoint(REF_CHR, 150, 225, strand=STRAND.POS) @@ -1395,16 +1576,16 @@ def test_intrachromosomal(self): ann_list = sorted( _gather_annotations(REFERENCE_ANNOTATIONS, bpp), key=lambda x: (x.break1, x.break2) ) - self.assertEqual(5, len(ann_list)) + assert len(ann_list) == 5 first = ann_list[0] - self.assertEqual(1, len(first.encompassed_genes)) - self.assertEqual(0, len(first.genes_proximal_to_break1)) - self.assertEqual(1, len(first.genes_proximal_to_break2)) - self.assertEqual(0, len(first.genes_overlapping_break1)) - self.assertEqual(0, len(first.genes_overlapping_break2)) + assert len(first.encompassed_genes) == 1 + assert len(first.genes_proximal_to_break1) == 0 + assert len(first.genes_proximal_to_break2) == 1 + assert len(first.genes_overlapping_break1) == 0 + assert len(first.genes_overlapping_break2) == 0 near, dist = list(first.genes_proximal_to_break2)[0] - self.assertEqual(50, dist) - self.assertEqual(2, len(ann_list[1].encompassed_genes)) + assert dist == 50 + assert len(ann_list[1].encompassed_genes) == 2 def test_interchromosomal(self): raise unittest.SkipTest('TODO') @@ -1419,8 +1600,8 @@ def test_intrachromosomal_within_gene_inversion(self): b2 = Breakpoint(REF_CHR, 2250, strand=STRAND.NEG) bpp = BreakpointPair(b1, b2) ann_list = sorted(_gather_annotations(ref, bpp), key=lambda x: (x.break1, x.break2)) - self.assertEqual(1, len(ann_list)) - self.assertEqual(ann_list[0].transcript1, 
ann_list[0].transcript2) + assert len(ann_list) == 1 + assert ann_list[0].transcript2 == ann_list[0].transcript1 def test_breakpoint_single_gene(self): g = Gene(REF_CHR, 1000, 3000, strand=STRAND.POS) @@ -1431,40 +1612,40 @@ def test_breakpoint_single_gene(self): b2 = Breakpoint(REF_CHR, 800, strand=STRAND.POS) bpp = BreakpointPair(b1, b2, event_type=SVTYPE.DEL, protocol=PROTOCOL.GENOME) ann_list = sorted(_gather_annotations(ref, bpp), key=lambda x: (x.break1, x.break2)) - self.assertEqual(3, len(ann_list)) + assert len(ann_list) == 3 for ann in ann_list: - self.assertTrue(ann.break1.start in ann.transcript1.position) - self.assertTrue(ann.break1.end in ann.transcript1.position) - self.assertTrue(ann.break2.start in ann.transcript2.position) - self.assertTrue(ann.break2.end in ann.transcript2.position) + assert ann.break1.start in ann.transcript1.position + assert ann.break1.end in ann.transcript1.position + assert ann.break2.start in ann.transcript2.position + assert ann.break2.end in ann.transcript2.position -class TestAnnotate(unittest.TestCase): +class TestAnnotate: def test_reference_name_eq(self): first, second = ReferenceName('chr1'), ReferenceName('1') - self.assertEqual(first, second) + assert second == first def test_reference_name_set(self): first, second = ReferenceName('chr1'), ReferenceName('1') d = {first, second} - self.assertEqual(1, len(d)) + assert len(d) == 1 def test_reference_name_dict(self): first, second = ReferenceName('chr1'), ReferenceName('1') d = {first: 1} d[second] = 2 print(d) - self.assertEqual(1, len(d)) + assert len(d) == 1 d = {first: 1, second: 2} - self.assertEqual(1, len(d)) + assert len(d) == 1 def test_loading_json_annotations(self): annotations = load_annotations(get_data('mock_reference_annotations.json')) - self.assertEqual(1, len(annotations.keys())) - self.assertEqual(1, len(list(annotations.values())[0])) + assert len(annotations.keys()) == 1 + assert len(list(annotations.values())[0]) == 1 def 
test_loading_annotations_not_found(self): - with self.assertRaises(FileNotFoundError): + with pytest.raises(FileNotFoundError): load_annotations('file.other') def test_determine_prime(self): @@ -1473,22 +1654,22 @@ def test_determine_prime(self): bleft = Breakpoint(REF_CHR, 1, 2, orient=ORIENT.LEFT) bright = Breakpoint(REF_CHR, 1, 2, orient=ORIENT.RIGHT) # positive left should be five prime - self.assertEqual(PRIME.FIVE, determine_prime(tpos, bleft)) + assert determine_prime(tpos, bleft) == PRIME.FIVE # positive right should be three prime - self.assertEqual(PRIME.THREE, determine_prime(tpos, bright)) + assert determine_prime(tpos, bright) == PRIME.THREE # negative left should be three prime - self.assertEqual(PRIME.THREE, determine_prime(tneg, bleft)) + assert determine_prime(tneg, bleft) == PRIME.THREE # negative right should be five prime - self.assertEqual(PRIME.FIVE, determine_prime(tneg, bright)) + assert determine_prime(tneg, bright) == PRIME.FIVE - with self.assertRaises(NotSpecifiedError): + with pytest.raises(NotSpecifiedError): bleft.orient = ORIENT.NS determine_prime(tpos, bleft) - with self.assertRaises(NotSpecifiedError): + with pytest.raises(NotSpecifiedError): determine_prime(tneg, bleft) - with self.assertRaises(NotSpecifiedError): + with pytest.raises(NotSpecifiedError): tpos.strand = STRAND.NS determine_prime(tpos, bright) @@ -1520,11 +1701,11 @@ def test_calculate_orf_nested(self): ) orfs = calculate_orf(seq) for orf in orfs: - self.assertEqual('ATG', seq[orf.start - 1 : orf.start + 2]) + assert seq[orf.start - 1 : orf.start + 2] == 'ATG' orfs = sorted(orfs) - self.assertEqual(2, len(orfs)) - self.assertEqual(Interval(1, 894), orfs[0]) - self.assertEqual(Interval(590, 724), orfs[1]) + assert len(orfs) == 2 + assert orfs[0] == Interval(1, 894) + assert orfs[1] == Interval(590, 724) seq = ( 'AAGGAGAGAAAATGGCGTCCACGGATTACAGTACCTATAGCCAAGCTGCAGCGCAGCAGGGCTACAGTGCTTACACCGCCCAGCCCACTCAAGGATATGC' @@ -1551,10 +1732,10 @@ def 
test_calculate_orf_nested(self): orfs = calculate_orf(seq) for orf in orfs: - self.assertEqual('ATG', seq[orf.start - 1 : orf.start + 2]) + assert seq[orf.start - 1 : orf.start + 2] == 'ATG' -class TestAnnotateEvents(unittest.TestCase): +class TestAnnotateEvents: def test_annotate_events(self): reference_annotations = load_annotations(get_data('mock_reference_annotations.full.tsv')) b1 = Breakpoint('fakereference9', 658, orient=ORIENT.RIGHT, strand=STRAND.POS) @@ -1570,14 +1751,14 @@ def test_annotate_events(self): annotations = annotate_events( [bpp], reference_genome=REFERENCE_GENOME, annotations=reference_annotations, filters=[] ) - self.assertEqual(4, len(annotations)) - self.assertEqual(STRAND.POS, annotations[0].transcript1.get_strand()) - self.assertEqual(STRAND.NEG, annotations[0].transcript2.get_strand()) - self.assertEqual('ENST00000375851', annotations[0].transcript1.name) - self.assertEqual(None, annotations[0].transcript2.name) + assert len(annotations) == 4 + assert annotations[0].transcript1.get_strand() == STRAND.POS + assert annotations[0].transcript2.get_strand() == STRAND.NEG + assert annotations[0].transcript1.name == 'ENST00000375851' + assert annotations[0].transcript2.name is None for ann in annotations: print(ann.transcript1, ann.transcript2) annotations = annotate_events( [bpp], reference_genome=REFERENCE_GENOME, annotations=reference_annotations ) - self.assertEqual(2, len(annotations)) + assert len(annotations) == 2 diff --git a/tests/integration/test_annotate_examples.py b/tests/integration/test_annotate_examples.py index 1e28f845..f6ed15ee 100644 --- a/tests/integration/test_annotate_examples.py +++ b/tests/integration/test_annotate_examples.py @@ -1,6 +1,3 @@ -import os -import unittest - from mavis.annotate.fusion import FusionTranscript from mavis.annotate.variant import ( Annotation, @@ -22,23 +19,20 @@ def get_best(gene): raise KeyError('no best transcript for gene', gene) -class TestNDUFA12(unittest.TestCase): - def setUp(self): - 
print(get_example_genes().keys()) - self.gene = get_example_genes()['NDUFA12'] - self.reference_annotations = {self.gene.chr: [self.gene]} - self.reference_genome = { - self.gene.chr: MockObject(seq=MockLongString(self.gene.seq, offset=self.gene.start - 1)) +class TestNDUFA12: + def test_annotate_events_synonymous(self): + gene = get_example_genes()['NDUFA12'] + reference_annotations = {gene.chr: [gene]} + reference_genome = { + gene.chr: MockObject(seq=MockLongString(gene.seq, offset=gene.start - 1)) } - self.best = get_best(self.gene) - def test_annotate_events_synonymous(self): - for gene_list in self.reference_annotations.values(): + for gene_list in reference_annotations.values(): for gene in gene_list: for t in gene.transcripts: print(t) - b1 = Breakpoint(self.gene.chr, 95344068, orient=ORIENT.LEFT, strand=STRAND.NS) - b2 = Breakpoint(self.gene.chr, 95344379, orient=ORIENT.RIGHT, strand=STRAND.NS) + b1 = Breakpoint(gene.chr, 95344068, orient=ORIENT.LEFT, strand=STRAND.NS) + b2 = Breakpoint(gene.chr, 95344379, orient=ORIENT.RIGHT, strand=STRAND.NS) bpp = BreakpointPair( b1, b2, @@ -49,28 +43,27 @@ def test_annotate_events_synonymous(self): untemplated_seq='', ) annotations = annotate_events( - [bpp], reference_genome=self.reference_genome, annotations=self.reference_annotations + [bpp], reference_genome=reference_genome, annotations=reference_annotations ) ann = annotations[0] for a in annotations: print(a, a.fusion, a.fusion.transcripts) print(a.transcript1, a.transcript1.transcripts) fseq = ann.fusion.transcripts[0].get_seq() - refseq = ann.transcript1.transcripts[0].get_seq(self.reference_genome) - self.assertEqual(refseq, fseq) - self.assertEqual(1, len(annotations)) + refseq = ann.transcript1.transcripts[0].get_seq(reference_genome) + assert fseq == refseq + assert len(annotations) == 1 -class TestARID1B(unittest.TestCase): - def setUp(self): - self.gene = get_example_genes()['ARID1B'] - self.reference_annotations = {self.gene.chr: [self.gene]} - 
self.reference_genome = { - self.gene.chr: MockObject(seq=MockLongString(self.gene.seq, offset=self.gene.start - 1)) +class TestARID1B: + def test_small_duplication(self): + gene = get_example_genes()['ARID1B'] + reference_annotations = {gene.chr: [gene]} + reference_genome = { + gene.chr: MockObject(seq=MockLongString(gene.seq, offset=gene.start - 1)) } - self.best = get_best(self.gene) + best = get_best(gene) - def test_small_duplication(self): bpp = BreakpointPair( Breakpoint('6', 157100005, strand='+', orient='R'), Breakpoint('6', 157100007, strand='+', orient='L'), @@ -80,41 +73,40 @@ def test_small_duplication(self): ) # annotate the breakpoint with the gene annotations = annotate_events( - [bpp], reference_genome=self.reference_genome, annotations=self.reference_annotations + [bpp], reference_genome=reference_genome, annotations=reference_annotations ) - self.assertEqual(1, len(annotations)) + assert len(annotations) == 1 - ann = Annotation(bpp, transcript1=self.best, transcript2=self.best) + ann = Annotation(bpp, transcript1=best, transcript2=best) ft = FusionTranscript.build( ann, - self.reference_genome, + reference_genome, min_orf_size=300, max_orf_cap=10, min_domain_mapping_match=0.9, ) - ref_tx = self.best.translations[0] + ref_tx = best.translations[0] fusion_tx = ft.translations[0] # compare the fusion translation to the refernece translation to create the protein notation - ref_aa_seq = ref_tx.get_aa_seq(self.reference_genome) + ref_aa_seq = ref_tx.get_aa_seq(reference_genome) call = IndelCall(ref_aa_seq, fusion_tx.get_aa_seq()) - self.assertTrue(call.is_dup) + assert call.is_dup - notation = call_protein_indel(ref_tx, fusion_tx, self.reference_genome) + notation = call_protein_indel(ref_tx, fusion_tx, reference_genome) print(notation) - self.assertEqual('ENST00000346085:p.G319dupG', notation) + assert notation == 'ENST00000346085:p.G319dupG' -class TestSVEP1(unittest.TestCase): - def setUp(self): - self.gene = get_example_genes()['SVEP1'] - 
self.reference_annotations = {self.gene.chr: [self.gene]} - self.reference_genome = { - self.gene.chr: MockObject(seq=MockLongString(self.gene.seq, offset=self.gene.start - 1)) +class TestSVEP1: + def test_annotate_small_intronic_inversion(self): + gene = get_example_genes()['SVEP1'] + reference_annotations = {gene.chr: [gene]} + reference_genome = { + gene.chr: MockObject(seq=MockLongString(gene.seq, offset=gene.start - 1)) } - self.best = get_best(self.gene) + best = get_best(gene) - def test_annotate_small_intronic_inversion(self): bpp = BreakpointPair( Breakpoint('9', 113152627, 113152627, orient='L'), Breakpoint('9', 113152635, 113152635, orient='L'), @@ -125,19 +117,25 @@ def test_annotate_small_intronic_inversion(self): untemplated_seq='', ) annotations = annotate_events( - [bpp], reference_genome=self.reference_genome, annotations=self.reference_annotations + [bpp], reference_genome=reference_genome, annotations=reference_annotations ) for a in annotations: print(a, a.transcript1, a.transcript2) - self.assertEqual(1, len(annotations)) + assert len(annotations) == 1 ann = annotations[0] - self.assertEqual(self.best, ann.transcript1) - self.assertEqual(self.best, ann.transcript2) - refseq = self.best.transcripts[0].get_seq(self.reference_genome) - self.assertEqual(1, len(ann.fusion.transcripts)) - self.assertEqual(refseq, ann.fusion.transcripts[0].get_seq()) + assert ann.transcript1 == best + assert ann.transcript2 == best + refseq = best.transcripts[0].get_seq(reference_genome) + assert len(ann.fusion.transcripts) == 1 + assert ann.fusion.transcripts[0].get_seq() == refseq def test_build_single_transcript_inversion(self): + gene = get_example_genes()['SVEP1'] + reference_genome = { + gene.chr: MockObject(seq=MockLongString(gene.seq, offset=gene.start - 1)) + } + best = get_best(gene) + bpp = BreakpointPair( Breakpoint('9', 113152627, 113152627, orient='L'), Breakpoint('9', 113152635, 113152635, orient='L'), @@ -147,29 +145,27 @@ def 
test_build_single_transcript_inversion(self): protocol=PROTOCOL.GENOME, untemplated_seq='', ) - ann = Annotation(bpp, transcript1=self.best, transcript2=self.best) + ann = Annotation(bpp, transcript1=best, transcript2=best) ft = FusionTranscript.build( ann, - self.reference_genome, + reference_genome, min_orf_size=300, max_orf_cap=10, min_domain_mapping_match=0.9, ) - refseq = self.best.transcripts[0].get_seq(self.reference_genome) - self.assertEqual(1, len(ft.transcripts)) - self.assertEqual(refseq, ft.transcripts[0].get_seq()) + refseq = best.transcripts[0].get_seq(reference_genome) + assert len(ft.transcripts) == 1 + assert ft.transcripts[0].get_seq() == refseq -class TestPRKCB(unittest.TestCase): - def setUp(self): - self.gene = get_example_genes()['PRKCB'] - self.reference_annotations = {self.gene.chr: [self.gene]} - self.reference_genome = { - self.gene.chr: MockObject(seq=MockLongString(self.gene.seq, offset=self.gene.start - 1)) +class TestPRKCB: + def test_retained_intron(self): + gene = get_example_genes()['PRKCB'] + reference_genome = { + gene.chr: MockObject(seq=MockLongString(gene.seq, offset=gene.start - 1)) } - self.best = get_best(self.gene) + best = get_best(gene) - def test_retained_intron(self): bpp = BreakpointPair( Breakpoint('16', 23957049, orient='L'), Breakpoint('16', 23957050, orient='R'), @@ -179,31 +175,29 @@ def test_retained_intron(self): protocol=PROTOCOL.TRANS, untemplated_seq='A', ) - ann = Annotation(bpp, transcript1=self.best, transcript2=self.best) + ann = Annotation(bpp, transcript1=best, transcript2=best) ft = FusionTranscript.build( ann, - self.reference_genome, + reference_genome, min_orf_size=300, max_orf_cap=10, min_domain_mapping_match=0.9, ) - self.assertEqual(1, len(ft.transcripts)) + assert len(ft.transcripts) == 1 print(ft.transcripts[0].splicing_pattern) - print(self.best.transcripts[0].splicing_pattern) - self.assertEqual(SPLICE_TYPE.RETAIN, ft.transcripts[0].splicing_pattern.splice_type) + 
print(best.transcripts[0].splicing_pattern) + assert ft.transcripts[0].splicing_pattern.splice_type == SPLICE_TYPE.RETAIN -class TestDSTYK(unittest.TestCase): - def setUp(self): +class TestDSTYK: + def test_build_single_transcript_inversion_reverse_strand(self): print(get_example_genes().keys()) - self.gene = get_example_genes()['DSTYK'] - self.reference_annotations = {self.gene.chr: [self.gene]} - self.reference_genome = { - self.gene.chr: MockObject(seq=MockLongString(self.gene.seq, offset=self.gene.start - 1)) + gene = get_example_genes()['DSTYK'] + reference_genome = { + gene.chr: MockObject(seq=MockLongString(gene.seq, offset=gene.start - 1)) } - self.best = get_best(self.gene) + best = get_best(gene) - def test_build_single_transcript_inversion_reverse_strand(self): # 1:205178631R 1:205178835R inversion bpp = BreakpointPair( Breakpoint('1', 205178631, orient='R'), @@ -214,10 +208,10 @@ def test_build_single_transcript_inversion_reverse_strand(self): protocol=PROTOCOL.GENOME, untemplated_seq='', ) - ann = Annotation(bpp, transcript1=self.best, transcript2=self.best) + ann = Annotation(bpp, transcript1=best, transcript2=best) ft = FusionTranscript.build( ann, - self.reference_genome, + reference_genome, min_orf_size=300, max_orf_cap=10, min_domain_mapping_match=0.9, @@ -233,8 +227,8 @@ def test_build_single_transcript_inversion_reverse_strand(self): len(ft.exon_mapping.get(ex.position, None)), ft.exon_number(ex), ) - # refseq = self.best.transcripts[0].get_seq(self.reference_genome) - self.assertEqual(1, len(ft.transcripts)) - self.assertEqual(1860, ft.break1) - self.assertEqual(2065, ft.break2) + # refseq = best.transcripts[0].get_seq(reference_genome) + assert len(ft.transcripts) == 1 + assert ft.break1 == 1860 + assert ft.break2 == 2065 flatten_fusion_transcript(ft.transcripts[0]) # test no error diff --git a/tests/integration/test_annotate_fileio.py b/tests/integration/test_annotate_fileio.py index 6eacbe8b..926928aa 100644 --- 
a/tests/integration/test_annotate_fileio.py +++ b/tests/integration/test_annotate_fileio.py @@ -1,28 +1,24 @@ -import os -import unittest - from mavis.annotate.file_io import convert_tab_to_json, load_annotations from ..util import get_data +TAB = get_data('annotations_subsample.tab') +JSON = get_data('annotations_subsample.json') -class TestAnnotationLoading(unittest.TestCase): - def setUp(self): - self.tab = get_data('annotations_subsample.tab') - self.json = get_data('annotations_subsample.json') +class TestAnnotationLoading: def test_convert_tab_to_json(self): - json = convert_tab_to_json(self.tab, warn=print) - self.assertEqual(32, len(json['genes'])) + json = convert_tab_to_json(TAB, warn=print) + assert len(json['genes']) == 32 def test_tab_equivalent_to_json(self): - tab_result = load_annotations(self.tab, warn=print) - json_result = load_annotations(self.json, warn=print) - self.assertEqual(sorted(tab_result.keys()), sorted(json_result.keys())) + tab_result = load_annotations(TAB, warn=print) + json_result = load_annotations(JSON, warn=print) + assert sorted(json_result.keys()) == sorted(tab_result.keys()) def test_load_tab(self): - result = load_annotations(self.tab, warn=print) - self.assertEqual(12, len(result.keys())) + result = load_annotations(TAB, warn=print) + assert len(result.keys()) == 12 domains = [] for gene in result['12']: for t in gene.spliced_transcripts: @@ -35,10 +31,10 @@ def test_load_tab(self): break for d in domains: print(d.name, d.regions) - self.assertEqual(2, len(domains)) + assert len(domains) == 2 result = load_annotations(get_data('mock_reference_annotations.tsv'), warn=print) - self.assertEqual(1, len(result.keys())) + assert len(result.keys()) == 1 def test_load_json(self): - result = load_annotations(self.json, warn=print) - self.assertEqual(12, len(result.keys())) + result = load_annotations(JSON, warn=print) + assert len(result.keys()) == 12 diff --git a/tests/integration/test_args.py b/tests/integration/test_args.py 
index 0dea660f..db14bc55 100644 --- a/tests/integration/test_args.py +++ b/tests/integration/test_args.py @@ -1,6 +1,4 @@ -import argparse import json -import os import sys import tempfile from unittest.mock import patch diff --git a/tests/integration/test_assemble.py b/tests/integration/test_assemble.py index 9903f8ff..8cae4394 100644 --- a/tests/integration/test_assemble.py +++ b/tests/integration/test_assemble.py @@ -1,7 +1,6 @@ -import os import time -import unittest +import pytest import timeout_decorator from mavis.assemble import Contig, assemble, filter_contigs from mavis.constants import reverse_complement @@ -9,11 +8,11 @@ from mavis.schemas import DEFAULTS from mavis.util import LOG -from ..util import get_data -from . import RUN_FULL, MockObject +from ..util import get_data, long_running_test +from . import MockObject -class TestFilterContigs(unittest.TestCase): +class TestFilterContigs: @timeout_decorator.timeout(30) def test_large_set(self): contigs = [] @@ -27,61 +26,66 @@ def test_large_set(self): print() for c in filtered: print(c.seq) - self.assertEqual(3, len(filtered)) # figure out amount later. need to optimize timing + assert len(filtered) == 3 # figure out amount later. 
need to optimize timing -class TestContigRemap(unittest.TestCase): - def setUp(self): - self.contig = Contig(' ' * 60, None) - self.contig.add_mapped_sequence(MockObject(reference_start=0, reference_end=10)) - self.contig.add_mapped_sequence(MockObject(reference_start=0, reference_end=20)) - self.contig.add_mapped_sequence(MockObject(reference_start=50, reference_end=60)) +@pytest.fixture +def contig(): + contig = Contig(' ' * 60, None) + contig.add_mapped_sequence(MockObject(reference_start=0, reference_end=10)) + contig.add_mapped_sequence(MockObject(reference_start=0, reference_end=20)) + contig.add_mapped_sequence(MockObject(reference_start=50, reference_end=60)) + return contig - def test_depth_even_coverage(self): - covg = self.contig.remap_depth(Interval(1, 10)) - self.assertEqual(2, covg) - def test_depth_mixed_coverage(self): - covg = self.contig.remap_depth(Interval(1, 20)) - self.assertEqual(1.5, covg) +class TestContigRemap: + def test_depth_even_coverage(self, contig): + covg = contig.remap_depth(Interval(1, 10)) + assert covg == 2 - def test_depth_no_coverage(self): - covg = self.contig.remap_depth(Interval(21, 49)) - self.assertEqual(0, covg) + def test_depth_mixed_coverage(self, contig): + covg = contig.remap_depth(Interval(1, 20)) + assert covg == 1.5 - def test_depth_whole_contig_coverage(self): - self.assertAlmostEqual(40 / 60, self.contig.remap_depth()) + def test_depth_no_coverage(self, contig): + covg = contig.remap_depth(Interval(21, 49)) + assert covg == 0 - def test_depth_weighted_read(self): - self.contig.add_mapped_sequence(MockObject(reference_start=0, reference_end=10), 5) - self.assertAlmostEqual(42 / 60, self.contig.remap_depth()) + def test_depth_whole_contig_coverage(self, contig): + assert pytest.approx(contig.remap_depth()) == 40 / 60 - def test_depth_bad_query_range(self): - with self.assertRaises(ValueError): - self.contig.remap_depth(Interval(0, 10)) - with self.assertRaises(ValueError): - self.contig.remap_depth(Interval(1, 
len(self.contig.seq) + 1)) + def test_depth_weighted_read(self, contig): + contig.add_mapped_sequence(MockObject(reference_start=0, reference_end=10), 5) + assert pytest.approx(contig.remap_depth()) == 42 / 60 - def test_coverage(self): - self.assertEqual(0.5, self.contig.remap_coverage()) + def test_depth_bad_query_range(self, contig): + with pytest.raises(ValueError): + contig.remap_depth(Interval(0, 10)) + with pytest.raises(ValueError): + contig.remap_depth(Interval(1, len(contig.seq) + 1)) + def test_coverage(self, contig): + assert contig.remap_coverage() == 0.5 -class TestAssemble(unittest.TestCase): - @classmethod - def setUpClass(cls): - # load files here so they do not count towar timeout checking - sequences = set() - with open(get_data('long_filter_assembly.txt'), 'r') as fh: - sequences.update([s.strip() for s in fh.readlines() if s]) - cls.long_filter_seq = sequences - sequences = set() - with open(get_data('large_assembly.txt'), 'r') as fh: - sequences.update([line.strip() for line in fh.readlines()]) - cls.large_assembly_seq = sequences - def setUp(self): - self.log = lambda *x, **k: print(x, k) +@pytest.fixture +def long_filter_seq(): + # load files here so they do not count towar timeout checking + sequences = set() + with open(get_data('long_filter_assembly.txt'), 'r') as fh: + sequences.update([s.strip() for s in fh.readlines() if s]) + return sequences + +@pytest.fixture +def large_assembly_seq(): + sequences = set() + with open(get_data('large_assembly.txt'), 'r') as fh: + sequences.update([line.strip() for line in fh.readlines()]) + return sequences + + +class TestAssemble: def test1(self): sequences = [ 'TCTTTTTCTTTCTTTCTTTCTTTCTTTCTATTCTATCTTCTTCCTGACTCTTCCTAGCTTAGTCTTACTGACAAGCATGTTACCTTCTTTTTATTTTTGTTTTTAAACCACATTGATCGTAAATCGCCGTGCTTGGTGCTTAATGTACTT', @@ -177,11 +181,10 @@ def test1(self): remap_min_exact_match=6, assembly_max_paths=20, assembly_min_uniq=0.01, - log=self.log, ) for contig in assembly: print(contig.seq) - 
self.assertTrue(assembly) + assert assembly def test_assembly_low_center(self): sequences = { @@ -245,11 +248,10 @@ def test_assembly_low_center(self): remap_min_exact_match=6, assembly_max_paths=20, assembly_min_uniq=0.01, - log=self.log, ) for assembly in assemblies: print(assembly.seq) - self.assertEqual(2, len(assemblies)) + assert len(assemblies) == 2 def test_low_evidence(self): seqs = [ @@ -280,11 +282,10 @@ def test_low_evidence(self): remap_min_exact_match=6, assembly_max_paths=20, assembly_min_uniq=0.01, - log=self.log, ) for assembly in assemblies: print(assembly.seq, assembly.remap_score()) - self.assertEqual(2, len(assemblies)) + assert len(assemblies) == 2 def test_multiple_events(self): sequences = { @@ -341,28 +342,23 @@ def test_multiple_events(self): remap_min_exact_match=DEFAULTS['validate.assembly_min_exact_match_to_remap'], assembly_max_paths=DEFAULTS['validate.assembly_max_paths'], assembly_min_uniq=0.01, - log=self.log, ) print('assemblies', len(assemblies)) for assembly in assemblies: print(assembly.seq, assembly.remap_score()) print(reverse_complement(assembly.seq)) expected = 'ACCAGGTCTTCGATATATAAAAACCCTAGGTCGGCCGGTCGGCCGTGTTAGTGAGACACACACACACACATGTATACCCGTGCGCGCCCGCGGGAGAGAGAGAGAGAGAGATATATATATAGCAGACCAGGAGAGCGAGAGCGAGAGAGATATAGAGAGATCGCGCGCGAGAGAGATAGGAGACC' - self.assertEqual(expected, assemblies[0].seq) - self.assertEqual(1, len(assemblies)) + assert assemblies[0].seq == expected + assert len(assemblies) == 1 @timeout_decorator.timeout(300) - @unittest.skipIf( - not RUN_FULL, - 'slower tests will not be run unless the environment variable RUN_FULL is given', - ) - def test_large_assembly(self): + @long_running_test + def test_large_assembly(self, large_assembly_seq): # simply testing that this will complete before the timeout - sequences = self.large_assembly_seq kmer_size = 150 * DEFAULTS['validate.assembly_kmer_size'] print('read inputs') contigs = assemble( - sequences, + large_assembly_seq, kmer_size, 
min_edge_trim_weight=DEFAULTS['validate.assembly_min_edge_trim_weight'], assembly_max_paths=DEFAULTS['validate.assembly_max_paths'], @@ -374,7 +370,7 @@ def test_large_assembly(self): for contig in contigs: print(len(contig.seq), contig.remap_score()) print(contig.seq) - self.assertTrue(len(contigs)) + assert len(contigs) def test_assemble_short_contig(self): sequences = { @@ -626,16 +622,12 @@ def test_assemble_short_contig(self): print('target', target) for contig in contigs: print(len(contig.seq), contig.remap_score(), contig.seq) - self.assertTrue({target, reverse_complement(target)} & {c.seq for c in contigs}) + assert {target, reverse_complement(target)} & {c.seq for c in contigs} @timeout_decorator.timeout(120) - @unittest.skipIf( - not RUN_FULL, - 'slower tests will not be run unless the environment variable RUN_FULL is given', - ) - def test_long_filter_bug(self): - sequences = self.long_filter_seq - contigs = assemble(sequences, 111, 3, 8, 0.1, 0.1, log=LOG) + @long_running_test + def test_long_filter_bug(self, long_filter_seq): + contigs = assemble(long_filter_seq, 111, 3, 8, 0.1, 0.1, log=LOG) for c in contigs: print(c.seq, c.remap_score()) - self.assertTrue(len(contigs)) + assert len(contigs) diff --git a/tests/integration/test_bam.py b/tests/integration/test_bam.py index 0712bc57..7f8b87f7 100644 --- a/tests/integration/test_bam.py +++ b/tests/integration/test_bam.py @@ -1,9 +1,9 @@ +import argparse import logging -import os -import unittest import warnings from unittest import mock +import pytest import timeout_decorator from mavis.annotate.file_io import load_annotations, load_reference_genome from mavis.bam import cigar as _cigar @@ -16,15 +16,7 @@ sequenced_strand, ) from mavis.bam.stats import Histogram, compute_genome_bam_stats, compute_transcriptome_bam_stats -from mavis.constants import ( - CIGAR, - DNA_ALPHABET, - NA_MAPPING_QUALITY, - ORIENT, - READ_PAIR_TYPE, - STRAND, - SVTYPE, -) +from mavis.constants import CIGAR, DNA_ALPHABET, ORIENT, 
READ_PAIR_TYPE, STRAND, SVTYPE from mavis.interval import Interval from ..util import get_data @@ -44,23 +36,23 @@ def setUpModule(): raise AssertionError('fake genome file does not have the expected contents') -class TestBamCache(unittest.TestCase): +class TestBamCache: def test___init__(self): fh = MockBamFileHandle() b = BamCache(fh) - self.assertEqual(fh, b.fh) + assert b.fh == fh def test_add_read(self): fh = MockBamFileHandle() b = BamCache(fh) r = mock.MagicMock(query_name='name', query_sequence='') b.add_read(r) - self.assertEqual(1, len(b.cache.values())) + assert len(b.cache.values()) == 1 b.add_read(r) - self.assertEqual(1, len(b.cache.values())) + assert len(b.cache.values()) == 1 r.reference_start = 0 b.add_read(r) - self.assertEqual(1, len(b.cache.values())) + assert len(b.cache.values()) == 1 @mock.patch('mavis.util.LOG') def test_add_invalid_read(self, log_patcher): @@ -69,7 +61,7 @@ def test_add_invalid_read(self, log_patcher): ) cache = BamCache(MockBamFileHandle()) cache.add_read(bad_read) - self.assertEqual(0, len(cache.cache)) + assert len(cache.cache) == 0 log_patcher.assert_called_with('ignoring invalid read', 'BAD_READ', level=logging.DEBUG) @mock.patch('mavis.util.LOG') @@ -81,7 +73,7 @@ def test_fetch_invalid_read(self, log_patcher): fh.configure_mock(**{'fetch.return_value': [bad_read]}) cache = BamCache(fh) cache.fetch('chr', 1, 10) - self.assertEqual(0, len(cache.cache)) + assert len(cache.cache) == 0 log_patcher.assert_called_with('ignoring invalid read', 'BAD_READ', level=logging.DEBUG) @mock.patch('mavis.util.LOG') @@ -93,85 +85,88 @@ def test_bin_fetch_invalid_read(self, log_patcher): fh.configure_mock(**{'fetch.return_value': [bad_read]}) cache = BamCache(fh) cache.fetch_from_bins('chr', 1, 10) - self.assertEqual(0, len(cache.cache)) + assert len(cache.cache) == 0 log_patcher.assert_called_with('ignoring invalid read', 'BAD_READ', level=logging.DEBUG) def test_reference_id(self): fh = MockBamFileHandle({'1': 0}) b = BamCache(fh) - 
self.assertEqual(0, b.reference_id('1')) - with self.assertRaises(KeyError): + assert b.reference_id('1') == 0 + with pytest.raises(KeyError): b.reference_id('2') def test_get_read_reference_name(self): fh = MockBamFileHandle({'1': 0}) b = BamCache(fh) r = MockRead('name', 0) - self.assertEqual('1', b.get_read_reference_name(r)) + assert b.get_read_reference_name(r) == '1' def test_generate_fetch_bins_single(self): - self.assertEqual([(1, 100)], BamCache._generate_fetch_bins(1, 100, 1, 1)) + assert BamCache._generate_fetch_bins(1, 100, 1, 1) == [(1, 100)] def test_generate_fetch_bins_multi(self): - self.assertEqual([(1, 50), (51, 100)], BamCache._generate_fetch_bins(1, 100, 2, 1)) - self.assertEqual( - [(1, 20), (21, 40), (41, 60), (61, 80), (81, 100)], - BamCache._generate_fetch_bins(1, 100, 5, 1), - ) + assert BamCache._generate_fetch_bins(1, 100, 2, 1) == [(1, 50), (51, 100)] + assert BamCache._generate_fetch_bins(1, 100, 5, 1) == [ + (1, 20), + (21, 40), + (41, 60), + (61, 80), + (81, 100), + ] def test_generate_fetch_bins_large_min_size(self): - self.assertEqual([(1, 50), (51, 100)], BamCache._generate_fetch_bins(1, 100, 5, 50)) + assert BamCache._generate_fetch_bins(1, 100, 5, 50) == [(1, 50), (51, 100)] def test_fetch_single_read(self): b = BamCache(get_data('mini_mock_reads_for_events.sorted.bam')) s = b.fetch_from_bins('reference3', 1382, 1383, read_limit=1, sample_bins=1) - self.assertEqual(1, len(s)) + assert len(s) == 1 r = list(s)[0] - self.assertEqual('HISEQX1_11:4:2122:14275:37717:split', r.qname) + assert r.qname == 'HISEQX1_11:4:2122:14275:37717:split' b.close() def test_get_mate(self): # dependant on fetch working b = BamCache(get_data('mini_mock_reads_for_events.sorted.bam')) s = b.fetch_from_bins('reference3', 1382, 1383, read_limit=1, sample_bins=1) - self.assertEqual(1, len(s)) + assert len(s) == 1 r = list(s)[0] - self.assertEqual('HISEQX1_11:4:2122:14275:37717:split', r.qname) + assert r.qname == 'HISEQX1_11:4:2122:14275:37717:split' o = 
b.get_mate(r, allow_file_access=True) - self.assertEqual(1, len(o)) - self.assertEqual('HISEQX1_11:4:2122:14275:37717:split', o[0].qname) + assert len(o) == 1 + assert o[0].qname == 'HISEQX1_11:4:2122:14275:37717:split' -class TestModule(unittest.TestCase): +class TestModule: """ test class for functions in the validate namespace that are not associated with a class """ def test_alphabet_matching(self): - self.assertTrue(DNA_ALPHABET.match('N', 'A')) - self.assertTrue(DNA_ALPHABET.match('A', 'N')) + assert DNA_ALPHABET.match('N', 'A') + assert DNA_ALPHABET.match('A', 'N') def test_breakpoint_pos(self): # ==========+++++++++> r = MockRead(reference_start=10, cigar=[(CIGAR.M, 10), (CIGAR.S, 10)]) - self.assertEqual(19, _read.breakpoint_pos(r)) + assert _read.breakpoint_pos(r) == 19 - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): breakpoint_pos(r, ORIENT.RIGHT) - self.assertEqual(19, _read.breakpoint_pos(r, ORIENT.LEFT)) + assert _read.breakpoint_pos(r, ORIENT.LEFT) == 19 # ++++++++++=========> r = MockRead(reference_start=10, cigar=[(CIGAR.S, 10), (CIGAR.M, 10)]) - self.assertEqual(10, _read.breakpoint_pos(r)) + assert _read.breakpoint_pos(r) == 10 - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): breakpoint_pos(r, ORIENT.LEFT) - self.assertEqual(10, _read.breakpoint_pos(r, ORIENT.RIGHT)) + assert _read.breakpoint_pos(r, ORIENT.RIGHT) == 10 - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): r = MockRead(reference_start=10, cigar=[(CIGAR.X, 10), (CIGAR.M, 10)]) _read.breakpoint_pos(r, ORIENT.LEFT) @@ -188,7 +183,7 @@ def test_nsb_align(self): # GATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTG -class TestNsbAlign(unittest.TestCase): +class TestNsbAlign: def test_length_seq_le_ref(self): ref = ( 
'GATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAG' @@ -199,9 +194,9 @@ def test_length_seq_le_ref(self): 'TGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCACGACCAGGAAC' ) alignment = _read.nsb_align(ref, seq) - self.assertEqual(1, len(alignment)) + assert len(alignment) == 1 alignment = _read.nsb_align(ref, seq, min_consecutive_match=20) - self.assertEqual(0, len(alignment)) + assert len(alignment) == 0 def test_length_ref_le_seq(self): pass @@ -217,17 +212,17 @@ def test_long_ref_seq(self): 'CGCAGCTACTCAGGAGATCGGAAG' ) alignment = _read.nsb_align(ref, seq, min_consecutive_match=6) - self.assertEqual(1, len(alignment)) + assert len(alignment) == 1 def test_left_softclipping(self): ref = 'TAAGCTTCTTCCTTTTTCTATGCCACCTACATAGGCATTTTGCATGGTCAGATTGGAATTTACATAATGCATACATGCAAAGAAATATATAGAAGCCAGATATATAAGGTAGTACATTGGCAGGCTTCATATATATAGACTCCCCCATATTGTCTATATGCTAAAAAAGTATTTTAAATCCTTAAATTTTATTTTTGTTCTCTGCATTTGAAATCTTTATCAACTAGGTCATGAAAATAGCCAGTCGGTTCTCCTTTTGGTCTATTAGAATAAAATCTGGACTGCAACTGAGAAGCAGAAGGTAATGTCAGAATGTAT' seq = 'GCTAAAAAAGTATTTTAAATCCTTAAATGTTATTTTTGTTCTC' alignments = _read.nsb_align(ref, seq, min_consecutive_match=6) - self.assertEqual(1, len(alignments)) + assert len(alignments) == 1 print(alignments) seq = 'CTTATAAAGCTGGAGTATCTGCTGAGAGCATCAGGAATTGACATCTAGGATAATGAGAGAAGGCTGATCATGGACAACATATAGCCTTTCTAGTAGATGCAGCTGAGGCTAAAAAAGTATTTTAAATCCTTAAATGTTATTTTTGTTCTC' alignments = _read.nsb_align(ref, seq, min_consecutive_match=6, min_overlap_percent=0.5) - self.assertEqual(1, len(alignments)) + assert len(alignments) == 1 def test_min_overlap(self): ref = 'ATTACATTAAAGATTCAAACTCCTAGAGTTTTTTTGATTTTTAGTATGATCTTTAGATAAAAAAAAAGGAAGAAAAAGAAAAAAAAACAGAGTCTATTAAGGCATCTTCTATGGTCAGATATATCTATTTTTTTCTTTCTTTTTTTTACTTTCATTAAGTGCCACTAAAAAATTAGGTTCAATTAAACTTTATTAATCTCTTCTGAGTTTTGAT' @@ -240,193 +235,201 @@ def test_min_overlap(self): min_overlap_percent=(len(seq) - 15) / len(seq), ) print(alignments) - self.assertEqual(0, len(alignments)) - - -class 
TestReadPairStrand(unittest.TestCase): - def setUp(self): - self.read1_pos_neg = MockRead(is_reverse=False, is_read1=True, mate_is_reverse=True) - assert not self.read1_pos_neg.is_read2 - self.read1_neg_pos = MockRead(is_reverse=True, is_read1=True, mate_is_reverse=False) - self.read1_pos_pos = MockRead(is_reverse=False, is_read1=True, mate_is_reverse=False) - self.read1_neg_neg = MockRead(is_reverse=True, is_read1=True, mate_is_reverse=True) - - self.read2_pos_neg = MockRead(is_reverse=True, is_read1=False, mate_is_reverse=True) - assert self.read2_pos_neg.is_read2 - self.read2_neg_pos = MockRead(is_reverse=False, is_read1=False, mate_is_reverse=False) - self.read2_pos_pos = MockRead(is_reverse=False, is_read1=False, mate_is_reverse=False) - self.read2_neg_neg = MockRead(is_reverse=True, is_read1=False, mate_is_reverse=True) - - self.unpaired_pos = MockRead(is_reverse=False, is_paired=False) - self.unpaired_neg = MockRead(is_reverse=True, is_paired=False) - - def test_read_pair_strand_det1_read1(self): - self.assertEqual( - STRAND.POS, sequenced_strand(self.read1_pos_neg, strand_determining_read=1) - ) - self.assertEqual( - STRAND.NEG, sequenced_strand(self.read1_neg_pos, strand_determining_read=1) - ) - self.assertEqual( - STRAND.POS, sequenced_strand(self.read1_pos_pos, strand_determining_read=1) - ) - self.assertEqual( - STRAND.NEG, sequenced_strand(self.read1_neg_neg, strand_determining_read=1) - ) + assert len(alignments) == 0 - def test_read_pair_strand_det1_read2(self): - self.assertEqual( - STRAND.POS, sequenced_strand(self.read2_pos_neg, strand_determining_read=1) - ) - self.assertEqual( - STRAND.NEG, sequenced_strand(self.read2_neg_pos, strand_determining_read=1) - ) - self.assertEqual( - STRAND.NEG, sequenced_strand(self.read2_pos_pos, strand_determining_read=1) - ) - self.assertEqual( - STRAND.POS, sequenced_strand(self.read2_neg_neg, strand_determining_read=1) - ) - def test_read_pair_strand_det2_read2(self): - self.assertEqual( - STRAND.NEG, 
sequenced_strand(self.read2_pos_neg, strand_determining_read=2) - ) - self.assertEqual( - STRAND.POS, sequenced_strand(self.read2_neg_pos, strand_determining_read=2) - ) - self.assertEqual( - STRAND.POS, sequenced_strand(self.read2_pos_pos, strand_determining_read=2) - ) - self.assertEqual( - STRAND.NEG, sequenced_strand(self.read2_neg_neg, strand_determining_read=2) - ) +@pytest.fixture +def stranded_reads(): + n = argparse.Namespace() + n.read1_pos_neg = MockRead(is_reverse=False, is_read1=True, mate_is_reverse=True) + assert not n.read1_pos_neg.is_read2 + n.read1_neg_pos = MockRead(is_reverse=True, is_read1=True, mate_is_reverse=False) + n.read1_pos_pos = MockRead(is_reverse=False, is_read1=True, mate_is_reverse=False) + n.read1_neg_neg = MockRead(is_reverse=True, is_read1=True, mate_is_reverse=True) - def test_read_pair_strand_det2_read1(self): - self.assertEqual( - STRAND.NEG, sequenced_strand(self.read1_pos_neg, strand_determining_read=2) - ) - self.assertEqual( - STRAND.POS, sequenced_strand(self.read1_neg_pos, strand_determining_read=2) - ) - self.assertEqual( - STRAND.NEG, sequenced_strand(self.read1_pos_pos, strand_determining_read=2) - ) - self.assertEqual( - STRAND.POS, sequenced_strand(self.read1_neg_neg, strand_determining_read=2) - ) + n.read2_pos_neg = MockRead(is_reverse=True, is_read1=False, mate_is_reverse=True) + assert n.read2_pos_neg.is_read2 + n.read2_neg_pos = MockRead(is_reverse=False, is_read1=False, mate_is_reverse=False) + n.read2_pos_pos = MockRead(is_reverse=False, is_read1=False, mate_is_reverse=False) + n.read2_neg_neg = MockRead(is_reverse=True, is_read1=False, mate_is_reverse=True) - def test_read_pair_strand_unpaired(self): - with self.assertRaises(ValueError): - sequenced_strand(self.unpaired_pos) - with self.assertRaises(ValueError): - sequenced_strand(self.unpaired_neg) - - def test_read_pair_strand_det_error(self): - with self.assertRaises(ValueError): - sequenced_strand(self.read1_pos_neg, strand_determining_read=3) - - 
-class TestReadPairType(unittest.TestCase): - def setUp(self): - self.LR = MockRead( - reference_id=0, - next_reference_id=0, - reference_start=1, - next_reference_start=2, - is_reverse=False, - mate_is_reverse=True, - ) - self.LL = MockRead( - reference_id=0, - next_reference_id=0, - reference_start=1, - next_reference_start=2, - is_reverse=False, - mate_is_reverse=False, - ) - self.RR = MockRead( - reference_id=0, - next_reference_id=0, - reference_start=1, - next_reference_start=2, - is_reverse=True, - mate_is_reverse=True, + n.unpaired_pos = MockRead(is_reverse=False, is_paired=False) + n.unpaired_neg = MockRead(is_reverse=True, is_paired=False) + return n + + +class TestReadPairStrand: + def test_read_pair_strand_det1_read1(self, stranded_reads): + assert ( + sequenced_strand(stranded_reads.read1_pos_neg, strand_determining_read=1) == STRAND.POS + ) + assert ( + sequenced_strand(stranded_reads.read1_neg_pos, strand_determining_read=1) == STRAND.NEG + ) + assert ( + sequenced_strand(stranded_reads.read1_pos_pos, strand_determining_read=1) == STRAND.POS + ) + assert ( + sequenced_strand(stranded_reads.read1_neg_neg, strand_determining_read=1) == STRAND.NEG + ) + + def test_read_pair_strand_det1_read2(self, stranded_reads): + assert ( + sequenced_strand(stranded_reads.read2_pos_neg, strand_determining_read=1) == STRAND.POS ) - self.RL = MockRead( - reference_id=0, - next_reference_id=0, - reference_start=1, - next_reference_start=2, - is_reverse=True, - mate_is_reverse=False, - ) - - def test_read_pair_type_LR(self): - self.assertEqual(READ_PAIR_TYPE.LR, read_pair_type(self.LR)) - - def test_read_pair_type_LL(self): - self.assertEqual(READ_PAIR_TYPE.LL, read_pair_type(self.LL)) - - def test_read_pair_type_RR(self): - self.assertEqual(READ_PAIR_TYPE.RR, read_pair_type(self.RR)) - - def test_read_pair_type_RL(self): - self.assertEqual(READ_PAIR_TYPE.RL, read_pair_type(self.RL)) - - def test_orientation_supports_type_deletion(self): - 
self.assertTrue(orientation_supports_type(self.LR, SVTYPE.DEL)) - self.assertFalse(orientation_supports_type(self.RL, SVTYPE.DEL)) - self.assertFalse(orientation_supports_type(self.LL, SVTYPE.DEL)) - self.assertFalse(orientation_supports_type(self.RR, SVTYPE.DEL)) - - def test_orientation_supports_type_insertion(self): - self.assertTrue(orientation_supports_type(self.LR, SVTYPE.INS)) - self.assertFalse(orientation_supports_type(self.RL, SVTYPE.INS)) - self.assertFalse(orientation_supports_type(self.LL, SVTYPE.INS)) - self.assertFalse(orientation_supports_type(self.RR, SVTYPE.INS)) - - def test_orientation_supports_type_inversion(self): - self.assertFalse(orientation_supports_type(self.LR, SVTYPE.INV)) - self.assertFalse(orientation_supports_type(self.RL, SVTYPE.INV)) - self.assertTrue(orientation_supports_type(self.LL, SVTYPE.INV)) - self.assertTrue(orientation_supports_type(self.RR, SVTYPE.INV)) - - def test_orientation_supports_type_translocation_inversion(self): - self.assertFalse(orientation_supports_type(self.LR, SVTYPE.ITRANS)) - self.assertFalse(orientation_supports_type(self.RL, SVTYPE.ITRANS)) - self.assertTrue(orientation_supports_type(self.LL, SVTYPE.ITRANS)) - self.assertTrue(orientation_supports_type(self.RR, SVTYPE.ITRANS)) - - def test_orientation_supports_type_trans_duplication(self): - self.assertFalse(orientation_supports_type(self.LR, SVTYPE.DUP)) - self.assertTrue(orientation_supports_type(self.RL, SVTYPE.DUP)) - self.assertFalse(orientation_supports_type(self.LL, SVTYPE.DUP)) - self.assertFalse(orientation_supports_type(self.RR, SVTYPE.DUP)) - - def test_orientation_supports_type_translocation(self): - self.assertTrue(orientation_supports_type(self.LR, SVTYPE.TRANS)) - self.assertTrue(orientation_supports_type(self.RL, SVTYPE.TRANS)) - self.assertFalse(orientation_supports_type(self.LL, SVTYPE.TRANS)) - self.assertFalse(orientation_supports_type(self.RR, SVTYPE.TRANS)) - - -class TestHistogram(unittest.TestCase): + assert ( + 
sequenced_strand(stranded_reads.read2_neg_pos, strand_determining_read=1) == STRAND.NEG + ) + assert ( + sequenced_strand(stranded_reads.read2_pos_pos, strand_determining_read=1) == STRAND.NEG + ) + assert ( + sequenced_strand(stranded_reads.read2_neg_neg, strand_determining_read=1) == STRAND.POS + ) + + def test_read_pair_strand_det2_read2(self, stranded_reads): + assert ( + sequenced_strand(stranded_reads.read2_pos_neg, strand_determining_read=2) == STRAND.NEG + ) + assert ( + sequenced_strand(stranded_reads.read2_neg_pos, strand_determining_read=2) == STRAND.POS + ) + assert ( + sequenced_strand(stranded_reads.read2_pos_pos, strand_determining_read=2) == STRAND.POS + ) + assert ( + sequenced_strand(stranded_reads.read2_neg_neg, strand_determining_read=2) == STRAND.NEG + ) + + def test_read_pair_strand_det2_read1(self, stranded_reads): + assert ( + sequenced_strand(stranded_reads.read1_pos_neg, strand_determining_read=2) == STRAND.NEG + ) + assert ( + sequenced_strand(stranded_reads.read1_neg_pos, strand_determining_read=2) == STRAND.POS + ) + assert ( + sequenced_strand(stranded_reads.read1_pos_pos, strand_determining_read=2) == STRAND.NEG + ) + assert ( + sequenced_strand(stranded_reads.read1_neg_neg, strand_determining_read=2) == STRAND.POS + ) + + def test_read_pair_strand_unpaired(self, stranded_reads): + with pytest.raises(ValueError): + sequenced_strand(stranded_reads.unpaired_pos) + with pytest.raises(ValueError): + sequenced_strand(stranded_reads.unpaired_neg) + + def test_read_pair_strand_det_error(self, stranded_reads): + with pytest.raises(ValueError): + sequenced_strand(stranded_reads.read1_pos_neg, strand_determining_read=3) + + +@pytest.fixture +def read_pairs(): + n = argparse.Namespace() + n.LR = MockRead( + reference_id=0, + next_reference_id=0, + reference_start=1, + next_reference_start=2, + is_reverse=False, + mate_is_reverse=True, + ) + n.LL = MockRead( + reference_id=0, + next_reference_id=0, + reference_start=1, + next_reference_start=2, + 
is_reverse=False, + mate_is_reverse=False, + ) + n.RR = MockRead( + reference_id=0, + next_reference_id=0, + reference_start=1, + next_reference_start=2, + is_reverse=True, + mate_is_reverse=True, + ) + n.RL = MockRead( + reference_id=0, + next_reference_id=0, + reference_start=1, + next_reference_start=2, + is_reverse=True, + mate_is_reverse=False, + ) + return n + + +class TestReadPairType: + def test_read_pair_type_LR(self, read_pairs): + assert read_pair_type(read_pairs.LR) == READ_PAIR_TYPE.LR + + def test_read_pair_type_LL(self, read_pairs): + assert read_pair_type(read_pairs.LL) == READ_PAIR_TYPE.LL + + def test_read_pair_type_RR(self, read_pairs): + assert read_pair_type(read_pairs.RR) == READ_PAIR_TYPE.RR + + def test_read_pair_type_RL(self, read_pairs): + assert read_pair_type(read_pairs.RL) == READ_PAIR_TYPE.RL + + def test_orientation_supports_type_deletion(self, read_pairs): + assert orientation_supports_type(read_pairs.LR, SVTYPE.DEL) + assert not orientation_supports_type(read_pairs.RL, SVTYPE.DEL) + assert not orientation_supports_type(read_pairs.LL, SVTYPE.DEL) + assert not orientation_supports_type(read_pairs.RR, SVTYPE.DEL) + + def test_orientation_supports_type_insertion(self, read_pairs): + assert orientation_supports_type(read_pairs.LR, SVTYPE.INS) + assert not orientation_supports_type(read_pairs.RL, SVTYPE.INS) + assert not orientation_supports_type(read_pairs.LL, SVTYPE.INS) + assert not orientation_supports_type(read_pairs.RR, SVTYPE.INS) + + def test_orientation_supports_type_inversion(self, read_pairs): + assert not orientation_supports_type(read_pairs.LR, SVTYPE.INV) + assert not orientation_supports_type(read_pairs.RL, SVTYPE.INV) + assert orientation_supports_type(read_pairs.LL, SVTYPE.INV) + assert orientation_supports_type(read_pairs.RR, SVTYPE.INV) + + def test_orientation_supports_type_translocation_inversion(self, read_pairs): + assert not orientation_supports_type(read_pairs.LR, SVTYPE.ITRANS) + assert not 
orientation_supports_type(read_pairs.RL, SVTYPE.ITRANS) + assert orientation_supports_type(read_pairs.LL, SVTYPE.ITRANS) + assert orientation_supports_type(read_pairs.RR, SVTYPE.ITRANS) + + def test_orientation_supports_type_trans_duplication(self, read_pairs): + assert not orientation_supports_type(read_pairs.LR, SVTYPE.DUP) + assert orientation_supports_type(read_pairs.RL, SVTYPE.DUP) + assert not orientation_supports_type(read_pairs.LL, SVTYPE.DUP) + assert not orientation_supports_type(read_pairs.RR, SVTYPE.DUP) + + def test_orientation_supports_type_translocation(self, read_pairs): + assert orientation_supports_type(read_pairs.LR, SVTYPE.TRANS) + assert orientation_supports_type(read_pairs.RL, SVTYPE.TRANS) + assert not orientation_supports_type(read_pairs.LL, SVTYPE.TRANS) + assert not orientation_supports_type(read_pairs.RR, SVTYPE.TRANS) + + +class TestHistogram: def test_add(self): h = Histogram() h.add(1) h.add(1) - self.assertEqual(2, h[1]) + assert h[1] == 2 h.add(1, 4) - self.assertEqual(6, h[1]) + assert h[1] == 6 def test_median(self): h = Histogram() for i in range(1, 11): h.add(i) - self.assertEqual(5.5, h.median()) + assert h.median() == 5.5 h.add(11) - self.assertEqual(6, h.median()) + assert h.median() == 6 def test_distib_stderr(self): h = Histogram() @@ -435,9 +438,9 @@ def test_distib_stderr(self): for i in range(4, 8): h.add(i) m = h.median() - self.assertEqual(5, m) + assert m == 5 err = h.distribution_stderr(m, 1) - self.assertEqual(116 / 15, err) + assert err == 116 / 15 def test_add_operator(self): x = Histogram() @@ -445,19 +448,19 @@ def test_add_operator(self): x.add(1) y.add(1, 4) z = x + y - self.assertEqual(1, x[1]) - self.assertEqual(4, y[1]) - self.assertEqual(5, z[1]) + assert x[1] == 1 + assert y[1] == 4 + assert z[1] == 5 -class TestBamStats(unittest.TestCase): +class TestBamStats: def test_genome_bam_stats(self): bamfh = BamCache(get_data('mock_reads_for_events.sorted.bam')) stats = compute_genome_bam_stats( bamfh, 1000, 100, 
min_mapping_quality=1, sample_cap=10000, distribution_fraction=0.99 ) - self.assertGreaterEqual(50, abs(stats.median_fragment_size - 420)) - self.assertEqual(150, stats.read_length) + assert 50 >= abs(stats.median_fragment_size - 420) + assert stats.read_length == 150 bamfh.close() def test_trans_bam_stats(self): @@ -472,35 +475,37 @@ def test_trans_bam_stats(self): sample_cap=10000, distribution_fraction=0.99, ) - self.assertTrue(abs(stats.median_fragment_size - 185) < 5) - self.assertEqual(75, stats.read_length) - self.assertTrue(stats.stdev_fragment_size < 50) + assert abs(stats.median_fragment_size - 185) < 5 + assert stats.read_length == 75 + assert stats.stdev_fragment_size < 50 bamfh.close() -class TestMapRefRangeToQueryRange(unittest.TestCase): - def setUp(self): - self.contig_read = MockRead( - cigar=_cigar.convert_string_to_cigar('275M18I12041D278M'), - reference_start=89700025, - reference_name='10', - ) +@pytest.fixture +def contig_read(): + return MockRead( + cigar=_cigar.convert_string_to_cigar('275M18I12041D278M'), + reference_start=89700025, + reference_name='10', + ) + - def test_full_aligned_portion(self): +class TestMapRefRangeToQueryRange: + def test_full_aligned_portion(self, contig_read): ref_range = Interval(89700026, 89712619) - qrange = _read.map_ref_range_to_query_range(self.contig_read, ref_range) - self.assertEqual(571, len(qrange)) - self.assertEqual(1, qrange.start) - self.assertEqual(571, qrange.end) + qrange = _read.map_ref_range_to_query_range(contig_read, ref_range) + assert len(qrange) == 571 + assert qrange.start == 1 + assert qrange.end == 571 - def test_multiple_events(self): + def test_multiple_events(self, contig_read): ref_range = Interval(89700067, 89712347) - qrange = _read.map_ref_range_to_query_range(self.contig_read, ref_range) - self.assertEqual(len(ref_range) - 12041 + 18, len(qrange)) + qrange = _read.map_ref_range_to_query_range(contig_read, ref_range) + assert len(qrange) == len(ref_range) - 12041 + 18 - def 
test_no_events(self): + def test_no_events(self, contig_read): ref_range = Interval(89700031, 89700040) - qrange = _read.map_ref_range_to_query_range(self.contig_read, ref_range) - self.assertEqual(10, len(qrange)) - self.assertEqual(6, qrange.start) - self.assertEqual(15, qrange.end) + qrange = _read.map_ref_range_to_query_range(contig_read, ref_range) + assert len(qrange) == 10 + assert qrange.start == 6 + assert qrange.end == 15 diff --git a/tests/integration/test_bam_cigar.py b/tests/integration/test_bam_cigar.py index 6a71e54e..1d0c49d8 100644 --- a/tests/integration/test_bam_cigar.py +++ b/tests/integration/test_bam_cigar.py @@ -1,7 +1,9 @@ -import unittest import warnings +import pytest +import timeout_decorator from mavis.annotate.file_io import load_reference_genome +from mavis.bam import read as _read from mavis.bam.cigar import ( alignment_matches, compute, @@ -16,14 +18,11 @@ recompute_cigar_mismatch, score, ) -from mavis.constants import CIGAR from mavis.bam.read import SamRead -from mavis.bam import read as _read -import timeout_decorator +from mavis.constants import CIGAR -from . import MockRead, MockObject from ..util import get_data - +from . 
import MockObject, MockRead REFERENCE_GENOME = None @@ -39,14 +38,14 @@ def setUpModule(): raise AssertionError('fake genome file does not have the expected contents') -class TestRecomputeCigarMismatch(unittest.TestCase): +class TestRecomputeCigarMismatch: def test_simple(self): r = MockRead( reference_start=1456, query_sequence='CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', cigar=[(CIGAR.M, 45)], ) - self.assertEqual([(CIGAR.EQ, 45)], recompute_cigar_mismatch(r, REFERENCE_GENOME['fake'])) + assert recompute_cigar_mismatch(r, REFERENCE_GENOME['fake']) == [(CIGAR.EQ, 45)] def test_hardclipping(self): r = MockRead( @@ -54,8 +53,8 @@ def test_hardclipping(self): query_sequence='CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', cigar=[(CIGAR.H, 20), (CIGAR.M, 45)], ) - self.assertEqual( - [(CIGAR.H, 20), (CIGAR.EQ, 45)], recompute_cigar_mismatch(r, REFERENCE_GENOME['fake']) + assert [(CIGAR.H, 20), (CIGAR.EQ, 45)] == recompute_cigar_mismatch( + r, REFERENCE_GENOME['fake'] ) def test_with_events(self): @@ -64,10 +63,13 @@ def test_with_events(self): query_sequence='TATA' 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', cigar=[(CIGAR.S, 4), (CIGAR.M, 10), (CIGAR.D, 10), (CIGAR.I, 10), (CIGAR.M, 25)], ) - self.assertEqual( - [(CIGAR.S, 4), (CIGAR.EQ, 10), (CIGAR.D, 10), (CIGAR.I, 10), (CIGAR.EQ, 25)], - recompute_cigar_mismatch(r, REFERENCE_GENOME['fake']), - ) + assert [ + (CIGAR.S, 4), + (CIGAR.EQ, 10), + (CIGAR.D, 10), + (CIGAR.I, 10), + (CIGAR.EQ, 25), + ] == recompute_cigar_mismatch(r, REFERENCE_GENOME['fake']) def test_mismatch_to_mismatch(self): r = MockRead( @@ -75,10 +77,13 @@ def test_mismatch_to_mismatch(self): query_sequence='CAGC' 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', cigar=[(CIGAR.X, 4), (CIGAR.M, 10), (CIGAR.D, 10), (CIGAR.I, 10), (CIGAR.M, 25)], ) - self.assertEqual( - [(CIGAR.X, 4), (CIGAR.EQ, 10), (CIGAR.D, 10), (CIGAR.I, 10), (CIGAR.EQ, 25)], - recompute_cigar_mismatch(r, 
REFERENCE_GENOME['fake']), - ) + assert [ + (CIGAR.X, 4), + (CIGAR.EQ, 10), + (CIGAR.D, 10), + (CIGAR.I, 10), + (CIGAR.EQ, 25), + ] == recompute_cigar_mismatch(r, REFERENCE_GENOME['fake']) def test_m_to_mismatch(self): r = MockRead( @@ -86,38 +91,39 @@ def test_m_to_mismatch(self): query_sequence='CAGC' 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', cigar=[(CIGAR.M, 14), (CIGAR.D, 10), (CIGAR.I, 10), (CIGAR.M, 25)], ) - self.assertEqual( - [(CIGAR.X, 4), (CIGAR.EQ, 10), (CIGAR.D, 10), (CIGAR.I, 10), (CIGAR.EQ, 25)], - recompute_cigar_mismatch(r, REFERENCE_GENOME['fake']), - ) + assert [ + (CIGAR.X, 4), + (CIGAR.EQ, 10), + (CIGAR.D, 10), + (CIGAR.I, 10), + (CIGAR.EQ, 25), + ] == recompute_cigar_mismatch(r, REFERENCE_GENOME['fake']) -class TestExtendSoftclipping(unittest.TestCase): +class TestExtendSoftclipping: def test_softclipped_right(self): c = convert_string_to_cigar('70=2X1=8X4=1X1=4X1=6X1=4X1=4X2=5X3=3X1=4X1=3X1=14X1=1X2=1S') cnew, prefix = extend_softclipping(c, 6) - self.assertEqual(0, prefix) - self.assertEqual(convert_string_to_cigar('70=80S'), cnew) + assert prefix == 0 + assert cnew == convert_string_to_cigar('70=80S') -class TestCigarTools(unittest.TestCase): +class TestCigarTools: def test_alignment_matches(self): c = [(CIGAR.M, 10), (CIGAR.EQ, 10), (CIGAR.X, 10)] - self.assertEqual(30, alignment_matches(c)) + assert alignment_matches(c) == 30 def test_join(self): c = [(CIGAR.M, 10), (CIGAR.X, 10), (CIGAR.X, 10)] - self.assertEqual([(CIGAR.M, 10), (CIGAR.X, 20)], join(c)) + assert join(c) == [(CIGAR.M, 10), (CIGAR.X, 20)] k = [(CIGAR.X, 10), (CIGAR.M, 10), (CIGAR.X, 10)] - self.assertEqual([(CIGAR.M, 10), (CIGAR.X, 30), (CIGAR.M, 10), (CIGAR.X, 10)], join(c, k)) + assert join(c, k) == [(CIGAR.M, 10), (CIGAR.X, 30), (CIGAR.M, 10), (CIGAR.X, 10)] k = [(4, 1), (4, 2), (7, 5), (8, 7), (7, 2), (8, 5), (7, 28), (8, 1), (7, 99)] - self.assertEqual( - [(4, 3), (7, 5), (8, 7), (7, 2), (8, 5), (7, 28), (8, 1), (7, 99)], join(k) - ) + assert [(4, 
3), (7, 5), (8, 7), (7, 2), (8, 5), (7, 28), (8, 1), (7, 99)] == join(k) def test_join_hardclipping(self): c = [(CIGAR.H, 10), (CIGAR.M, 10), (CIGAR.X, 10), (CIGAR.X, 10)] - self.assertEqual([(CIGAR.H, 10), (CIGAR.M, 10), (CIGAR.X, 20)], join(c)) + assert join(c) == [(CIGAR.H, 10), (CIGAR.M, 10), (CIGAR.X, 20)] def test_longest_fuzzy_match(self): c = [ @@ -128,10 +134,10 @@ def test_longest_fuzzy_match(self): (CIGAR.I, 3), (CIGAR.EQ, 5), ] - self.assertEqual(15, longest_fuzzy_match(c, 1)) - self.assertEqual(10, longest_fuzzy_match(c, 0)) - self.assertEqual(16, longest_fuzzy_match(c, 2)) - self.assertEqual(16, longest_fuzzy_match(c, 4)) + assert longest_fuzzy_match(c, 1) == 15 + assert longest_fuzzy_match(c, 0) == 10 + assert longest_fuzzy_match(c, 2) == 16 + assert longest_fuzzy_match(c, 4) == 16 def test_score(self): c = [ @@ -142,10 +148,10 @@ def test_score(self): (CIGAR.I, 3), (CIGAR.EQ, 5), ] - self.assertEqual(22, score(c)) + assert score(c) == 22 def test_score_error(self): - with self.assertRaises(AssertionError): + with pytest.raises(AssertionError): c = [(CIGAR.S, 10), (CIGAR.EQ, 1), (CIGAR.X, 4), (99, 10), (CIGAR.I, 3), (CIGAR.EQ, 5)] score(c) @@ -158,56 +164,50 @@ def test_match_percent(self): (CIGAR.I, 3), (CIGAR.EQ, 5), ] - self.assertEqual(0.8, match_percent(c)) - with self.assertRaises(AttributeError): + assert match_percent(c) == 0.8 + with pytest.raises(AttributeError): match_percent([(CIGAR.M, 100)]) - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): match_percent([(CIGAR.S, 100)]) def test_compute(self): # GTGAGTAAATTCAACATCGTTTTT # aacttagAATTCAAC--------- - self.assertEqual( - ([(CIGAR.S, 7), (CIGAR.EQ, 8)], 7), - compute('GTGAGTAAATTCAACATCGTTTTT', 'AACTTAGAATTCAAC---------'), + assert ([(CIGAR.S, 7), (CIGAR.EQ, 8)], 7) == compute( + 'GTGAGTAAATTCAACATCGTTTTT', 'AACTTAGAATTCAAC---------' ) - self.assertEqual( - ([(CIGAR.S, 5), (CIGAR.EQ, 8)], 7), - compute('GTGAGTAAATTCAACATCGTTTTT', '--CTTAGAATTCAAC---------'), + 
assert ([(CIGAR.S, 5), (CIGAR.EQ, 8)], 7) == compute( + 'GTGAGTAAATTCAACATCGTTTTT', '--CTTAGAATTCAAC---------' ) - self.assertEqual( - ([(CIGAR.S, 5), (CIGAR.EQ, 8)], 7), - compute('GTGAGTAAATTCAACATCGTTTTT', '--CTTAGAATTCAAC---------', False), + assert ([(CIGAR.S, 5), (CIGAR.EQ, 8)], 7) == compute( + 'GTGAGTAAATTCAACATCGTTTTT', '--CTTAGAATTCAAC---------', False ) - self.assertEqual( - ([(CIGAR.S, 5), (CIGAR.EQ, 5), (CIGAR.I, 2), (CIGAR.EQ, 1)], 7), - compute('GTGAGTAAATTC--CATCGTTTTT', '--CTTAGAATTCAAC---------', False), + assert ([(CIGAR.S, 5), (CIGAR.EQ, 5), (CIGAR.I, 2), (CIGAR.EQ, 1)], 7) == compute( + 'GTGAGTAAATTC--CATCGTTTTT', '--CTTAGAATTCAAC---------', False ) - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): compute('CCTG', 'CCG') - self.assertEqual( - ([(CIGAR.EQ, 2), (CIGAR.X, 2)], 0), - compute('CCTG', 'CCGT', min_exact_to_stop_softclipping=10), + assert ([(CIGAR.EQ, 2), (CIGAR.X, 2)], 0) == compute( + 'CCTG', 'CCGT', min_exact_to_stop_softclipping=10 ) - self.assertEqual( - ([(CIGAR.S, 5), (CIGAR.EQ, 8)], 5), - compute('--GAGTAAATTCAACATCGTTTTT', '--CTTAGAATTCAAC---------', False), + assert ([(CIGAR.S, 5), (CIGAR.EQ, 8)], 5) == compute( + '--GAGTAAATTCAACATCGTTTTT', '--CTTAGAATTCAAC---------', False ) def test_convert_for_igv(self): c = [(CIGAR.M, 10), (CIGAR.EQ, 10), (CIGAR.X, 10)] - self.assertEqual([(CIGAR.M, 30)], convert_for_igv(c)) + assert convert_for_igv(c) == [(CIGAR.M, 30)] -class TestHgvsStandardizeCigars(unittest.TestCase): +class TestHgvsStandardizeCigars: def no_change_aligned(self): ref = 'AAATTTGGGCCCAATT' read = MockRead('name', '1', 1, cigar=[(CIGAR.M, 10)], query_sequence='AAATTTGGGC') - self.assertEqual([(CIGAR.M, 10)], hgvs_standardize_cigar(read, ref)) + assert hgvs_standardize_cigar(read, ref) == [(CIGAR.M, 10)] def no_change_proper_indel(self): ref = 'ATAGGC' 'ATCTACGAG' 'ATCGCTACG' @@ -218,9 +218,8 @@ def no_change_proper_indel(self): query_sequence='ATCTAC' 'CCC' 'ATCG', cigar=[(CIGAR.EQ, 6), 
(CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)], ) - self.assertEqual( - [(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)], - hgvs_standardize_cigar(read, ref), + assert [(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)] == hgvs_standardize_cigar( + read, ref ) def ins_after_deletion(self): @@ -232,9 +231,8 @@ def ins_after_deletion(self): query_sequence='ATCTAC' 'CCC' 'ATCG', cigar=[(CIGAR.EQ, 6), (CIGAR.D, 3), (CIGAR.I, 3), (CIGAR.EQ, 4)], ) - self.assertEqual( - [(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)], - hgvs_standardize_cigar(read, ref), + assert [(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)] == hgvs_standardize_cigar( + read, ref ) def test_insertion_in_repeat(self): @@ -246,9 +244,7 @@ def test_insertion_in_repeat(self): query_sequence='ATCT' 'ACGA' 'ACGA' 'GATC', cigar=[(CIGAR.EQ, 4), (CIGAR.I, 4), (CIGAR.EQ, 8)], ) - self.assertEqual( - [(CIGAR.EQ, 8), (CIGAR.I, 4), (CIGAR.EQ, 4)], hgvs_standardize_cigar(read, ref) - ) + assert [(CIGAR.EQ, 8), (CIGAR.I, 4), (CIGAR.EQ, 4)] == hgvs_standardize_cigar(read, ref) def test_deletion_in_repeat(self): ref = 'ATAGGC' 'ATCT' 'ACGA' 'ACGA' 'ACGA' 'GATCGCTACG' @@ -259,9 +255,7 @@ def test_deletion_in_repeat(self): query_sequence='ATCT' 'ACGA' 'ACGA' 'GATC', cigar=[(CIGAR.EQ, 4), (CIGAR.D, 4), (CIGAR.EQ, 12)], ) - self.assertEqual( - [(CIGAR.EQ, 12), (CIGAR.D, 4), (CIGAR.EQ, 4)], hgvs_standardize_cigar(read, ref) - ) + assert [(CIGAR.EQ, 12), (CIGAR.D, 4), (CIGAR.EQ, 4)] == hgvs_standardize_cigar(read, ref) def test_bubble_sort_indel_sections(self): rseq = 'ATAGGC' 'ATCT' 'GG' 'GA' 'GCGA' 'GATCGCTACG' @@ -280,9 +274,8 @@ def test_bubble_sort_indel_sections(self): (CIGAR.EQ, 8), ], ) - self.assertEqual( - [(CIGAR.EQ, 4), (CIGAR.I, 5), (CIGAR.D, 4), (CIGAR.EQ, 8)], - hgvs_standardize_cigar(read, rseq), + assert [(CIGAR.EQ, 4), (CIGAR.I, 5), (CIGAR.D, 4), (CIGAR.EQ, 8)] == hgvs_standardize_cigar( + read, rseq ) def test_bubble_sort_indel_sections_drop_mismatch(self): @@ -304,9 
+297,8 @@ def test_bubble_sort_indel_sections_drop_mismatch(self): (CIGAR.EQ, 8), ], ) - self.assertEqual( - [(CIGAR.EQ, 4), (CIGAR.I, 5), (CIGAR.D, 8), (CIGAR.EQ, 9)], - hgvs_standardize_cigar(read, rseq), + assert [(CIGAR.EQ, 4), (CIGAR.I, 5), (CIGAR.D, 8), (CIGAR.EQ, 9)] == hgvs_standardize_cigar( + read, rseq ) def test_bubble_sort_indel_sections_drop_mismatch_with_hardclipping(self): @@ -337,10 +329,13 @@ def test_bubble_sort_indel_sections_drop_mismatch_with_hardclipping(self): print(SamRead.deletion_sequences(read, {'1': MockObject(seq=ref)})) print(SamRead.insertion_sequences(read)) print(read.query_sequence, len(read.query_sequence)) - self.assertEqual( - [(CIGAR.H, 10), (CIGAR.EQ, 4), (CIGAR.I, 6), (CIGAR.D, 5), (CIGAR.EQ, 6)], - hgvs_standardize_cigar(read, ref), - ) + assert [ + (CIGAR.H, 10), + (CIGAR.EQ, 4), + (CIGAR.I, 6), + (CIGAR.D, 5), + (CIGAR.EQ, 6), + ] == hgvs_standardize_cigar(read, ref) def test_homopolymer_even_odd(self): ref = 'ATCGAGAT' + 'A' * 15 + 'TCGAGAT' @@ -351,8 +346,8 @@ def test_homopolymer_even_odd(self): query_sequence='ATCGAGATA' + 'A' * 12 + 'TCGAGAT', cigar=[(CIGAR.EQ, 8), (CIGAR.D, 2), (CIGAR.EQ, 20)], ) - self.assertEqual( - [(CIGAR.EQ, 9 + 12), (CIGAR.D, 2), (CIGAR.EQ, 7)], hgvs_standardize_cigar(read, ref) + assert [(CIGAR.EQ, 9 + 12), (CIGAR.D, 2), (CIGAR.EQ, 7)] == hgvs_standardize_cigar( + read, ref ) ref = ( 'CCCCGGCTCATGTCTGGTTTTGTTTTCCGGGGGCGGGGGGGCTCCCTGGGGATGATGGTGATTTTTTTTTTTTTTTAATCCTCAACTAGGAGAGAAAA' @@ -370,9 +365,8 @@ def test_homopolymer_even_odd(self): ), cigar=[(CIGAR.EQ, 61), (CIGAR.I, 2), (CIGAR.EQ, 87)], ) - self.assertEqual( - [(CIGAR.EQ, 61 + 15), (CIGAR.I, 2), (CIGAR.EQ, 87 - 15)], - hgvs_standardize_cigar(read, ref), + assert [(CIGAR.EQ, 61 + 15), (CIGAR.I, 2), (CIGAR.EQ, 87 - 15)] == hgvs_standardize_cigar( + read, ref ) ref = ( @@ -391,10 +385,12 @@ def test_homopolymer_even_odd(self): ), cigar=[(CIGAR.S, 2), (CIGAR.EQ, 96), (CIGAR.I, 2), (CIGAR.EQ, 50)], ) - self.assertEqual( - [(CIGAR.S, 2), 
(CIGAR.EQ, 96 + 15), (CIGAR.I, 2), (CIGAR.EQ, 50 - 15)], - hgvs_standardize_cigar(read, ref), - ) + assert [ + (CIGAR.S, 2), + (CIGAR.EQ, 96 + 15), + (CIGAR.I, 2), + (CIGAR.EQ, 50 - 15), + ] == hgvs_standardize_cigar(read, ref) def test_even_deletion_in_repeat(self): rseq = ( @@ -421,7 +417,7 @@ def test_even_deletion_in_repeat(self): print(SamRead.deletion_sequences(read, reference_genome)) read.cigar = new_cigar print(SamRead.deletion_sequences(read, reference_genome)) - self.assertEqual(exp, new_cigar) + assert new_cigar == exp def test_odd_deletion_in_repeat(self): rseq = ( @@ -446,7 +442,7 @@ def test_odd_deletion_in_repeat(self): print(SamRead.deletion_sequences(read, reference_genome)) read.cigar = new_cigar print(SamRead.deletion_sequences(read, reference_genome)) - self.assertEqual(exp, new_cigar) + assert new_cigar == exp def test_unecessary_indel(self): rseq = 'qwertyuiopasdfghjklzxcvbnm' @@ -458,10 +454,9 @@ def test_unecessary_indel(self): cigar=convert_string_to_cigar('13=1I1D12='), query_sequence=qseq, ) - reference_genome = {'1': MockObject(seq=rseq)} exp = convert_string_to_cigar('26=') new_cigar = hgvs_standardize_cigar(read, rseq) - self.assertEqual(exp, new_cigar) + assert new_cigar == exp def test_unecessary_indel2(self): rseq = 'qwertyuiopasdfghjklzxcvbnm' @@ -473,10 +468,9 @@ def test_unecessary_indel2(self): cigar=convert_string_to_cigar('13=2I1D12='), query_sequence=qseq, ) - reference_genome = {'1': MockObject(seq=rseq)} exp = convert_string_to_cigar('14=1I12=') new_cigar = hgvs_standardize_cigar(read, rseq) - self.assertEqual(exp, new_cigar) + assert new_cigar == exp def test_unecessary_indel_end_match(self): rseq = 'qwertyuiopasdfghjklzxcvbnm' @@ -488,10 +482,9 @@ def test_unecessary_indel_end_match(self): cigar=convert_string_to_cigar('14=5I2D10='), query_sequence=qseq, ) - reference_genome = {'1': MockObject(seq=rseq)} exp = convert_string_to_cigar('14=3I12=') new_cigar = hgvs_standardize_cigar(read, rseq) - self.assertEqual(exp, 
new_cigar) + assert new_cigar == exp def test_unecessary_indel_end_match2(self): rseq = 'GGGTGCAGTGGCTTACACCT' 'GTAATCCAAACACCTTGGGAGCCGCCCCCTGAG' 'CCTCCAGGCCCGGGACAGA' @@ -503,10 +496,9 @@ def test_unecessary_indel_end_match2(self): cigar=convert_string_to_cigar('20=5I33D19='), query_sequence=qseq, ) - reference_genome = {'1': MockObject(seq=rseq)} exp = convert_string_to_cigar('20=4I32D20=') new_cigar = hgvs_standardize_cigar(read, rseq) - self.assertEqual(exp, new_cigar) + assert new_cigar == exp def test_even_insertion_in_repeat(self): rseq = ( @@ -527,11 +519,10 @@ def test_even_insertion_in_repeat(self): cigar=convert_string_to_cigar('4S13=2I66='), query_sequence=qseq, ) - reference_genome = {'1': MockObject(seq=rseq)} exp = convert_string_to_cigar('4S26=2I53=') new_cigar = hgvs_standardize_cigar(read, rseq) read.cigar = new_cigar - self.assertEqual(exp, new_cigar) + assert new_cigar == exp def test_deletion_repeat(self): qseq = ( @@ -581,7 +572,7 @@ def test_deletion_repeat(self): print(SamRead.deletion_sequences(read, REFERENCE_GENOME)) read.cigar = std_cigar print(SamRead.deletion_sequences(read, REFERENCE_GENOME)) - self.assertEqual(expected_cigar, std_cigar) + assert std_cigar == expected_cigar @timeout_decorator.timeout(1) def test_complex(self): @@ -651,7 +642,7 @@ def test_complex(self): std_cigar = hgvs_standardize_cigar(read, rseq) print(new_cigar) print(std_cigar) - self.assertEqual(new_cigar, std_cigar) + assert std_cigar == new_cigar def test_deletion_partial_repeat(self): qseq = 'ATCTTAGCCAGGT' 'AGTTACATACATATC' @@ -663,7 +654,7 @@ def test_deletion_partial_repeat(self): query_sequence=qseq, cigar=convert_string_to_cigar('13=6D15='), ) - self.assertEqual(convert_string_to_cigar('15=6D13='), hgvs_standardize_cigar(read, rseq)) + assert convert_string_to_cigar('15=6D13=') == hgvs_standardize_cigar(read, rseq) def test_indel_repeat(self): qseq = 'ATCTTAGCCAGGT' 'C' 'AGTTACATACATATC' @@ -677,7 +668,7 @@ def test_indel_repeat(self): 
query_sequence=qseq, cigar=convert_string_to_cigar('13=1I6D15='), ) - self.assertEqual(convert_string_to_cigar('13=1I6D15='), hgvs_standardize_cigar(read, rseq)) + assert convert_string_to_cigar('13=1I6D15=') == hgvs_standardize_cigar(read, rseq) def test_shift_complex_indel(self): refseq = 'ATATATCTATTTTTTTCTTTCTTTTTTTTACTTTCATTAAGTGCCACTAAAAAATTAGGTTCAATTAAACTTTATTAATCTCTTCTGAGTTTTGATTGAGTATATATATATATATACCCAGTTTCAAGCAGGTATCTGCCTTTAAAGATAAGAGACCTCCTAAATGCTTTCTTTTATTAGTTGCCCTGTTTCAGATTCAGCTTTGTATCTATATCACCTGTTAATATGTGTGGACTCACAGAAATGATCATTGAGGGAATGCACCCTGTTTGGGTGTAAGTAGCTCAGGGAAAAAATCCTAG' @@ -690,14 +681,14 @@ def test_shift_complex_indel(self): ) print(_read.convert_cigar_to_string(read.cigar)) read.cigar = recompute_cigar_mismatch(read, refseq) - self.assertEqual(convert_string_to_cigar('44=18I63=1X17=1X6='), read.cigar) + assert read.cigar == convert_string_to_cigar('44=18I63=1X17=1X6=') print(_read.convert_cigar_to_string(read.cigar)) read.cigar = hgvs_standardize_cigar(read, refseq) print(_read.convert_cigar_to_string(read.cigar)) - self.assertEqual(convert_string_to_cigar('45=18I62=1X17=1X6='), read.cigar) + assert read.cigar == convert_string_to_cigar('45=18I62=1X17=1X6=') -class TestMergeInternalEvents(unittest.TestCase): +class TestMergeInternalEvents: def test_small_exact_match(self): cigar = convert_string_to_cigar('283M17506D5M21275D596M17506D5M21275D313M') # [(0, 283), (2, 17506), (0, 5), (2, 21275), (0, 596), (2, 17506), (0, 5), (2, 21275), (0, 313)] @@ -711,10 +702,10 @@ def test_small_exact_match(self): (CIGAR.D, 17506 + 21275 + 5), (CIGAR.M, 313), ] - self.assertEqual(exp, new_cigar) + assert new_cigar == exp -class TestConvertStringToCigar(unittest.TestCase): +class TestConvertStringToCigar: def test(self): string = '283M' '17506D' '5M' '21275D' '596M' '17506D' '5M' '21275D' '313M' exp = [ @@ -728,13 +719,10 @@ def test(self): (CIGAR.D, 21275), (CIGAR.M, 313), ] - self.assertEqual(exp, convert_string_to_cigar(string)) - + assert 
convert_string_to_cigar(string) == exp -class TestGetSequences(unittest.TestCase): - def setUp(self): - self.reference_genome = {'1': MockObject(seq='abcdefghijklmnopqrstuvwxyz')} +class TestGetSequences: def test_deletions(self): exp = ['cde', 'nopq'] read = MockRead( @@ -743,7 +731,10 @@ def test_deletions(self): query_sequence='', cigar=convert_string_to_cigar('2=3D8=4D9='), ) - self.assertEqual(exp, SamRead.deletion_sequences(read, self.reference_genome)) + assert ( + SamRead.deletion_sequences(read, {'1': MockObject(seq='abcdefghijklmnopqrstuvwxyz')}) + == exp + ) def test_insertions(self): exp = ['kkk', 'kkkk'] @@ -753,4 +744,4 @@ def test_insertions(self): query_sequence='abcdekkkfghijklmnopqkkkkrstuvwxyz', cigar=convert_string_to_cigar('5=3I12=4I9='), ) - self.assertEqual(exp, SamRead.insertion_sequences(read)) + assert SamRead.insertion_sequences(read) == exp diff --git a/tests/integration/test_blat.py b/tests/integration/test_blat.py index 69ad1d68..10fe8320 100644 --- a/tests/integration/test_blat.py +++ b/tests/integration/test_blat.py @@ -1,6 +1,5 @@ -import shutil -import unittest - +import mavis.bam.cigar as _cigar +import pytest from Bio import SeqIO from mavis.align import query_coverage_interval from mavis.annotate.file_io import load_reference_genome @@ -8,11 +7,9 @@ from mavis.blat import Blat from mavis.constants import CIGAR, reverse_complement from mavis.interval import Interval -import mavis.bam.cigar as _cigar -from . import MockBamFileHandle, MockObject, MockLongString from ..util import get_data - +from . 
import MockBamFileHandle, MockLongString, MockObject REFERENCE_GENOME = None @@ -29,16 +26,18 @@ def setUpModule(): BAM_CACHE = BamCache(get_data('mini_mock_reads_for_events.sorted.bam')) -class TestBlat(unittest.TestCase): - def setUp(self): - self.cache = BamCache(MockBamFileHandle({'Y': 23, 'fake': 0, 'reference3': 3, '14': 13})) +@pytest.fixture +def cache(): + return BamCache(MockBamFileHandle({'Y': 23, 'fake': 0, 'reference3': 3, '14': 13})) + +class TestBlat: def test_read_pslx(self): mapping = {} for record in SeqIO.parse(get_data('blat_input.fa'), 'fasta'): mapping[record.id] = record.seq header, rows = Blat.read_pslx(get_data('blat_output.pslx'), mapping) - self.assertEqual(11067, len(rows)) + assert len(rows) == 11067 expect_pslx_header = [ 'match', 'mismatch', @@ -64,9 +63,9 @@ def test_read_pslx(self): 'qseqs', 'tseqs', ] - self.assertEqual(expect_pslx_header, header) + assert header == expect_pslx_header - def test_pslx_row_to_pysam_single_block(self): + def test_pslx_row_to_pysam_single_block(self, cache): pslx_row = { 'score': 20, 'tseqs': ['AATACCAAATACATGATATA'], @@ -82,11 +81,11 @@ def test_pslx_row_to_pysam_single_block(self): 'qseq_full': 'AGCCTCCCAAGTAGCTGGGACTACAGGCGCCCGCCACTACGCCCGGCTAATTTTTTGTATTTTTAGTAGAGACGGGGTTTCACCGTTTT' 'AGCCAGGATGGTCTCGATCTCCTGACCTCATGATCCGCCCGCCTCGGC', } - read = Blat.pslx_row_to_pysam(pslx_row, self.cache, None) - self.assertEqual(23, read.reference_id) - self.assertEqual(Interval(93, 112), query_coverage_interval(read)) + read = Blat.pslx_row_to_pysam(pslx_row, cache, None) + assert read.reference_id == 23 + assert query_coverage_interval(read) == Interval(93, 112) - def test_pslx_row_to_pysam_full_reverse(self): + def test_pslx_row_to_pysam_full_reverse(self, cache): pslx_row = { 'match': 128, 'mismatch': 0, @@ -114,13 +113,13 @@ def test_pslx_row_to_pysam_full_reverse(self): 'percent_ident': 100.0, 'qseq_full': 
'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT', } - read = Blat.pslx_row_to_pysam(pslx_row, self.cache, None) - self.assertEqual(3, read.reference_id) - self.assertEqual([(CIGAR.S, 117), (CIGAR.M, 128)], read.cigar) - self.assertEqual(2187, read.reference_start) - self.assertEqual(Interval(117, 244), query_coverage_interval(read)) + read = Blat.pslx_row_to_pysam(pslx_row, cache, None) + assert read.reference_id == 3 + assert read.cigar == [(CIGAR.S, 117), (CIGAR.M, 128)] + assert read.reference_start == 2187 + assert query_coverage_interval(read) == Interval(117, 244) - def test_pslx_row_to_pysam_simple(self): + def test_pslx_row_to_pysam_simple(self, cache): pslx_row = { 'tstarts': [950], 'block_sizes': [53], @@ -131,14 +130,14 @@ def test_pslx_row_to_pysam_simple(self): 'score': 0, 'qseq_full': 'ATCTAATAACTTGATCAATA' 'TCTGTGATTATATTTTCATT' 'GCCTTCCAATTTT', } - read = Blat.pslx_row_to_pysam(pslx_row, self.cache, None) - self.assertEqual(0, read.reference_id) - self.assertEqual(Interval(0, 52), query_coverage_interval(read)) - self.assertEqual(950, read.reference_start) - self.assertEqual(1003, read.reference_end) - self.assertEqual([(CIGAR.M, 53)], read.cigar) + read = Blat.pslx_row_to_pysam(pslx_row, cache, None) + assert read.reference_id == 0 + assert query_coverage_interval(read) == Interval(0, 52) + assert read.reference_start == 950 + assert read.reference_end == 1003 + assert read.cigar == [(CIGAR.M, 53)] - def test_pslx_row_to_pysam_simple_with_reference(self): + def test_pslx_row_to_pysam_simple_with_reference(self, cache): pslx_row = { 'tstarts': [950], 'block_sizes': [53], @@ -149,14 +148,14 @@ def test_pslx_row_to_pysam_simple_with_reference(self): 'score': 0, 'qseq_full': 'ATCTAATAACTTGATCAATA' 'TCTGTGATTATATTTTCATT' 'GCCTTCCAATTTT', } - read = 
Blat.pslx_row_to_pysam(pslx_row, self.cache, REFERENCE_GENOME) - self.assertEqual(0, read.reference_id) - self.assertEqual(Interval(0, 52), query_coverage_interval(read)) - self.assertEqual(950, read.reference_start) - self.assertEqual(1003, read.reference_end) - self.assertEqual([(CIGAR.EQ, 53)], read.cigar) + read = Blat.pslx_row_to_pysam(pslx_row, cache, REFERENCE_GENOME) + assert read.reference_id == 0 + assert query_coverage_interval(read) == Interval(0, 52) + assert read.reference_start == 950 + assert read.reference_end == 1003 + assert read.cigar == [(CIGAR.EQ, 53)] - def test_pslx_row_to_pysam_gapped_alignment(self): + def test_pslx_row_to_pysam_gapped_alignment(self, cache): pslx_row = { 'block_count': 1, 'tstarts': [950, 7233], @@ -175,13 +174,13 @@ def test_pslx_row_to_pysam_gapped_alignment(self): 'ATACTTCATGTTGCCATGTT', 'score': 1, } - read = Blat.pslx_row_to_pysam(pslx_row, self.cache, None) - self.assertEqual(0, read.reference_id) - self.assertEqual(Interval(0, 146), query_coverage_interval(read)) - self.assertEqual(950, read.reference_start) - self.assertEqual([(CIGAR.M, 47), (CIGAR.D, 6236), (CIGAR.M, 100)], read.cigar) + read = Blat.pslx_row_to_pysam(pslx_row, cache, None) + assert read.reference_id == 0 + assert query_coverage_interval(read) == Interval(0, 146) + assert read.reference_start == 950 + assert read.cigar == [(CIGAR.M, 47), (CIGAR.D, 6236), (CIGAR.M, 100)] - def test_pslx_row_to_pysam_gapped_alignment_with_reference(self): + def test_pslx_row_to_pysam_gapped_alignment_with_reference(self, cache): pslx_row = { 'block_count': 1, 'tstarts': [950, 7233], @@ -200,13 +199,13 @@ def test_pslx_row_to_pysam_gapped_alignment_with_reference(self): 'ATACTTCATGTTGCCATGTT', 'score': 1, } - read = Blat.pslx_row_to_pysam(pslx_row, self.cache, REFERENCE_GENOME) - self.assertEqual(0, read.reference_id) - self.assertEqual(Interval(0, 146), query_coverage_interval(read)) - self.assertEqual(950, read.reference_start) - self.assertEqual([(CIGAR.EQ, 53), 
(CIGAR.D, 6236), (CIGAR.EQ, 94)], read.cigar) + read = Blat.pslx_row_to_pysam(pslx_row, cache, REFERENCE_GENOME) + assert read.reference_id == 0 + assert query_coverage_interval(read) == Interval(0, 146) + assert read.reference_start == 950 + assert read.cigar == [(CIGAR.EQ, 53), (CIGAR.D, 6236), (CIGAR.EQ, 94)] - def test_pslx_row_to_pysam_revcomp_deletion(self): + def test_pslx_row_to_pysam_revcomp_deletion(self, cache): pslx_row = { 'block_count': 2, 'tstarts': [2205, 2281], @@ -226,17 +225,15 @@ def test_pslx_row_to_pysam_revcomp_deletion(self): 'CCAAATTCTGTGTTTACAGGGCTTTCATGCTCAG', ], } - read = Blat.pslx_row_to_pysam(pslx_row, self.cache, REFERENCE_GENOME) - self.assertEqual(3, read.reference_id) - self.assertEqual(Interval(0, 83), query_coverage_interval(read)) - self.assertEqual(2205, read.reference_start) - self.assertEqual([(CIGAR.EQ, 51), (CIGAR.D, 26), (CIGAR.EQ, 33)], read.cigar) - self.assertEqual( - 'TAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCA', read.query_sequence[0:50] - ) - self.assertEqual('CCAAATTCTGTGTTTACAGGGCTTTCATGCTCAG', read.query_sequence[50:]) + read = Blat.pslx_row_to_pysam(pslx_row, cache, REFERENCE_GENOME) + assert read.reference_id == 3 + assert query_coverage_interval(read) == Interval(0, 83) + assert read.reference_start == 2205 + assert read.cigar == [(CIGAR.EQ, 51), (CIGAR.D, 26), (CIGAR.EQ, 33)] + assert read.query_sequence[0:50] == 'TAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCA' + assert read.query_sequence[50:] == 'CCAAATTCTGTGTTTACAGGGCTTTCATGCTCAG' - def test_pslx_row_to_pysam_inversion(self): + def test_pslx_row_to_pysam_inversion(self, cache): s = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT' # first part of the inversion pslx_row = { @@ -258,11 +255,11 @@ def test_pslx_row_to_pysam_inversion(self): 
'TTTTCATTTCTGTATGTTAAT' ], } - read1 = Blat.pslx_row_to_pysam(pslx_row, self.cache, REFERENCE_GENOME) - self.assertEqual(3, read1.reference_id) - self.assertEqual(Interval(125, 244), query_coverage_interval(read1)) - self.assertEqual(1114, read1.reference_start) - self.assertEqual([(CIGAR.S, 125), (CIGAR.EQ, 120)], read1.cigar) + read1 = Blat.pslx_row_to_pysam(pslx_row, cache, REFERENCE_GENOME) + assert read1.reference_id == 3 + assert query_coverage_interval(read1) == Interval(125, 244) + assert read1.reference_start == 1114 + assert read1.cigar == [(CIGAR.S, 125), (CIGAR.EQ, 120)] # second part of the inversion pslx_row = { @@ -284,15 +281,15 @@ def test_pslx_row_to_pysam_inversion(self): 'TCTGTGTTTACAGGGCTTTCATGCTCAG' ], } - read2 = Blat.pslx_row_to_pysam(pslx_row, self.cache, REFERENCE_GENOME) - self.assertEqual(3, read2.reference_id) - self.assertEqual(2187, read2.reference_start) - self.assertEqual([(CIGAR.S, 117), (CIGAR.EQ, 128)], read2.cigar) - self.assertEqual(Interval(117, 244), query_coverage_interval(read2)) - self.assertEqual(read1.query_sequence, reverse_complement(read2.query_sequence)) + read2 = Blat.pslx_row_to_pysam(pslx_row, cache, REFERENCE_GENOME) + assert read2.reference_id == 3 + assert read2.reference_start == 2187 + assert read2.cigar == [(CIGAR.S, 117), (CIGAR.EQ, 128)] + assert query_coverage_interval(read2) == Interval(117, 244) + assert reverse_complement(read2.query_sequence) == read1.query_sequence # test that this is selected for duplication or insertion evidence - def test_pslx_row_to_pysam_duplication(self): + def test_pslx_row_to_pysam_duplication(self, cache): reference = { '14': MockObject( seq=MockLongString( @@ -312,12 +309,8 @@ def test_pslx_row_to_pysam_duplication(self): 'qseq_full': 
'AAGAAGGGTAACCTTAAAAAATACATTTCCCACTCCAGAAAATACTCATATGTGGCCTGTTAGCAGCACAAGAAGGGTGAAAGCAATGCCCATTCCTGCCTCCCTCCCCCTGCTCACCTCCACGTCCCTGTTTGCCCCTTTACTCATATGTGGCCTGTTAGCAGCACAAGAAGGGTGAAAGCAATGCCCATTCCTGCCTCCCTCCCCCTGCTCACCTCCACGTCCCTGTTTGCCCCTTTGTAGGTGAAGTGAGTATATTCAGCGTCTTC', 'score': 1, } - read2 = Blat.pslx_row_to_pysam(pslx_row, self.cache, reference) - self.assertEqual(13, read2.reference_id) - self.assertEqual(73014606, read2.reference_start) - self.assertEqual( - [(CIGAR.M, 141), (CIGAR.I, 98), (CIGAR.M, 30)], _cigar.convert_for_igv(read2.cigar) - ) - self.assertEqual( - Interval(0, len(pslx_row['qseq_full']) - 1), query_coverage_interval(read2) - ) + read2 = Blat.pslx_row_to_pysam(pslx_row, cache, reference) + assert read2.reference_id == 13 + assert read2.reference_start == 73014606 + assert _cigar.convert_for_igv(read2.cigar) == [(CIGAR.M, 141), (CIGAR.I, 98), (CIGAR.M, 30)] + assert query_coverage_interval(read2) == Interval(0, len(pslx_row['qseq_full']) - 1) diff --git a/tests/integration/test_breakpoint.py b/tests/integration/test_breakpoint.py index 659cf486..f6e0b3bb 100644 --- a/tests/integration/test_breakpoint.py +++ b/tests/integration/test_breakpoint.py @@ -1,14 +1,14 @@ -import unittest from functools import partial +import pytest from mavis.annotate.file_io import load_reference_genome from mavis.breakpoint import Breakpoint, BreakpointPair -from mavis.constants import CIGAR, ORIENT, STRAND, reverse_complement +from mavis.constants import ORIENT, STRAND from mavis.interval import Interval from mavis.validate.evidence import TranscriptomeEvidence from ..util import get_data -from . import MockObject, MockRead, get_example_genes +from . 
import MockObject, get_example_genes REFERENCE_GENOME = None REF_CHR = 'fake' @@ -24,55 +24,61 @@ def setUpModule(): raise AssertionError('fake genome file does not have the expected contents') -class TestNetSizeTransEGFR(unittest.TestCase): - def setUp(self): - self.evidence = MockObject( - annotations={}, - read_length=100, - max_expected_fragment_size=550, - call_error=11, - overlapping_transcripts=set(get_example_genes()['EGFR'].transcripts), - ) - setattr( - self.evidence, '_select_transcripts', lambda *pos: self.evidence.overlapping_transcripts - ) - setattr(self.evidence, 'distance', partial(TranscriptomeEvidence.distance, self.evidence)) +@pytest.fixture +def egfr_evidence(): + evidence = MockObject( + annotations={}, + read_length=100, + max_expected_fragment_size=550, + call_error=11, + overlapping_transcripts=set(get_example_genes()['EGFR'].transcripts), + ) + setattr(evidence, '_select_transcripts', lambda *pos: evidence.overlapping_transcripts) + setattr(evidence, 'distance', partial(TranscriptomeEvidence.distance, evidence)) + return evidence - def egfr_distance(self, pos1, pos2): - return TranscriptomeEvidence.distance(self.evidence, pos1, pos2) - def test_deletion_in_exon(self): +class TestNetSizeTransEGFR: + def test_deletion_in_exon(self, egfr_evidence): bpp = BreakpointPair( Breakpoint('7', 55238890, orient=ORIENT.LEFT), Breakpoint('7', 55238899, orient=ORIENT.RIGHT), untemplated_seq='', ) - self.assertEqual(Interval(-8), bpp.net_size(self.egfr_distance)) + assert bpp.net_size( + lambda p1, p2: TranscriptomeEvidence.distance(egfr_evidence, p1, p2) + ) == Interval(-8) bpp = BreakpointPair( Breakpoint('7', 55238890, orient=ORIENT.LEFT), Breakpoint('7', 55238899, orient=ORIENT.RIGHT), untemplated_seq='GTAC', ) - self.assertEqual(Interval(-4), bpp.net_size(self.egfr_distance)) + assert bpp.net_size( + lambda p1, p2: TranscriptomeEvidence.distance(egfr_evidence, p1, p2) + ) == Interval(-4) - def test_deletion_across_intron(self): + def 
test_deletion_across_intron(self, egfr_evidence): # 55240539_55240621 55323947_55324313 bpp = BreakpointPair( Breakpoint('7', 55240610, orient=ORIENT.LEFT), Breakpoint('7', 55323950, orient=ORIENT.RIGHT), untemplated_seq='GTAC', ) - self.assertEqual(Interval(-10), bpp.net_size(self.egfr_distance)) + assert bpp.net_size( + lambda p1, p2: TranscriptomeEvidence.distance(egfr_evidence, p1, p2) + ) == Interval(-10) # 55210998_55211181 55218987_55219055 bpp = BreakpointPair( Breakpoint('7', 55211180, orient=ORIENT.LEFT), Breakpoint('7', 55218990, orient=ORIENT.RIGHT), untemplated_seq='', ) - self.assertEqual(Interval(-4 + -135, -4), bpp.net_size(self.egfr_distance)) + assert bpp.net_size( + lambda p1, p2: TranscriptomeEvidence.distance(egfr_evidence, p1, p2) + ) == Interval(-4 + -135, -4) - def test_insertion_at_exon_start_mixed(self): + def test_insertion_at_exon_start_mixed(self, egfr_evidence): # EXON 15: 55232973-55233130 # EXON 16: 55238868-55238906 # EXON 17: 55240676-55240817 @@ -81,55 +87,67 @@ def test_insertion_at_exon_start_mixed(self): Breakpoint('7', 55238868, orient=ORIENT.RIGHT), untemplated_seq='TTATCG', ) - self.assertEqual(Interval(6), bpp.net_size(self.egfr_distance)) + assert bpp.net_size( + lambda p1, p2: TranscriptomeEvidence.distance(egfr_evidence, p1, p2) + ) == Interval(6) - def test_insertion_at_exon_start(self): + def test_insertion_at_exon_start(self, egfr_evidence): # 55238868_55238906 bpp = BreakpointPair( Breakpoint('7', 55233130, orient=ORIENT.LEFT), Breakpoint('7', 55238868, orient=ORIENT.RIGHT), untemplated_seq='TTATCG', ) - self.assertEqual(Interval(6), bpp.net_size(self.egfr_distance)) + assert bpp.net_size( + lambda p1, p2: TranscriptomeEvidence.distance(egfr_evidence, p1, p2) + ) == Interval(6) - def test_insertion_at_exon_end_mixed(self): + def test_insertion_at_exon_end_mixed(self, egfr_evidence): # 55238868_55238906 bpp = BreakpointPair( Breakpoint('7', 55238905, orient=ORIENT.LEFT), Breakpoint('7', 55238906, orient=ORIENT.RIGHT), 
untemplated_seq='TTATCG', ) - self.assertEqual(Interval(6), bpp.net_size(self.egfr_distance)) + assert bpp.net_size( + lambda p1, p2: TranscriptomeEvidence.distance(egfr_evidence, p1, p2) + ) == Interval(6) - def test_insertion_at_exon_end(self): + def test_insertion_at_exon_end(self, egfr_evidence): # 55238868_55238906 bpp = BreakpointPair( Breakpoint('7', 55238906, orient=ORIENT.LEFT), Breakpoint('7', 55240676, orient=ORIENT.RIGHT), untemplated_seq='TTATCG', ) - self.assertEqual(Interval(6), bpp.net_size(self.egfr_distance)) + assert bpp.net_size( + lambda p1, p2: TranscriptomeEvidence.distance(egfr_evidence, p1, p2) + ) == Interval(6) - def test_insertion_in_intron(self): + def test_insertion_in_intron(self, egfr_evidence): # 55238868_55238906 bpp = BreakpointPair( Breakpoint('7', 5523750, orient=ORIENT.LEFT), Breakpoint('7', 5523751, orient=ORIENT.RIGHT), untemplated_seq='TTATCG', ) - self.assertEqual(Interval(6), bpp.net_size(self.egfr_distance)) + assert bpp.net_size( + lambda p1, p2: TranscriptomeEvidence.distance(egfr_evidence, p1, p2) + ) == Interval(6) - def test_indel_in_intron(self): + def test_indel_in_intron(self, egfr_evidence): # 55238868_55238906 bpp = BreakpointPair( Breakpoint('7', 5523700, orient=ORIENT.LEFT), Breakpoint('7', 5523751, orient=ORIENT.RIGHT), untemplated_seq='TTATCG', ) - self.assertEqual(Interval(-44), bpp.net_size(self.egfr_distance)) + assert bpp.net_size( + lambda p1, p2: TranscriptomeEvidence.distance(egfr_evidence, p1, p2) + ) == Interval(-44) -class TestLt(unittest.TestCase): +class TestLt: def test_break1(self): bpp1 = BreakpointPair( Breakpoint('1', 1, 10, orient=ORIENT.LEFT), @@ -141,7 +159,7 @@ def test_break1(self): Breakpoint('2', 1, orient=ORIENT.LEFT), untemplated_seq='', ) - self.assertTrue(bpp2 < bpp1) + assert bpp2 < bpp1 def test_useq(self): bpp1 = BreakpointPair( @@ -154,7 +172,7 @@ def test_useq(self): Breakpoint('2', 1, orient=ORIENT.LEFT), untemplated_seq=None, ) - self.assertTrue(bpp2 > bpp1) + assert bpp2 > 
bpp1 def test_break2(self): bpp1 = BreakpointPair( @@ -167,20 +185,20 @@ def test_break2(self): Breakpoint('2', 1, orient=ORIENT.LEFT), untemplated_seq=None, ) - self.assertTrue(bpp2 < bpp1) + assert bpp2 < bpp1 -class TestBreakpointSequenceHomology(unittest.TestCase): +class TestBreakpointSequenceHomology: def test_left_pos_right_pos(self): b1 = Breakpoint(REF_CHR, 157, strand=STRAND.POS, orient=ORIENT.LEFT) b2 = Breakpoint(REF_CHR, 1788, strand=STRAND.POS, orient=ORIENT.RIGHT) bpp = BreakpointPair(b1, b2) - self.assertEqual(('CAATGC', ''), bpp.breakpoint_sequence_homology(REFERENCE_GENOME)) + assert bpp.breakpoint_sequence_homology(REFERENCE_GENOME) == ('CAATGC', '') b1 = Breakpoint(REF_CHR, 589, strand=STRAND.POS, orient=ORIENT.LEFT) b2 = Breakpoint(REF_CHR, 704, strand=STRAND.POS, orient=ORIENT.RIGHT) bpp = BreakpointPair(b1, b2) - self.assertEqual(('TTAA', 'ATAGC'), bpp.breakpoint_sequence_homology(REFERENCE_GENOME)) + assert bpp.breakpoint_sequence_homology(REFERENCE_GENOME) == ('TTAA', 'ATAGC') def test_left_pos_left_neg(self): # CCC|AAA ------------ TTT|GGG @@ -189,7 +207,7 @@ def test_left_pos_left_neg(self): b1 = Breakpoint(REF_CHR, 1459, strand=STRAND.POS, orient=ORIENT.LEFT) b2 = Breakpoint(REF_CHR, 2914, strand=STRAND.NEG, orient=ORIENT.LEFT) bpp = BreakpointPair(b1, b2) - self.assertEqual(('CCC', 'TTT'), bpp.breakpoint_sequence_homology(REFERENCE_GENOME)) + assert bpp.breakpoint_sequence_homology(REFERENCE_GENOME) == ('CCC', 'TTT') def test_left_neg_left_pos(self): # CCC|AAA ------------ TTT|GGG @@ -198,7 +216,7 @@ def test_left_neg_left_pos(self): b1 = Breakpoint(REF_CHR, 1459, strand=STRAND.NEG, orient=ORIENT.LEFT) b2 = Breakpoint(REF_CHR, 2914, strand=STRAND.POS, orient=ORIENT.LEFT) bpp = BreakpointPair(b1, b2) - self.assertEqual(('CCC', 'TTT'), bpp.breakpoint_sequence_homology(REFERENCE_GENOME)) + assert bpp.breakpoint_sequence_homology(REFERENCE_GENOME) == ('CCC', 'TTT') def test_right_pos_right_neg(self): # CCC|AAA ------------ TTT|GGG @@ -207,7 
+225,7 @@ def test_right_pos_right_neg(self): b1 = Breakpoint(REF_CHR, 1460, strand=STRAND.POS, orient=ORIENT.RIGHT) b2 = Breakpoint(REF_CHR, 2915, strand=STRAND.NEG, orient=ORIENT.RIGHT) bpp = BreakpointPair(b1, b2) - self.assertEqual(('AAA', 'GGG'), bpp.breakpoint_sequence_homology(REFERENCE_GENOME)) + assert bpp.breakpoint_sequence_homology(REFERENCE_GENOME) == ('AAA', 'GGG') def test_right_neg_right_pos(self): # CCC|AAA ------------ TTT|GGG @@ -216,14 +234,14 @@ def test_right_neg_right_pos(self): b1 = Breakpoint(REF_CHR, 1460, strand=STRAND.NEG, orient=ORIENT.RIGHT) b2 = Breakpoint(REF_CHR, 2915, strand=STRAND.POS, orient=ORIENT.RIGHT) bpp = BreakpointPair(b1, b2) - self.assertEqual(('AAA', 'GGG'), bpp.breakpoint_sequence_homology(REFERENCE_GENOME)) + assert bpp.breakpoint_sequence_homology(REFERENCE_GENOME) == ('AAA', 'GGG') def test_close_del(self): # ....TT|TT.... b1 = Breakpoint(REF_CHR, 1001, strand=STRAND.POS, orient=ORIENT.LEFT) b2 = Breakpoint(REF_CHR, 1002, strand=STRAND.POS, orient=ORIENT.RIGHT) bpp = BreakpointPair(b1, b2) - self.assertEqual(('', ''), bpp.breakpoint_sequence_homology(REFERENCE_GENOME)) + assert bpp.breakpoint_sequence_homology(REFERENCE_GENOME) == ('', '') def test_close_dup(self): # ....GATACATTTCTTCTTGAAAA... 
@@ -234,11 +252,11 @@ def test_close_dup(self): b1 = Breakpoint(REF_CHR, 745, strand=STRAND.POS, orient=ORIENT.RIGHT) b2 = Breakpoint(REF_CHR, 747, strand=STRAND.POS, orient=ORIENT.LEFT) bpp = BreakpointPair(b1, b2) - self.assertEqual(('CT', 'TT'), bpp.breakpoint_sequence_homology(REFERENCE_GENOME)) + assert bpp.breakpoint_sequence_homology(REFERENCE_GENOME) == ('CT', 'TT') def test_non_specific_error(self): b1 = Breakpoint(REF_CHR, 740, 745, strand=STRAND.POS, orient=ORIENT.RIGHT) b2 = Breakpoint(REF_CHR, 747, strand=STRAND.POS, orient=ORIENT.LEFT) bpp = BreakpointPair(b1, b2) - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): bpp.breakpoint_sequence_homology(REFERENCE_GENOME) diff --git a/tests/integration/test_cluster.py b/tests/integration/test_cluster.py index cfffbf59..3434c62b 100644 --- a/tests/integration/test_cluster.py +++ b/tests/integration/test_cluster.py @@ -15,7 +15,7 @@ REF_CHR = 'fake' -class TestFullClustering(unittest.TestCase): +class TestFullClustering: def test_mocked_events(self): # none of the 24 events in the mocked file should cluster together # if we change the mock file we may need to update this function @@ -26,7 +26,7 @@ def test_mocked_events(self): if bpp.data[COLUMNS.protocol] == PROTOCOL.GENOME: bpps.append(bpp) print(bpp) - self.assertEqual(28, len(bpps)) + assert len(bpps) == 28 clusters = merge_breakpoint_pairs(bpps, 10, 10) for cluster, input_pairs in sorted( @@ -35,8 +35,8 @@ def test_mocked_events(self): print(cluster) for ip in input_pairs: print('\t', ip) - self.assertEqual(1, len(input_pairs)) - self.assertEqual(len(bpps), len(clusters)) + assert len(input_pairs) == 1 + assert len(clusters) == len(bpps) def test_clustering_events(self): # this file contains 2 events that should be clustered and produce a valid bpp @@ -47,10 +47,10 @@ def test_clustering_events(self): if bpp.data[COLUMNS.protocol] == PROTOCOL.GENOME: bpps.append(bpp) print(bpp) - self.assertEqual(2, len(bpps)) + assert 
len(bpps) == 2 clusters = merge_breakpoint_pairs(bpps, 200, 25) - self.assertEqual(1, len(clusters)) + assert len(clusters) == 1 for cluster, input_pairs in sorted( clusters.items(), key=lambda x: (x[1][0].break1.chr, x[1][0].break2.chr) @@ -60,17 +60,17 @@ def test_clustering_events(self): print('\t', ip) print(cluster.flatten()) # BPP(Breakpoint(15:67333604L), Breakpoint(15:67333606R), opposing=False) - self.assertEqual('L', cluster.break1.orient) - self.assertEqual('R', cluster.break2.orient) - self.assertEqual('15', cluster.break1.chr) - self.assertEqual('15', cluster.break2.chr) - self.assertEqual(67333604, cluster.break1.start) - self.assertEqual(67333606, cluster.break2.start) - self.assertEqual(67333604, cluster.break1.end) - self.assertEqual(67333606, cluster.break2.end) + assert cluster.break1.orient == 'L' + assert cluster.break2.orient == 'R' + assert cluster.break1.chr == '15' + assert cluster.break2.chr == '15' + assert cluster.break1.start == 67333604 + assert cluster.break2.start == 67333606 + assert cluster.break1.end == 67333604 + assert cluster.break2.end == 67333606 -class TestMergeBreakpointPairs(unittest.TestCase): +class TestMergeBreakpointPairs: def test_order_is_retained(self): # BPP(Breakpoint(1:1925143-1925155R), Breakpoint(1:1925144L), opposing=False) # >> BPP(Breakpoint(1:1925144L), Breakpoint(1:1925144-1925158R), opposing=False) @@ -93,10 +93,10 @@ def test_order_is_retained(self): for merge, inputs in mapping.items(): print(merge) print(inputs) - self.assertEqual(1, len(mapping)) + assert len(mapping) == 1 merge = list(mapping)[0] - self.assertEqual('L', merge.break1.orient) - self.assertEqual('R', merge.break2.orient) + assert merge.break1.orient == 'L' + assert merge.break2.orient == 'R' def test_merging_identical_large_inputs(self): b1 = BreakpointPair( @@ -110,17 +110,17 @@ def test_merging_identical_large_inputs(self): opposing_strands=False, ) mapping = merge_breakpoint_pairs([b1, b2], 100, 25, verbose=True) - 
self.assertEqual(1, len(mapping)) + assert len(mapping) == 1 merge = list(mapping)[0] - self.assertEqual(2, len(mapping[merge])) - self.assertEqual('L', merge.break1.orient) - self.assertEqual('R', merge.break2.orient) - self.assertEqual('11', merge.break1.chr) - self.assertEqual('11', merge.break2.chr) - self.assertEqual(12856838, merge.break1.start) - self.assertEqual(12856840, merge.break2.start) # putative indel will be shifted - self.assertEqual(12897006, merge.break1.end) - self.assertEqual(12897006, merge.break2.end) + assert len(mapping[merge]) == 2 + assert merge.break1.orient == 'L' + assert merge.break2.orient == 'R' + assert merge.break1.chr == '11' + assert merge.break2.chr == '11' + assert merge.break1.start == 12856838 + assert merge.break2.start == 12856840 # putative indel will be shifted + assert merge.break1.end == 12897006 + assert merge.break2.end == 12897006 def test_events_separate(self): bpps = [ @@ -150,28 +150,28 @@ def test_events_separate(self): ), ] mapping = merge_breakpoint_pairs(bpps, 100, 25, verbose=True) - self.assertEqual(2, len(mapping)) + assert len(mapping) == 2 -class TestMergeIntervals(unittest.TestCase): +class TestMergeIntervals: def test_merge_even_length(self): i1 = Interval(1001, 1002) result = merge_integer_intervals(i1, i1, weight_adjustment=25) - self.assertEqual(i1, result) + assert result == i1 def test_merge_odd_length(self): i1 = Interval(1001, 1003) result = merge_integer_intervals(i1, i1, weight_adjustment=25) - self.assertEqual(i1, result) + assert result == i1 def test_merge_large_length(self): i1 = Interval(1001, 5003) result = merge_integer_intervals(i1, i1, weight_adjustment=25) - self.assertEqual(i1, result) + assert result == i1 i1 = Interval(12856838, 12897006) result = merge_integer_intervals(i1, i1, weight_adjustment=25) - self.assertEqual(i1, result) + assert result == i1 if __name__ == '__main__': diff --git a/tests/integration/test_illustrate.py b/tests/integration/test_illustrate.py index 
ce377207..0ce2bdae 100644 --- a/tests/integration/test_illustrate.py +++ b/tests/integration/test_illustrate.py @@ -1,7 +1,6 @@ -import os import random -import unittest +import pytest from mavis.annotate import fusion, genomic, protein, variant from mavis.annotate.base import BioInterval from mavis.annotate.file_io import load_templates @@ -33,10 +32,12 @@ def setUpModule(): TEMPLATE_METADATA = load_templates(get_data('cytoBand.txt')) -class TestDraw(unittest.TestCase): - def setUp(self): - self.canvas = Drawing(height=100, width=1000) +@pytest.fixture +def canvas(): + return Drawing(height=100, width=1000) + +class TestDraw: def test_generate_interval_mapping_outside_range_error(self): temp = [ Interval(48556470, 48556646), @@ -58,14 +59,14 @@ def test_generate_interval_mapping_outside_range_error(self): Interval.convert_pos(mapping, st) Interval.convert_pos(mapping, end) - def test_generate_gene_mapping_err(self): + def test_generate_gene_mapping_err(self, canvas): # _generate_interval_mapping [genomic.IntergenicRegion(11:77361962_77361962+)] 1181.39453125 5 30 None 77356962 77366962) ir = genomic.IntergenicRegion('11', 5000, 5000, STRAND.POS) tgt_width = 1000 d = DiagramSettings(domain_name_regex_filter=r'.*') d.gene_min_buffer = 10 # (self, canvas, gene, width, height, fill, label='', reference_genome=None) - draw_genes(d, self.canvas, [ir], tgt_width, []) + draw_genes(d, canvas, [ir], tgt_width, []) # _generate_interval_mapping ['Interval(29684391, 29684391)', 'Interval(29663998, 29696515)'] 1181.39453125 5 60 None 29662998 29697515 # def generate_interval_mapping(cls, input_intervals, target_width, ratio, min_width, buffer_length=None, start=None, end=None, min_inter_width=None) @@ -77,12 +78,12 @@ def test_split_intervals_into_tracks(self): # ------======-------- # -----=============== t = split_intervals_into_tracks([(1, 3), (3, 7), (2, 2), (4, 5), (3, 10)]) - self.assertEqual(3, len(t)) - self.assertEqual([(1, 3), (4, 5)], t[0]) - self.assertEqual([(2, 
2), (3, 7)], t[1]) - self.assertEqual([(3, 10)], t[2]) + assert len(t) == 3 + assert t[0] == [(1, 3), (4, 5)] + assert t[1] == [(2, 2), (3, 7)] + assert t[2] == [(3, 10)] - def test_draw_genes(self): + def test_draw_genes(self, canvas): x = genomic.Gene('1', 1000, 2000, strand=STRAND.POS) y = genomic.Gene('1', 5000, 7000, strand=STRAND.NEG) @@ -92,7 +93,7 @@ def test_draw_genes(self): breakpoints = [Breakpoint('1', 1100, 1200, orient=ORIENT.RIGHT)] g = draw_genes( d, - self.canvas, + canvas, [x, y, z], 500, breakpoints, @@ -100,24 +101,24 @@ def test_draw_genes(self): ) # test the class structure - self.assertEqual(6, len(g.elements)) - self.assertEqual('scaffold', g.elements[0].attribs.get('class', '')) + assert len(g.elements) == 6 + assert g.elements[0].attribs.get('class', '') == 'scaffold' for i in range(1, 4): - self.assertEqual('gene', g.elements[i].attribs.get('class', '')) - self.assertEqual('mask', g.elements[4].attribs.get('class', '')) - self.assertEqual('breakpoint', g.elements[5].attribs.get('class', '')) - self.assertEqual( - d.track_height * 2 + d.padding + d.breakpoint_bottom_margin + d.breakpoint_top_margin, - g.height, + assert g.elements[i].attribs.get('class', '') == 'gene' + assert g.elements[4].attribs.get('class', '') == 'mask' + assert g.elements[5].attribs.get('class', '') == 'breakpoint' + assert ( + g.height + == d.track_height * 2 + d.padding + d.breakpoint_bottom_margin + d.breakpoint_top_margin ) - self.canvas.add(g) - self.assertEqual(len(g.labels), 4) - self.assertEqual(x, g.labels['G1']) - self.assertEqual(z, g.labels['G2']) - self.assertEqual(y, g.labels['G3']) - self.assertEqual(breakpoints[0], g.labels['B1']) - - def test_draw_ustranscript(self): + canvas.add(g) + assert 4 == len(g.labels) + assert g.labels['G1'] == x + assert g.labels['G2'] == z + assert g.labels['G3'] == y + assert g.labels['B1'] == breakpoints[0] + + def test_draw_ustranscript(self, canvas): d = DiagramSettings(domain_name_regex_filter=r'.*') # domains = 
[protein.Domain()] d1 = protein.Domain('first', [(55, 61), (71, 73)]) @@ -132,43 +133,38 @@ def test_draw_ustranscript(self): domains=[d2, d1], ) b = Breakpoint('1', 350, 410, orient=ORIENT.LEFT) - g = draw_ustranscript( - d, self.canvas, t, 500, colors={t.exons[1]: '#FFFF00'}, breakpoints=[b] - ) - self.canvas.add(g) - # self.canvas.saveas('test_draw_ustranscript.svg') - self.assertEqual(2, len(self.canvas.elements)) - self.assertEqual(3, len(g.elements)) + g = draw_ustranscript(d, canvas, t, 500, colors={t.exons[1]: '#FFFF00'}, breakpoints=[b]) + canvas.add(g) + # canvas.saveas('test_draw_ustranscript.svg') + assert len(canvas.elements) == 2 + assert len(g.elements) == 3 for el, cls in zip(g.elements[0].elements, ['splicing', 'exon_track', 'protein']): - self.assertEqual(cls, el.attribs.get('class', '')) + assert el.attribs.get('class', '') == cls for el, cls in zip( g.elements[0].elements[1].elements, ['scaffold', 'exon', 'exon', 'exon'] ): - self.assertEqual(cls, el.attribs.get('class', '')) + assert el.attribs.get('class', '') == cls for el, cls in zip(g.elements[0].elements[2].elements, ['translation', 'domain', 'domain']): - self.assertEqual(cls, el.attribs.get('class', '')) - - self.assertEqual( - sum( - [ - d.track_height, - d.splice_height, - 2 * d.padding, - d.domain_track_height * 2, - d.translation_track_height, - d.padding, - d.breakpoint_top_margin, - d.breakpoint_bottom_margin, - ] - ), - g.height, - ) - self.assertEqual(d1.name, g.labels['D1']) - self.assertEqual(d2.name, g.labels['D2']) - - def test_draw_consec_exons(self): + assert el.attribs.get('class', '') == cls + + assert g.height == sum( + [ + d.track_height, + d.splice_height, + 2 * d.padding, + d.domain_track_height * 2, + d.translation_track_height, + d.padding, + d.breakpoint_top_margin, + d.breakpoint_bottom_margin, + ] + ) + assert g.labels['D1'] == d1.name + assert g.labels['D2'] == d2.name + + def test_draw_consec_exons(self, canvas): d = 
DiagramSettings(domain_name_regex_filter=r'.*') # domains = [protein.Domain()] t = build_transcript( @@ -180,32 +176,30 @@ def test_draw_consec_exons(self): domains=[], ) b = Breakpoint('1', 350, 410, orient=ORIENT.LEFT) - g = draw_ustranscript( - d, self.canvas, t, 500, colors={t.exons[1]: '#FFFF00'}, breakpoints=[b] - ) - self.canvas.add(g) + g = draw_ustranscript(d, canvas, t, 500, colors={t.exons[1]: '#FFFF00'}, breakpoints=[b]) + canvas.add(g) if OUTPUT_SVG: - self.canvas.saveas('test_draw_consec_exons.svg') + canvas.saveas('test_draw_consec_exons.svg') - # self.canvas.saveas('test_draw_ustranscript.svg') - self.assertEqual(2, len(self.canvas.elements)) - self.assertEqual(3, len(g.elements)) + # canvas.saveas('test_draw_ustranscript.svg') + assert len(canvas.elements) == 2 + assert len(g.elements) == 3 # check that only 2 splicing marks were created - self.assertEqual(2, len(g.elements[0].elements[0].elements)) + assert len(g.elements[0].elements[0].elements) == 2 # get the second exon ex2 = g.elements[0].elements[1].elements[2].elements[0] print(ex2) - self.assertAlmostEqual(120.7783426339, ex2.attribs.get('width')) + assert pytest.approx(ex2.attribs.get('width')) == 120.7783426339 # get the third exon ex3 = g.elements[0].elements[1].elements[3].elements[0] print(ex3) - self.assertAlmostEqual(96.52494419642852, ex3.attribs.get('width')) + assert pytest.approx(ex3.attribs.get('width')) == 96.52494419642852 def test_dynamic_label_color(self): - self.assertEqual(HEX_WHITE, dynamic_label_color(HEX_BLACK)) - self.assertEqual(HEX_BLACK, dynamic_label_color(HEX_WHITE)) + assert dynamic_label_color(HEX_BLACK) == HEX_WHITE + assert dynamic_label_color(HEX_WHITE) == HEX_BLACK - def test_draw_legend(self): + def test_draw_legend(self, canvas): d = DiagramSettings(domain_name_regex_filter=r'.*') swatches = [ ('#000000', 'black'), @@ -214,19 +208,20 @@ def test_draw_legend(self): ('#00FF00', 'green'), ('#FFFF00', 'yellow'), ] - g = draw_legend(d, self.canvas, swatches) - 
self.canvas.add(g) + g = draw_legend(d, canvas, swatches) + canvas.add(g) - self.assertEqual('legend', g.attribs.get('class', '')) - self.assertEqual( - d.legend_swatch_size * len(swatches) + d.padding * (len(swatches) - 1 + 2), g.height + assert g.attribs.get('class', '') == 'legend' + assert g.height == d.legend_swatch_size * len(swatches) + d.padding * ( + len(swatches) - 1 + 2 ) - self.assertEqual(6, len(g.elements)) - self.assertEqual( - 6 * d.legend_font_size * d.font_width_height_ratio + + assert len(g.elements) == 6 + assert ( + g.width + == 6 * d.legend_font_size * d.font_width_height_ratio + d.padding * 3 - + d.legend_swatch_size, - g.width, + + d.legend_swatch_size ) def test_draw_layout_single_transcript(self): @@ -247,7 +242,7 @@ def test_draw_layout_single_transcript(self): ft = variant.FusionTranscript.build(ann, reference_genome) ann.fusion = ft canvas, legend = draw_sv_summary_diagram(d, ann) - self.assertEqual(4, len(canvas.elements)) # defs counts as element + assert len(canvas.elements) == 4 # defs counts as element expected_height = ( d.top_margin + d.bottom_margin @@ -269,7 +264,7 @@ def test_draw_layout_single_transcript(self): ) if OUTPUT_SVG: canvas.saveas('test_draw_layout_single_transcript.svg') - self.assertEqual(expected_height, canvas.attribs['height']) + assert canvas.attribs['height'] == expected_height def test_draw_layout_single_genomic(self): d = DiagramSettings(domain_name_regex_filter=r'.*') @@ -305,12 +300,12 @@ def test_draw_layout_single_genomic(self): ft = variant.FusionTranscript.build(ann, reference_genome) ann.fusion = ft - self.assertEqual(t1.exons[0], ft.exon_mapping[ft.exons[0].position]) - self.assertEqual(t2.exons[2], ft.exon_mapping[ft.exons[1].position]) - self.assertEqual(t2.exons[3], ft.exon_mapping[ft.exons[2].position]) + assert ft.exon_mapping[ft.exons[0].position] == t1.exons[0] + assert ft.exon_mapping[ft.exons[1].position] == t2.exons[2] + assert ft.exon_mapping[ft.exons[2].position] == t2.exons[3] canvas, 
legend = draw_sv_summary_diagram(d, ann) - self.assertEqual(5, len(canvas.elements)) # defs counts as element + assert len(canvas.elements) == 5 # defs counts as element expected_height = ( d.top_margin @@ -331,7 +326,7 @@ def test_draw_layout_single_genomic(self): + d.track_height + d.splice_height ) - self.assertEqual(expected_height, canvas.attribs['height']) + assert canvas.attribs['height'] == expected_height if OUTPUT_SVG: canvas.saveas('test_draw_layout_single_genomic.svg') @@ -378,7 +373,7 @@ def test_draw_layout_translocation(self): ft = variant.FusionTranscript.build(ann, reference_genome) ann.fusion = ft canvas, legend = draw_sv_summary_diagram(d, ann) - self.assertEqual(6, len(canvas.elements)) # defs counts as element + assert len(canvas.elements) == 6 # defs counts as element expected_height = ( d.top_margin + d.bottom_margin @@ -399,7 +394,7 @@ def test_draw_layout_translocation(self): + d.track_height + d.splice_height ) - self.assertEqual(expected_height, canvas.attribs['height']) + assert canvas.attribs['height'] == expected_height def test_draw_template(self): # def draw_template(self, canvas, template, target_width, height, labels=None, colors=None): @@ -417,12 +412,13 @@ def test_draw_template(self): canvas = Drawing(size=(1000, 50)) g = draw_template(d, canvas, TEMPLATE_METADATA['1'], 1000) - self.assertEqual( - d.breakpoint_top_margin + d.breakpoint_bottom_margin + d.template_track_height, g.height + assert ( + g.height + == d.breakpoint_top_margin + d.breakpoint_bottom_margin + d.template_track_height ) canvas.add(g) canvas.attribs['height'] = g.height - self.assertEqual(2, len(canvas.elements)) + assert len(canvas.elements) == 2 def test_draw_translocation_with_template(self): d = DiagramSettings(domain_name_regex_filter=r'.*') @@ -473,7 +469,7 @@ def test_draw_translocation_with_template(self): ) if OUTPUT_SVG: canvas.saveas('test_draw_translocation_with_template.svg') - self.assertEqual(8, len(canvas.elements)) # defs counts as element + 
assert len(canvas.elements) == 8 # defs counts as element expected_height = ( d.top_margin + d.bottom_margin @@ -497,7 +493,7 @@ def test_draw_translocation_with_template(self): + d.splice_height + d.template_track_height ) - self.assertAlmostEqual(expected_height, canvas.attribs['height']) + assert pytest.approx(canvas.attribs['height']) == expected_height def test_draw_overlay(self): gene = genomic.Gene('12', 25357723, 25403870, strand=STRAND.NEG, name='KRAS') @@ -553,7 +549,7 @@ def test_draw_overlay(self): d.gene_min_buffer = 0 canvas = draw_multi_transcript_overlay(d, gene, vmarkers=[marker], plots=[s, s]) - self.assertEqual(2, len(canvas.elements)) # defs counts as element + assert len(canvas.elements) == 2 # defs counts as element if OUTPUT_SVG: canvas.saveas('test_draw_overlay.svg') diff --git a/tests/integration/test_pairing.py b/tests/integration/test_pairing.py index 3a0064f8..4a3a77e1 100644 --- a/tests/integration/test_pairing.py +++ b/tests/integration/test_pairing.py @@ -1,101 +1,130 @@ import unittest +import pytest from mavis.annotate.genomic import PreTranscript from mavis.breakpoint import Breakpoint, BreakpointPair from mavis.constants import CALL_METHOD, COLUMNS, ORIENT, PROTOCOL, STRAND, SVTYPE from mavis.pairing import pairing -class TestPairing(unittest.TestCase): - def setUp(self): - self.gev1 = BreakpointPair( - Breakpoint('1', 1), - Breakpoint('1', 10), - opposing_strands=True, - **{ - COLUMNS.event_type: SVTYPE.DEL, - COLUMNS.call_method: CALL_METHOD.CONTIG, - COLUMNS.fusion_sequence_fasta_id: None, - COLUMNS.protocol: PROTOCOL.GENOME, - }, - ) - self.gev2 = BreakpointPair( - Breakpoint('1', 1), - Breakpoint('1', 10), - opposing_strands=True, - **{ - COLUMNS.event_type: SVTYPE.DEL, - COLUMNS.call_method: CALL_METHOD.CONTIG, - COLUMNS.fusion_sequence_fasta_id: None, - COLUMNS.protocol: PROTOCOL.GENOME, - }, - ) - - self.ust1 = PreTranscript( - exons=[(1, 100), (301, 400), (501, 600)], strand=STRAND.POS, name='t1' - ) - self.ust2 = 
PreTranscript( - exons=[(1001, 1100), (1301, 1400), (1501, 1600)], strand=STRAND.POS, name='t2' - ) - self.distances = {CALL_METHOD.CONTIG: 0, CALL_METHOD.FLANK: 0, CALL_METHOD.SPLIT: 10} - self.TRANSCRIPTS = {self.ust1.name: self.ust1, self.ust2.name: self.ust2} - - def test_genome_protocol_diff_chrom(self): - self.gev2.break1.chr = '2' - self.assertFalse(pairing.equivalent(self.gev1, self.gev2, self.TRANSCRIPTS)) - - def test_genome_protocol_diff_orient(self): - self.gev2.break1.orient = ORIENT.LEFT - self.gev1.break1.orient = ORIENT.RIGHT - self.assertFalse(pairing.equivalent(self.gev1, self.gev2, self.TRANSCRIPTS)) - - def test_genome_protocol_diff_strand(self): - self.gev2.break1.strand = STRAND.POS - self.gev1.break1.strand = STRAND.NEG - self.assertFalse(pairing.equivalent(self.gev1, self.gev2, self.TRANSCRIPTS)) - - def test_genome_protocol_diff_event_type(self): - self.gev2.data[COLUMNS.event_type] = SVTYPE.DEL - self.gev1.data[COLUMNS.event_type] = SVTYPE.INS - self.assertFalse(pairing.equivalent(self.gev1, self.gev2, self.TRANSCRIPTS)) - - def test_genome_protocol_ns_orient(self): - self.gev2.break1.orient = ORIENT.LEFT - self.gev1.break2.orient = ORIENT.RIGHT - self.assertTrue(pairing.equivalent(self.gev1, self.gev2, self.TRANSCRIPTS)) - - def test_genome_protocol_by_contig(self): - self.gev1.call_method = CALL_METHOD.CONTIG - self.gev2.call_method = CALL_METHOD.CONTIG - self.distances[CALL_METHOD.CONTIG] = 0 - self.distances[CALL_METHOD.SPLIT] = 10 - self.assertTrue(pairing.equivalent(self.gev1, self.gev2, distances=self.distances)) - - self.gev1.break1.start = 2 - self.gev1.break1.end = 20 - self.assertFalse(pairing.equivalent(self.gev1, self.gev2, distances=self.distances)) - - def test_genome_protocol_by_split(self): - self.gev1.call_method = CALL_METHOD.SPLIT - self.gev2.call_method = CALL_METHOD.SPLIT - self.assertTrue(pairing.equivalent(self.gev1, self.gev2, distances=self.distances)) - self.distances[CALL_METHOD.FLANK] = 100 - 
self.distances[CALL_METHOD.SPLIT] = 10 - self.gev1.break1.start = 11 - self.gev1.break1.end = 20 - self.assertFalse(pairing.equivalent(self.gev1, self.gev2, distances=self.distances)) - - def test_genome_protocol_by_flanking(self): - self.gev1.call_method = CALL_METHOD.FLANK - self.gev2.call_method = CALL_METHOD.FLANK - self.assertTrue(pairing.equivalent(self.gev1, self.gev2, distances=self.distances)) - self.distances[CALL_METHOD.FLANK] = 10 - self.distances[CALL_METHOD.SPLIT] = 100 - self.gev1.break1.start = 11 - self.gev1.break1.end = 20 - self.assertFalse(pairing.equivalent(self.gev1, self.gev2, distances=self.distances)) - - def test_mixed_protocol_fusions_same_sequence(self): +@pytest.fixture +def genomic_event1(): + return BreakpointPair( + Breakpoint('1', 1), + Breakpoint('1', 10), + opposing_strands=True, + **{ + COLUMNS.event_type: SVTYPE.DEL, + COLUMNS.call_method: CALL_METHOD.CONTIG, + COLUMNS.fusion_sequence_fasta_id: None, + COLUMNS.protocol: PROTOCOL.GENOME, + }, + ) + + +@pytest.fixture +def genomic_event2(): + return BreakpointPair( + Breakpoint('1', 1), + Breakpoint('1', 10), + opposing_strands=True, + **{ + COLUMNS.event_type: SVTYPE.DEL, + COLUMNS.call_method: CALL_METHOD.CONTIG, + COLUMNS.fusion_sequence_fasta_id: None, + COLUMNS.protocol: PROTOCOL.GENOME, + }, + ) + + +@pytest.fixture +def unspliced_transcript1(): + return PreTranscript(exons=[(1, 100), (301, 400), (501, 600)], strand=STRAND.POS, name='t1') + + +@pytest.fixture +def unspliced_transcript2(): + return PreTranscript( + exons=[(1001, 1100), (1301, 1400), (1501, 1600)], strand=STRAND.POS, name='t2' + ) + + +@pytest.fixture +def transcripts(unspliced_transcript1, unspliced_transcript2): + return { + unspliced_transcript1.name: unspliced_transcript1, + unspliced_transcript2.name: unspliced_transcript2, + } + + +@pytest.fixture +def distances(): + return {CALL_METHOD.CONTIG: 0, CALL_METHOD.FLANK: 0, CALL_METHOD.SPLIT: 10} + + +class TestPairing: + def 
test_genome_protocol_diff_chrom(self, genomic_event1, genomic_event2, transcripts): + genomic_event2.break1.chr = '2' + assert not pairing.equivalent(genomic_event1, genomic_event2, transcripts) + + def test_genome_protocol_diff_orient(self, genomic_event1, genomic_event2, transcripts): + genomic_event2.break1.orient = ORIENT.LEFT + genomic_event1.break1.orient = ORIENT.RIGHT + assert not pairing.equivalent(genomic_event1, genomic_event2, transcripts) + + def test_genome_protocol_diff_strand(self, genomic_event1, genomic_event2, transcripts): + genomic_event2.break1.strand = STRAND.POS + genomic_event1.break1.strand = STRAND.NEG + assert not pairing.equivalent(genomic_event1, genomic_event2, transcripts) + + def test_genome_protocol_diff_event_type(self, genomic_event1, genomic_event2, transcripts): + genomic_event2.data[COLUMNS.event_type] = SVTYPE.DEL + genomic_event1.data[COLUMNS.event_type] = SVTYPE.INS + assert not pairing.equivalent(genomic_event1, genomic_event2, transcripts) + + def test_genome_protocol_ns_orient(self, genomic_event1, genomic_event2, transcripts): + genomic_event2.break1.orient = ORIENT.LEFT + genomic_event1.break2.orient = ORIENT.RIGHT + assert pairing.equivalent(genomic_event1, genomic_event2, transcripts) + + def test_genome_protocol_by_contig( + self, genomic_event1, genomic_event2, transcripts, distances + ): + genomic_event1.call_method = CALL_METHOD.CONTIG + genomic_event2.call_method = CALL_METHOD.CONTIG + distances[CALL_METHOD.CONTIG] = 0 + distances[CALL_METHOD.SPLIT] = 10 + assert pairing.equivalent(genomic_event1, genomic_event2, distances=distances) + + genomic_event1.break1.start = 2 + genomic_event1.break1.end = 20 + assert not pairing.equivalent(genomic_event1, genomic_event2, distances=distances) + + def test_genome_protocol_by_split(self, genomic_event1, genomic_event2, transcripts, distances): + genomic_event1.call_method = CALL_METHOD.SPLIT + genomic_event2.call_method = CALL_METHOD.SPLIT + assert 
pairing.equivalent(genomic_event1, genomic_event2, distances=distances) + distances[CALL_METHOD.FLANK] = 100 + distances[CALL_METHOD.SPLIT] = 10 + genomic_event1.break1.start = 11 + genomic_event1.break1.end = 20 + assert not pairing.equivalent(genomic_event1, genomic_event2, distances=distances) + + def test_genome_protocol_by_flanking( + self, genomic_event1, genomic_event2, transcripts, distances + ): + genomic_event1.call_method = CALL_METHOD.FLANK + genomic_event2.call_method = CALL_METHOD.FLANK + assert pairing.equivalent(genomic_event1, genomic_event2, distances=distances) + distances[CALL_METHOD.FLANK] = 10 + distances[CALL_METHOD.SPLIT] = 100 + genomic_event1.break1.start = 11 + genomic_event1.break1.end = 20 + assert not pairing.equivalent(genomic_event1, genomic_event2, distances=distances) + + def test_mixed_protocol_fusions_same_sequence( + self, genomic_event1, genomic_event2, transcripts + ): genome_ev = BreakpointPair( Breakpoint('1', 1), Breakpoint('1', 10), @@ -126,12 +155,14 @@ def test_mixed_protocol_fusions_same_sequence(self): COLUMNS.fusion_cdna_coding_end: 10, }, ) - self.assertFalse(pairing.equivalent(genome_ev, trans_ev, self.TRANSCRIPTS)) + assert not pairing.equivalent(genome_ev, trans_ev, transcripts) genome_ev.data[COLUMNS.fusion_sequence_fasta_id] = 'a' trans_ev.data[COLUMNS.fusion_sequence_fasta_id] = 'a' - self.assertTrue(pairing.inferred_equivalent(genome_ev, trans_ev, self.TRANSCRIPTS)) + assert pairing.inferred_equivalent(genome_ev, trans_ev, transcripts) - def test_mixed_protocol_fusions_same_sequence_diff_translation(self): + def test_mixed_protocol_fusions_same_sequence_diff_translation( + self, genomic_event1, genomic_event2, transcripts + ): genome_ev = BreakpointPair( Breakpoint('1', 1), Breakpoint('1', 10), @@ -162,9 +193,11 @@ def test_mixed_protocol_fusions_same_sequence_diff_translation(self): COLUMNS.fusion_cdna_coding_end: 50, }, ) - self.assertFalse(pairing.inferred_equivalent(genome_ev, trans_ev, self.TRANSCRIPTS)) 
+ assert not pairing.inferred_equivalent(genome_ev, trans_ev, transcripts) - def test_mixed_protocol_fusions_different_sequence(self): + def test_mixed_protocol_fusions_different_sequence( + self, genomic_event1, genomic_event2, transcripts + ): genome_ev = BreakpointPair( Breakpoint('1', 1), Breakpoint('1', 10), @@ -195,9 +228,11 @@ def test_mixed_protocol_fusions_different_sequence(self): COLUMNS.fusion_cdna_coding_end: 10, }, ) - self.assertFalse(pairing.inferred_equivalent(genome_ev, trans_ev, self.TRANSCRIPTS)) + assert not pairing.inferred_equivalent(genome_ev, trans_ev, transcripts) - def test_mixed_protocol_one_predicted_one_match(self): + def test_mixed_protocol_one_predicted_one_match( + self, genomic_event1, genomic_event2, transcripts, unspliced_transcript1 + ): genome_ev = BreakpointPair( Breakpoint('1', 350, orient=ORIENT.LEFT), Breakpoint('1', 400, orient=ORIENT.RIGHT), @@ -207,7 +242,7 @@ def test_mixed_protocol_one_predicted_one_match(self): COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: None, COLUMNS.protocol: PROTOCOL.GENOME, - COLUMNS.transcript1: self.ust1.name, + COLUMNS.transcript1: unspliced_transcript1.name, COLUMNS.transcript2: None, }, ) @@ -220,21 +255,23 @@ def test_mixed_protocol_one_predicted_one_match(self): COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: None, COLUMNS.protocol: PROTOCOL.TRANS, - COLUMNS.transcript1: self.ust1.name, + COLUMNS.transcript1: unspliced_transcript1.name, COLUMNS.transcript2: None, }, ) - self.assertTrue(pairing.equivalent(genome_ev, trans_ev, self.TRANSCRIPTS)) - self.assertTrue(pairing.equivalent(trans_ev, genome_ev, self.TRANSCRIPTS)) + assert pairing.equivalent(genome_ev, trans_ev, transcripts) + assert pairing.equivalent(trans_ev, genome_ev, transcripts) - genome_ev.data[COLUMNS.transcript2] = self.ust1.name + genome_ev.data[COLUMNS.transcript2] = unspliced_transcript1.name genome_ev.data[COLUMNS.transcript1] = None - 
trans_ev.data[COLUMNS.transcript2] = self.ust1.name + trans_ev.data[COLUMNS.transcript2] = unspliced_transcript1.name trans_ev.data[COLUMNS.transcript1] = None - self.assertTrue(pairing.inferred_equivalent(genome_ev, trans_ev, self.TRANSCRIPTS)) - self.assertTrue(pairing.inferred_equivalent(trans_ev, genome_ev, self.TRANSCRIPTS)) + assert pairing.inferred_equivalent(genome_ev, trans_ev, transcripts) + assert pairing.inferred_equivalent(trans_ev, genome_ev, transcripts) - def test_mixed_protocol_one_predicted_one_mismatch(self): + def test_mixed_protocol_one_predicted_one_mismatch( + self, genomic_event1, genomic_event2, transcripts, unspliced_transcript1 + ): genome_ev = BreakpointPair( Breakpoint('1', 350, orient=ORIENT.LEFT), Breakpoint('1', 400, orient=ORIENT.RIGHT), @@ -244,7 +281,7 @@ def test_mixed_protocol_one_predicted_one_mismatch(self): COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: None, COLUMNS.protocol: PROTOCOL.GENOME, - COLUMNS.transcript1: self.ust1.name, + COLUMNS.transcript1: unspliced_transcript1.name, COLUMNS.transcript2: None, }, ) @@ -257,19 +294,19 @@ def test_mixed_protocol_one_predicted_one_mismatch(self): COLUMNS.call_method: CALL_METHOD.CONTIG, COLUMNS.fusion_sequence_fasta_id: None, COLUMNS.protocol: PROTOCOL.TRANS, - COLUMNS.transcript1: self.ust1.name, + COLUMNS.transcript1: unspliced_transcript1.name, COLUMNS.transcript2: None, }, ) - self.assertTrue(pairing.equivalent(genome_ev, trans_ev, self.TRANSCRIPTS)) - self.assertTrue(pairing.equivalent(trans_ev, genome_ev, self.TRANSCRIPTS)) + assert pairing.equivalent(genome_ev, trans_ev, transcripts) + assert pairing.equivalent(trans_ev, genome_ev, transcripts) - genome_ev.data[COLUMNS.transcript2] = self.ust1.name + genome_ev.data[COLUMNS.transcript2] = unspliced_transcript1.name genome_ev.data[COLUMNS.transcript1] = None - trans_ev.data[COLUMNS.transcript2] = self.ust1.name + trans_ev.data[COLUMNS.transcript2] = unspliced_transcript1.name 
trans_ev.data[COLUMNS.transcript1] = None - self.assertTrue(pairing.inferred_equivalent(genome_ev, trans_ev, self.TRANSCRIPTS)) - self.assertTrue(pairing.inferred_equivalent(trans_ev, genome_ev, self.TRANSCRIPTS)) + assert pairing.inferred_equivalent(genome_ev, trans_ev, transcripts) + assert pairing.inferred_equivalent(trans_ev, genome_ev, transcripts) def test_mixed_protocol_both_predicted(self): @@ -288,101 +325,107 @@ def test_transcriptome_protocol(self): raise unittest.SkipTest('TODO') -class TestBreakpointPrediction(unittest.TestCase): - def setUp(self): - self.pre_transcript = PreTranscript([(101, 200), (301, 400), (501, 600)], strand=STRAND.POS) - self.n_ust = PreTranscript([(101, 200), (301, 400), (501, 600)], strand=STRAND.NEG) +@pytest.fixture +def positive_transcript(): + return PreTranscript([(101, 200), (301, 400), (501, 600)], strand=STRAND.POS) + + +@pytest.fixture +def negative_transcript(): + return PreTranscript([(101, 200), (301, 400), (501, 600)], strand=STRAND.NEG) + - def test_exonic_five_prime(self): +class TestBreakpointPrediction: + def test_exonic_five_prime(self, positive_transcript): b = Breakpoint('1', 350, orient=ORIENT.LEFT) - breaks = pairing.predict_transcriptome_breakpoint(b, self.pre_transcript) - self.assertEqual(2, len(breaks)) - self.assertEqual(200, breaks[0].start) - self.assertEqual(b, breaks[1]) + breaks = pairing.predict_transcriptome_breakpoint(b, positive_transcript) + assert len(breaks) == 2 + assert breaks[0].start == 200 + assert breaks[1] == b - def test_exonic_five_prime_first_exon(self): + def test_exonic_five_prime_first_exon(self, positive_transcript): b = Breakpoint('1', 150, orient=ORIENT.LEFT) - breaks = pairing.predict_transcriptome_breakpoint(b, self.pre_transcript) - self.assertEqual(1, len(breaks)) - self.assertEqual(b, breaks[0]) + breaks = pairing.predict_transcriptome_breakpoint(b, positive_transcript) + assert len(breaks) == 1 + assert breaks[0] == b - def test_exonic_three_prime(self): + def 
test_exonic_three_prime(self, positive_transcript): b = Breakpoint('1', 350, orient=ORIENT.RIGHT) - breaks = pairing.predict_transcriptome_breakpoint(b, self.pre_transcript) - self.assertEqual(2, len(breaks)) - self.assertEqual(501, breaks[1].start) - self.assertEqual(b, breaks[0]) + breaks = pairing.predict_transcriptome_breakpoint(b, positive_transcript) + assert len(breaks) == 2 + assert breaks[1].start == 501 + assert breaks[0] == b - def test_exonic_three_prime_last_exon(self): + def test_exonic_three_prime_last_exon(self, positive_transcript): b = Breakpoint('1', 550, orient=ORIENT.RIGHT) - breaks = pairing.predict_transcriptome_breakpoint(b, self.pre_transcript) - self.assertEqual(1, len(breaks)) - self.assertEqual(b, breaks[0]) + breaks = pairing.predict_transcriptome_breakpoint(b, positive_transcript) + assert len(breaks) == 1 + assert breaks[0] == b - def test_intronic_five_prime(self): + def test_intronic_five_prime(self, positive_transcript): b = Breakpoint('1', 450, orient=ORIENT.LEFT) - breaks = pairing.predict_transcriptome_breakpoint(b, self.pre_transcript) - self.assertEqual(1, len(breaks)) - self.assertEqual(400, breaks[0].start) + breaks = pairing.predict_transcriptome_breakpoint(b, positive_transcript) + assert len(breaks) == 1 + assert breaks[0].start == 400 - def test_intronic_three_prime(self): + def test_intronic_three_prime(self, positive_transcript): b = Breakpoint('1', 250, orient=ORIENT.RIGHT) - breaks = pairing.predict_transcriptome_breakpoint(b, self.pre_transcript) - self.assertEqual(1, len(breaks)) - self.assertEqual(301, breaks[0].start) + breaks = pairing.predict_transcriptome_breakpoint(b, positive_transcript) + assert len(breaks) == 1 + assert breaks[0].start == 301 - def test_outside_transcript(self): + def test_outside_transcript(self, positive_transcript): b = Breakpoint('1', 100, orient=ORIENT.RIGHT) - with self.assertRaises(AssertionError): - pairing.predict_transcriptome_breakpoint(b, self.pre_transcript) + with 
pytest.raises(AssertionError): + pairing.predict_transcriptome_breakpoint(b, positive_transcript) # for neg transcripts - def test_exonic_three_prime_neg(self): + def test_exonic_three_prime_neg(self, negative_transcript): b = Breakpoint('1', 350, orient=ORIENT.LEFT, strand=STRAND.NEG) - breaks = pairing.predict_transcriptome_breakpoint(b, self.n_ust) - self.assertEqual(2, len(breaks)) - self.assertEqual(200, breaks[0].start) - self.assertEqual(b, breaks[1]) + breaks = pairing.predict_transcriptome_breakpoint(b, negative_transcript) + assert len(breaks) == 2 + assert breaks[0].start == 200 + assert breaks[1] == b - def test_intronic_three_prime_neg(self): + def test_intronic_three_prime_neg(self, negative_transcript): b = Breakpoint('1', 450, orient=ORIENT.LEFT, strand=STRAND.NEG) - breaks = pairing.predict_transcriptome_breakpoint(b, self.n_ust) - self.assertEqual(1, len(breaks)) - self.assertEqual(400, breaks[0].start) + breaks = pairing.predict_transcriptome_breakpoint(b, negative_transcript) + assert len(breaks) == 1 + assert breaks[0].start == 400 - def test_exonic_five_prime_neg_first_exon(self): + def test_exonic_five_prime_neg_first_exon(self, negative_transcript): b = Breakpoint('1', 150, orient=ORIENT.LEFT) - breaks = pairing.predict_transcriptome_breakpoint(b, self.n_ust) - self.assertEqual(1, len(breaks)) - self.assertEqual(b, breaks[0]) + breaks = pairing.predict_transcriptome_breakpoint(b, negative_transcript) + assert len(breaks) == 1 + assert breaks[0] == b - def test_exonic_three_prime_neg_first_exon(self): + def test_exonic_three_prime_neg_first_exon(self, negative_transcript): b = Breakpoint('1', 150, orient=ORIENT.LEFT) - breaks = pairing.predict_transcriptome_breakpoint(b, self.n_ust) - self.assertEqual(1, len(breaks)) - self.assertEqual(b, breaks[0]) + breaks = pairing.predict_transcriptome_breakpoint(b, negative_transcript) + assert len(breaks) == 1 + assert breaks[0] == b - def test_exonic_five_prime_neg(self): + def 
test_exonic_five_prime_neg(self, negative_transcript): b = Breakpoint('1', 350, orient=ORIENT.RIGHT) - breaks = pairing.predict_transcriptome_breakpoint(b, self.n_ust) - self.assertEqual(2, len(breaks)) - self.assertEqual(501, breaks[1].start) - self.assertEqual(b, breaks[0]) + breaks = pairing.predict_transcriptome_breakpoint(b, negative_transcript) + assert len(breaks) == 2 + assert breaks[1].start == 501 + assert breaks[0] == b - def test_exonic_five_prime_neg_last_exon(self): + def test_exonic_five_prime_neg_last_exon(self, negative_transcript): b = Breakpoint('1', 550, orient=ORIENT.RIGHT) - breaks = pairing.predict_transcriptome_breakpoint(b, self.n_ust) - self.assertEqual(1, len(breaks)) - self.assertEqual(b, breaks[0]) + breaks = pairing.predict_transcriptome_breakpoint(b, negative_transcript) + assert len(breaks) == 1 + assert breaks[0] == b - def test_intronic_five_prime_neg(self): + def test_intronic_five_prime_neg(self, negative_transcript): b = Breakpoint('1', 250, orient=ORIENT.RIGHT) - breaks = pairing.predict_transcriptome_breakpoint(b, self.n_ust) - self.assertEqual(1, len(breaks)) - self.assertEqual(301, breaks[0].start) + breaks = pairing.predict_transcriptome_breakpoint(b, negative_transcript) + assert len(breaks) == 1 + assert breaks[0].start == 301 -class TestEquivalent(unittest.TestCase): +class TestEquivalent: def test_useq_uncertainty(self): event1 = BreakpointPair( Breakpoint('1', 157540650, orient='L'), @@ -397,7 +440,7 @@ def test_useq_uncertainty(self): event_type='deletion', call_method='spanning reads', ) - self.assertTrue(pairing.equivalent(event1, event2)) + assert pairing.equivalent(event1, event2) def test_useq_uncertainty2(self): event1 = BreakpointPair( @@ -414,4 +457,4 @@ def test_useq_uncertainty2(self): call_method='contig', untemplated_seq='TTTTTTTTT', ) - self.assertTrue(pairing.equivalent(event1, event2)) + assert pairing.equivalent(event1, event2) diff --git a/tests/integration/test_splicing.py 
b/tests/integration/test_splicing.py index a80adb11..471c2bf1 100644 --- a/tests/integration/test_splicing.py +++ b/tests/integration/test_splicing.py @@ -1,8 +1,7 @@ -import os -import unittest +import argparse +import pytest from mavis.annotate.constants import SPLICE_SITE_RADIUS -from mavis.annotate.file_io import load_annotations, load_reference_genome from mavis.annotate.genomic import Exon, PreTranscript from mavis.annotate.splicing import predict_splice_sites from mavis.annotate.variant import annotate_events @@ -10,7 +9,7 @@ from mavis.constants import PROTOCOL, SPLICE_TYPE, STRAND, SVTYPE, reverse_complement from mavis.interval import Interval -from . import DATA_DIR, MockLongString, MockObject, get_example_genes +from . import MockLongString, MockObject, get_example_genes EXAMPLE_GENES = None @@ -20,340 +19,316 @@ def setUpModule(): EXAMPLE_GENES = get_example_genes() -class TestSplicingPatterns(unittest.TestCase): - def setUp(self): - self.setup_by_strand(STRAND.POS) - - def setup_by_strand(self, strand): - self.ex1 = Exon(100, 199, strand=strand) # C - self.ex2 = Exon(500, 599, strand=strand) # G - self.ex3 = Exon(1200, 1299, strand=strand) # T - self.ex4 = Exon(1500, 1599, strand=strand) # C - self.ex5 = Exon(1700, 1799, strand=strand) # G - self.ex6 = Exon(2000, 2099, strand=strand) # C - # introns: 99, 300, 600, 200, 100, ... 
- reference_sequence = 'a' * 99 + 'C' * 100 + 'a' * 300 + 'G' * 100 - reference_sequence += 'a' * 600 + 'T' * 100 + 'a' * 200 + 'C' * 100 - reference_sequence += 'a' * 100 + 'G' * 100 + 'a' * 200 + 'C' * 100 - self.reference_sequence = reference_sequence - self.pre_transcript = PreTranscript( - exons=[self.ex1, self.ex2, self.ex3, self.ex4, self.ex5, self.ex6], strand=strand - ) - +@pytest.fixture +def neg_splicing_pattern(): + n = argparse.Namespace() + n.ex1 = Exon(100, 199, strand=STRAND.NEG) # C + n.ex2 = Exon(500, 599, strand=STRAND.NEG) # G + n.ex3 = Exon(1200, 1299, strand=STRAND.NEG) # T + n.ex4 = Exon(1500, 1599, strand=STRAND.NEG) # C + n.ex5 = Exon(1700, 1799, strand=STRAND.NEG) # G + n.ex6 = Exon(2000, 2099, strand=STRAND.NEG) # C + # introns: 99, 300, 600, 200, 100, ... + reference_sequence = 'a' * 99 + 'C' * 100 + 'a' * 300 + 'G' * 100 + reference_sequence += 'a' * 600 + 'T' * 100 + 'a' * 200 + 'C' * 100 + reference_sequence += 'a' * 100 + 'G' * 100 + 'a' * 200 + 'C' * 100 + n.reference_sequence = reference_sequence + n.pre_transcript = PreTranscript( + exons=[n.ex1, n.ex2, n.ex3, n.ex4, n.ex5, n.ex6], strand=STRAND.NEG + ) + return n + + +@pytest.fixture +def pos_splicing_pattern(): + n = argparse.Namespace() + n.ex1 = Exon(100, 199, strand=STRAND.POS) # C + n.ex2 = Exon(500, 599, strand=STRAND.POS) # G + n.ex3 = Exon(1200, 1299, strand=STRAND.POS) # T + n.ex4 = Exon(1500, 1599, strand=STRAND.POS) # C + n.ex5 = Exon(1700, 1799, strand=STRAND.POS) # G + n.ex6 = Exon(2000, 2099, strand=STRAND.POS) # C + # introns: 99, 300, 600, 200, 100, ... 
+ reference_sequence = 'a' * 99 + 'C' * 100 + 'a' * 300 + 'G' * 100 + reference_sequence += 'a' * 600 + 'T' * 100 + 'a' * 200 + 'C' * 100 + reference_sequence += 'a' * 100 + 'G' * 100 + 'a' * 200 + 'C' * 100 + n.reference_sequence = reference_sequence + n.pre_transcript = PreTranscript( + exons=[n.ex1, n.ex2, n.ex3, n.ex4, n.ex5, n.ex6], strand=STRAND.POS + ) + return n + + +class TestSplicingPatterns: def test_single_exon(self): t = PreTranscript([(3, 4)], strand=STRAND.POS) patt = t.generate_splicing_patterns() - self.assertEqual(1, len(patt)) - self.assertEqual(0, len(patt[0])) - self.assertEqual(SPLICE_TYPE.NORMAL, patt[0].splice_type) - - def test_normal_pattern_pos(self): - patt = self.pre_transcript.generate_splicing_patterns() - self.assertEqual(1, len(patt)) - self.assertEqual( - [ - self.ex1.end, - self.ex2.start, - self.ex2.end, - self.ex3.start, - self.ex3.end, - self.ex4.start, - self.ex4.end, - self.ex5.start, - self.ex5.end, - self.ex6.start, - ], - [s.pos for s in patt[0]], - ) - self.assertEqual(SPLICE_TYPE.NORMAL, patt[0].splice_type) - - def test_normal_pattern_neg(self): - self.setup_by_strand(STRAND.NEG) - self.assertTrue(self.pre_transcript.is_reverse) - patt = self.pre_transcript.generate_splicing_patterns() - self.assertEqual(1, len(patt)) - self.assertEqual( - [ - self.ex1.end, - self.ex2.start, - self.ex2.end, - self.ex3.start, - self.ex3.end, - self.ex4.start, - self.ex4.end, - self.ex5.start, - self.ex5.end, - self.ex6.start, - ], - sorted([s.pos for s in patt[0]]), - ) - self.assertEqual(SPLICE_TYPE.NORMAL, patt[0].splice_type) - - def test_abrogate_a_pos(self): - self.ex2.start_splice_site.intact = False - patt = self.pre_transcript.generate_splicing_patterns() - self.assertEqual(2, len(patt)) - - self.assertEqual( - [ - self.ex1.end, - self.ex3.start, - self.ex3.end, - self.ex4.start, - self.ex4.end, - self.ex5.start, - self.ex5.end, - self.ex6.start, - ], - [s.pos for s in patt[0]], - ) - self.assertEqual(SPLICE_TYPE.SKIP, 
patt[0].splice_type) - - self.assertEqual( - [ - self.ex2.end, - self.ex3.start, - self.ex3.end, - self.ex4.start, - self.ex4.end, - self.ex5.start, - self.ex5.end, - self.ex6.start, - ], - [s.pos for s in patt[1]], - ) - self.assertEqual(SPLICE_TYPE.RETAIN, patt[1].splice_type) - - def test_abrogate_a_neg(self): - self.setup_by_strand(STRAND.NEG) - self.ex2.start_splice_site.intact = False - patt = sorted(self.pre_transcript.generate_splicing_patterns()) - self.assertEqual(2, len(patt)) - self.assertEqual( - [ - self.ex1.end, - self.ex3.start, - self.ex3.end, - self.ex4.start, - self.ex4.end, - self.ex5.start, - self.ex5.end, - self.ex6.start, - ], - sorted([s.pos for s in patt[0]]), - ) - self.assertEqual(SPLICE_TYPE.SKIP, patt[0].splice_type) - self.assertEqual( - [ - self.ex2.end, - self.ex3.start, - self.ex3.end, - self.ex4.start, - self.ex4.end, - self.ex5.start, - self.ex5.end, - self.ex6.start, - ], - sorted([s.pos for s in patt[1]]), - ) - self.assertEqual(SPLICE_TYPE.RETAIN, patt[1].splice_type) - - def test_abrogate_a_last_exon(self): - self.ex6.start_splice_site.intact = False - patt = self.pre_transcript.generate_splicing_patterns() - self.assertEqual(1, len(patt)) - self.assertEqual( - [ - self.ex1.end, - self.ex2.start, - self.ex2.end, - self.ex3.start, - self.ex3.end, - self.ex4.start, - self.ex4.end, - self.ex5.start, - ], - [s.pos for s in patt[0]], - ) - self.assertEqual(SPLICE_TYPE.RETAIN, patt[0].splice_type) - - def test_abrogate_d_first_exon(self): - self.ex1.end_splice_site.intact = False - patt = self.pre_transcript.generate_splicing_patterns() - self.assertEqual(1, len(patt)) - self.assertEqual( - [ - self.ex2.end, - self.ex3.start, - self.ex3.end, - self.ex4.start, - self.ex4.end, - self.ex5.start, - self.ex5.end, - self.ex6.start, - ], - [s.pos for s in patt[0]], - ) - self.assertEqual(SPLICE_TYPE.RETAIN, patt[0].splice_type) - - def test_abrogate_ad(self): - self.ex2.start_splice_site.intact = False - patt = 
self.pre_transcript.generate_splicing_patterns() - self.assertEqual(2, len(patt)) - self.assertEqual( - [ - self.ex1.end, - self.ex3.start, - self.ex3.end, - self.ex4.start, - self.ex4.end, - self.ex5.start, - self.ex5.end, - self.ex6.start, - ], - [s.pos for s in patt[0]], - ) - self.assertEqual(SPLICE_TYPE.SKIP, patt[0].splice_type) - - self.assertEqual( - [ - self.ex2.end, - self.ex3.start, - self.ex3.end, - self.ex4.start, - self.ex4.end, - self.ex5.start, - self.ex5.end, - self.ex6.start, - ], - [s.pos for s in patt[1]], - ) - self.assertEqual(SPLICE_TYPE.RETAIN, patt[1].splice_type) - - def test_abrogate_da(self): - self.ex2.end_splice_site.intact = False - self.ex3.start_splice_site.intact = False - patt = self.pre_transcript.generate_splicing_patterns() - self.assertEqual(1, len(patt)) - self.assertEqual( - [ - self.ex1.end, - self.ex2.start, - self.ex3.end, - self.ex4.start, - self.ex4.end, - self.ex5.start, - self.ex5.end, - self.ex6.start, - ], - [s.pos for s in patt[0]], - ) - self.assertEqual(SPLICE_TYPE.RETAIN, patt[0].splice_type) - - def test_multiple_exons_or_multiple_introns_abrogate_ada(self): - self.ex2.start_splice_site.intact = False - self.ex2.end_splice_site.intact = False - self.ex3.start_splice_site.intact = False - patt = self.pre_transcript.generate_splicing_patterns() - self.assertEqual(2, len(patt)) - - self.assertEqual( - [ - self.ex1.end, - self.ex4.start, - self.ex4.end, - self.ex5.start, - self.ex5.end, - self.ex6.start, - ], - [s.pos for s in patt[0]], - ) - self.assertEqual(SPLICE_TYPE.MULTI_SKIP, patt[0].splice_type) - - self.assertEqual( - [ - self.ex3.end, - self.ex4.start, - self.ex4.end, - self.ex5.start, - self.ex5.end, - self.ex6.start, - ], - [s.pos for s in patt[1]], - ) - self.assertEqual(SPLICE_TYPE.MULTI_RETAIN, patt[1].splice_type) - - def test_multiple_exons_or_multiple_introns_abrogate_dad(self): - self.ex2.end_splice_site.intact = False - self.ex3.start_splice_site.intact = False - self.ex3.end_splice_site.intact 
= False - patt = self.pre_transcript.generate_splicing_patterns() - self.assertEqual(2, len(patt)) - - self.assertEqual( - [ - self.ex1.end, - self.ex2.start, - self.ex4.end, - self.ex5.start, - self.ex5.end, - self.ex6.start, - ], - [s.pos for s in patt[0]], - ) - self.assertEqual(SPLICE_TYPE.MULTI_RETAIN, patt[0].splice_type) - - self.assertEqual( - [ - self.ex1.end, - self.ex4.start, - self.ex4.end, - self.ex5.start, - self.ex5.end, - self.ex6.start, - ], - [s.pos for s in patt[1]], - ) - self.assertEqual(SPLICE_TYPE.MULTI_SKIP, patt[1].splice_type) - - def test_complex(self): - self.ex2.end_splice_site.intact = False - self.ex4.end_splice_site.intact = False - patt = self.pre_transcript.generate_splicing_patterns() - self.assertEqual(4, len(patt)) - self.assertTrue(SPLICE_TYPE.COMPLEX in [p.splice_type for p in patt]) - - -class TestExonSpliceSites(unittest.TestCase): + assert len(patt) == 1 + assert len(patt[0]) == 0 + assert patt[0].splice_type == SPLICE_TYPE.NORMAL + + def test_normal_pattern_pos(self, pos_splicing_pattern): + patt = pos_splicing_pattern.pre_transcript.generate_splicing_patterns() + assert len(patt) == 1 + assert [s.pos for s in patt[0]] == [ + pos_splicing_pattern.ex1.end, + pos_splicing_pattern.ex2.start, + pos_splicing_pattern.ex2.end, + pos_splicing_pattern.ex3.start, + pos_splicing_pattern.ex3.end, + pos_splicing_pattern.ex4.start, + pos_splicing_pattern.ex4.end, + pos_splicing_pattern.ex5.start, + pos_splicing_pattern.ex5.end, + pos_splicing_pattern.ex6.start, + ] + assert patt[0].splice_type == SPLICE_TYPE.NORMAL + + def test_normal_pattern_neg(self, neg_splicing_pattern): + assert neg_splicing_pattern.pre_transcript.is_reverse + patt = neg_splicing_pattern.pre_transcript.generate_splicing_patterns() + assert len(patt) == 1 + assert sorted([s.pos for s in patt[0]]) == [ + neg_splicing_pattern.ex1.end, + neg_splicing_pattern.ex2.start, + neg_splicing_pattern.ex2.end, + neg_splicing_pattern.ex3.start, + neg_splicing_pattern.ex3.end, + 
neg_splicing_pattern.ex4.start, + neg_splicing_pattern.ex4.end, + neg_splicing_pattern.ex5.start, + neg_splicing_pattern.ex5.end, + neg_splicing_pattern.ex6.start, + ] + assert patt[0].splice_type == SPLICE_TYPE.NORMAL + + def test_abrogate_a_pos(self, pos_splicing_pattern): + pos_splicing_pattern.ex2.start_splice_site.intact = False + patt = pos_splicing_pattern.pre_transcript.generate_splicing_patterns() + assert len(patt) == 2 + + assert [s.pos for s in patt[0]] == [ + pos_splicing_pattern.ex1.end, + pos_splicing_pattern.ex3.start, + pos_splicing_pattern.ex3.end, + pos_splicing_pattern.ex4.start, + pos_splicing_pattern.ex4.end, + pos_splicing_pattern.ex5.start, + pos_splicing_pattern.ex5.end, + pos_splicing_pattern.ex6.start, + ] + assert patt[0].splice_type == SPLICE_TYPE.SKIP + + assert [s.pos for s in patt[1]] == [ + pos_splicing_pattern.ex2.end, + pos_splicing_pattern.ex3.start, + pos_splicing_pattern.ex3.end, + pos_splicing_pattern.ex4.start, + pos_splicing_pattern.ex4.end, + pos_splicing_pattern.ex5.start, + pos_splicing_pattern.ex5.end, + pos_splicing_pattern.ex6.start, + ] + assert patt[1].splice_type == SPLICE_TYPE.RETAIN + + def test_abrogate_a_neg(self, neg_splicing_pattern): + neg_splicing_pattern.ex2.start_splice_site.intact = False + patt = sorted(neg_splicing_pattern.pre_transcript.generate_splicing_patterns()) + assert len(patt) == 2 + assert sorted([s.pos for s in patt[0]]) == [ + neg_splicing_pattern.ex1.end, + neg_splicing_pattern.ex3.start, + neg_splicing_pattern.ex3.end, + neg_splicing_pattern.ex4.start, + neg_splicing_pattern.ex4.end, + neg_splicing_pattern.ex5.start, + neg_splicing_pattern.ex5.end, + neg_splicing_pattern.ex6.start, + ] + assert patt[0].splice_type == SPLICE_TYPE.SKIP + assert sorted([s.pos for s in patt[1]]) == [ + neg_splicing_pattern.ex2.end, + neg_splicing_pattern.ex3.start, + neg_splicing_pattern.ex3.end, + neg_splicing_pattern.ex4.start, + neg_splicing_pattern.ex4.end, + neg_splicing_pattern.ex5.start, + 
neg_splicing_pattern.ex5.end, + neg_splicing_pattern.ex6.start, + ] + assert patt[1].splice_type == SPLICE_TYPE.RETAIN + + def test_abrogate_a_last_exon(self, pos_splicing_pattern): + pos_splicing_pattern.ex6.start_splice_site.intact = False + patt = pos_splicing_pattern.pre_transcript.generate_splicing_patterns() + assert len(patt) == 1 + assert [s.pos for s in patt[0]] == [ + pos_splicing_pattern.ex1.end, + pos_splicing_pattern.ex2.start, + pos_splicing_pattern.ex2.end, + pos_splicing_pattern.ex3.start, + pos_splicing_pattern.ex3.end, + pos_splicing_pattern.ex4.start, + pos_splicing_pattern.ex4.end, + pos_splicing_pattern.ex5.start, + ] + assert patt[0].splice_type == SPLICE_TYPE.RETAIN + + def test_abrogate_d_first_exon(self, pos_splicing_pattern): + pos_splicing_pattern.ex1.end_splice_site.intact = False + patt = pos_splicing_pattern.pre_transcript.generate_splicing_patterns() + assert len(patt) == 1 + assert [s.pos for s in patt[0]] == [ + pos_splicing_pattern.ex2.end, + pos_splicing_pattern.ex3.start, + pos_splicing_pattern.ex3.end, + pos_splicing_pattern.ex4.start, + pos_splicing_pattern.ex4.end, + pos_splicing_pattern.ex5.start, + pos_splicing_pattern.ex5.end, + pos_splicing_pattern.ex6.start, + ] + assert patt[0].splice_type == SPLICE_TYPE.RETAIN + + def test_abrogate_ad(self, pos_splicing_pattern): + pos_splicing_pattern.ex2.start_splice_site.intact = False + patt = pos_splicing_pattern.pre_transcript.generate_splicing_patterns() + assert len(patt) == 2 + assert [s.pos for s in patt[0]] == [ + pos_splicing_pattern.ex1.end, + pos_splicing_pattern.ex3.start, + pos_splicing_pattern.ex3.end, + pos_splicing_pattern.ex4.start, + pos_splicing_pattern.ex4.end, + pos_splicing_pattern.ex5.start, + pos_splicing_pattern.ex5.end, + pos_splicing_pattern.ex6.start, + ] + assert patt[0].splice_type == SPLICE_TYPE.SKIP + + assert [s.pos for s in patt[1]] == [ + pos_splicing_pattern.ex2.end, + pos_splicing_pattern.ex3.start, + pos_splicing_pattern.ex3.end, + 
pos_splicing_pattern.ex4.start, + pos_splicing_pattern.ex4.end, + pos_splicing_pattern.ex5.start, + pos_splicing_pattern.ex5.end, + pos_splicing_pattern.ex6.start, + ] + assert patt[1].splice_type == SPLICE_TYPE.RETAIN + + def test_abrogate_da(self, pos_splicing_pattern): + pos_splicing_pattern.ex2.end_splice_site.intact = False + pos_splicing_pattern.ex3.start_splice_site.intact = False + patt = pos_splicing_pattern.pre_transcript.generate_splicing_patterns() + assert len(patt) == 1 + assert [s.pos for s in patt[0]] == [ + pos_splicing_pattern.ex1.end, + pos_splicing_pattern.ex2.start, + pos_splicing_pattern.ex3.end, + pos_splicing_pattern.ex4.start, + pos_splicing_pattern.ex4.end, + pos_splicing_pattern.ex5.start, + pos_splicing_pattern.ex5.end, + pos_splicing_pattern.ex6.start, + ] + assert patt[0].splice_type == SPLICE_TYPE.RETAIN + + def test_multiple_exons_or_multiple_introns_abrogate_ada(self, pos_splicing_pattern): + pos_splicing_pattern.ex2.start_splice_site.intact = False + pos_splicing_pattern.ex2.end_splice_site.intact = False + pos_splicing_pattern.ex3.start_splice_site.intact = False + patt = pos_splicing_pattern.pre_transcript.generate_splicing_patterns() + assert len(patt) == 2 + + assert [s.pos for s in patt[0]] == [ + pos_splicing_pattern.ex1.end, + pos_splicing_pattern.ex4.start, + pos_splicing_pattern.ex4.end, + pos_splicing_pattern.ex5.start, + pos_splicing_pattern.ex5.end, + pos_splicing_pattern.ex6.start, + ] + assert patt[0].splice_type == SPLICE_TYPE.MULTI_SKIP + + assert [s.pos for s in patt[1]] == [ + pos_splicing_pattern.ex3.end, + pos_splicing_pattern.ex4.start, + pos_splicing_pattern.ex4.end, + pos_splicing_pattern.ex5.start, + pos_splicing_pattern.ex5.end, + pos_splicing_pattern.ex6.start, + ] + assert patt[1].splice_type == SPLICE_TYPE.MULTI_RETAIN + + def test_multiple_exons_or_multiple_introns_abrogate_dad(self, pos_splicing_pattern): + pos_splicing_pattern.ex2.end_splice_site.intact = False + 
pos_splicing_pattern.ex3.start_splice_site.intact = False + pos_splicing_pattern.ex3.end_splice_site.intact = False + patt = pos_splicing_pattern.pre_transcript.generate_splicing_patterns() + assert len(patt) == 2 + + assert [s.pos for s in patt[0]] == [ + pos_splicing_pattern.ex1.end, + pos_splicing_pattern.ex2.start, + pos_splicing_pattern.ex4.end, + pos_splicing_pattern.ex5.start, + pos_splicing_pattern.ex5.end, + pos_splicing_pattern.ex6.start, + ] + assert patt[0].splice_type == SPLICE_TYPE.MULTI_RETAIN + + assert [s.pos for s in patt[1]] == [ + pos_splicing_pattern.ex1.end, + pos_splicing_pattern.ex4.start, + pos_splicing_pattern.ex4.end, + pos_splicing_pattern.ex5.start, + pos_splicing_pattern.ex5.end, + pos_splicing_pattern.ex6.start, + ] + assert patt[1].splice_type == SPLICE_TYPE.MULTI_SKIP + + def test_complex(self, pos_splicing_pattern): + pos_splicing_pattern.ex2.end_splice_site.intact = False + pos_splicing_pattern.ex4.end_splice_site.intact = False + patt = pos_splicing_pattern.pre_transcript.generate_splicing_patterns() + assert len(patt) == 4 + assert SPLICE_TYPE.COMPLEX in [p.splice_type for p in patt] + + +class TestExonSpliceSites: def test_end_splice_site(self): e = Exon(100, 199, strand=STRAND.POS) - self.assertEqual(2, SPLICE_SITE_RADIUS) - self.assertEqual(Interval(198, 201), e.end_splice_site) + assert SPLICE_SITE_RADIUS == 2 + print(e.end_splice_site) + assert Interval(198, 201) == e.end_splice_site def test_start_splice_site(self): e = Exon(100, 199, strand=STRAND.POS) - self.assertEqual(2, SPLICE_SITE_RADIUS) - self.assertEqual(Interval(98, 101), e.start_splice_site) + assert SPLICE_SITE_RADIUS == 2 + print(e.start_splice_site) + assert Interval(98, 101) == e.start_splice_site -class TestPredictSpliceSites(unittest.TestCase): +class TestPredictSpliceSites: def test_gimap4(self): gimap4 = EXAMPLE_GENES['GIMAP4'] donors = predict_splice_sites(gimap4.seq) for d in donors: print(d) - self.assertEqual(5, len(donors)) + assert len(donors) == 5 
def test_gimap4_reverse(self): gimap4 = EXAMPLE_GENES['GIMAP4'] gimap4_seq = reverse_complement(gimap4.seq) donors = predict_splice_sites(gimap4_seq, True) for d in donors: - self.assertEqual(d.seq, gimap4_seq[d.start - 1 : d.end]) - self.assertEqual(5, len(donors)) + assert gimap4_seq[d.start - 1 : d.end] == d.seq + assert len(donors) == 5 + @pytest.mark.skip(reason='TODO: dependent functionality not yet implemented') def test_fusion_with_novel_splice_site(self): - raise unittest.SkipTest('TODO: dependent functionality not yet implemented') bpp = BreakpointPair( Breakpoint('7', 150268089, 150268089, 'L', '+'), Breakpoint('8', 79715940, 79715940, 'L', '-'), @@ -368,7 +343,7 @@ def test_fusion_with_novel_splice_site(self): il7.chr: MockObject(seq=MockLongString(il7.seq, offset=il7.start - 1)), } annotations = annotate_events([bpp], {gimap4.chr: [gimap4], il7.chr: [il7]}, ref_genome) - self.assertEqual(1, len(annotations)) + assert len(annotations) == 1 ann = annotations[0] print(ann, ann.transcript1, ann.transcript2) print(ann.fusion) @@ -378,4 +353,4 @@ def test_fusion_with_novel_splice_site(self): ) for ex in ann.fusion.transcripts[0].exons: print(ex, len(ex)) - self.assertTrue(False) + assert False diff --git a/tests/integration/test_validate.py b/tests/integration/test_validate.py index 3458aa68..8f0d8471 100644 --- a/tests/integration/test_validate.py +++ b/tests/integration/test_validate.py @@ -1,5 +1,4 @@ -import unittest - +import pytest from mavis.annotate.file_io import load_reference_genome from mavis.bam import cigar as _cigar from mavis.bam.cache import BamCache @@ -10,8 +9,8 @@ from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence -from ..util import get_data -from . import RUN_FULL, MockLongString, MockObject, MockRead, mock_read_pair +from ..util import get_data, long_running_test +from . 
import MockLongString, MockObject, MockRead, mock_read_pair REFERENCE_GENOME = None @@ -42,10 +41,8 @@ def setUpModule(): # add a check to determine if it is the expected bam file -@unittest.skipIf( - not RUN_FULL, 'slower tests will not be run unless the environment variable RUN_FULL is given' -) -class TestFullEvidenceGathering(unittest.TestCase): +@long_running_test +class TestFullEvidenceGathering: # need to make the assertions more specific by checking the actual names of the reads found in each bin # rather than just the counts. def genome_evidence(self, break1, break2, opposing_strands): @@ -130,9 +127,9 @@ def test_load_evidence_translocation(self): ) ev1.load_evidence() print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) - self.assertEqual(14, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(20, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(21, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 14 + assert self.count_original_reads(ev1.split_reads[1]) == 20 + assert len(ev1.flanking_pairs) == 21 # second example ev1 = self.genome_evidence( @@ -142,10 +139,10 @@ def test_load_evidence_translocation(self): ) ev1.load_evidence() print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) - self.assertEqual(21, self.count_original_reads(ev1.split_reads[0])) + assert self.count_original_reads(ev1.split_reads[0]) == 21 # one of the reads that appears to look good in the bam is too low quality % match - self.assertEqual(40, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(57, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[1]) == 40 + assert len(ev1.flanking_pairs) == 57 def test_load_evidence_inversion(self): # first example @@ -157,9 +154,9 @@ def test_load_evidence_inversion(self): ev1.load_evidence() print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) - self.assertEqual(54, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(20, 
self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(104, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 54 + assert self.count_original_reads(ev1.split_reads[1]) == 20 + assert len(ev1.flanking_pairs) == 104 # second example ev1 = self.genome_evidence( @@ -169,9 +166,9 @@ def test_load_evidence_inversion(self): ) ev1.load_evidence() print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) - self.assertEqual(15, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(27, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(52, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[1]) == 15 + assert self.count_original_reads(ev1.split_reads[0]) == 27 + assert len(ev1.flanking_pairs) == 52 def test_load_evidence_duplication(self): ev1 = self.genome_evidence( @@ -182,9 +179,9 @@ def test_load_evidence_duplication(self): ev1.load_evidence() self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) - self.assertEqual(35, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(11, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(64, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 35 + assert self.count_original_reads(ev1.split_reads[1]) == 11 + assert len(ev1.flanking_pairs) == 64 def test_load_evidence_deletion1(self): # first example @@ -196,9 +193,9 @@ def test_load_evidence_deletion1(self): ev1.load_evidence() self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) - self.assertEqual(49, len(ev1.flanking_pairs)) - self.assertEqual(22, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(14, self.count_original_reads(ev1.split_reads[1])) + assert len(ev1.flanking_pairs) == 49 + assert self.count_original_reads(ev1.split_reads[0]) == 22 + assert self.count_original_reads(ev1.split_reads[1]) == 14 def test_load_evidence_deletion2(self): # second example @@ 
-210,9 +207,9 @@ def test_load_evidence_deletion2(self): ev1.load_evidence() self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) - self.assertEqual(4, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(10, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(27, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 4 + assert self.count_original_reads(ev1.split_reads[1]) == 10 + assert len(ev1.flanking_pairs) == 27 def test_load_evidence_deletion3(self): # third example @@ -224,9 +221,9 @@ def test_load_evidence_deletion3(self): ev1.load_evidence() self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) - self.assertEqual(8, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(9, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(26, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 8 + assert self.count_original_reads(ev1.split_reads[1]) == 9 + assert len(ev1.flanking_pairs) == 26 def test_load_evidence_deletion4(self): # forth example @@ -238,9 +235,9 @@ def test_load_evidence_deletion4(self): ev1.load_evidence() self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) - self.assertEqual(20, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(18, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(40, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 20 + assert self.count_original_reads(ev1.split_reads[1]) == 18 + assert len(ev1.flanking_pairs) == 40 def test_load_evidence_small_deletion1(self): # first example @@ -255,10 +252,10 @@ def test_load_evidence_small_deletion1(self): print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) print(len(ev1.spanning_reads)) - self.assertEqual(5, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(3, 
self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(20, len(ev1.spanning_reads)) - self.assertEqual(6, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 5 + assert self.count_original_reads(ev1.split_reads[1]) == 3 + assert len(ev1.spanning_reads) == 20 + assert len(ev1.flanking_pairs) == 6 def test_load_evidence_small_deletion2(self): # second example @@ -275,10 +272,10 @@ def test_load_evidence_small_deletion2(self): for read, mate in ev1.flanking_pairs: print(read.query_name) - self.assertEqual(27, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(52, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(19, len(ev1.spanning_reads)) - self.assertEqual(7, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 27 + assert self.count_original_reads(ev1.split_reads[1]) == 52 + assert len(ev1.spanning_reads) == 19 + assert len(ev1.flanking_pairs) == 7 def test_load_evidence_small_deletion_test1(self): ev1 = self.genome_evidence( @@ -294,10 +291,10 @@ def test_load_evidence_small_deletion_test1(self): for read, mate in ev1.flanking_pairs: print(read.query_name) - self.assertEqual(18, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(16, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(0, len(ev1.spanning_reads)) - self.assertEqual(22, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 18 + assert self.count_original_reads(ev1.split_reads[1]) == 16 + assert len(ev1.spanning_reads) == 0 + assert len(ev1.flanking_pairs) == 22 def test_load_evidence_small_deletion_test2(self): ev1 = self.genome_evidence( @@ -307,10 +304,10 @@ def test_load_evidence_small_deletion_test2(self): ) ev1.load_evidence() self.print_evidence(ev1) - self.assertEqual(20, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(18, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(0, len(ev1.spanning_reads)) - 
self.assertEqual(40, len(set(ev1.flanking_pairs))) + assert self.count_original_reads(ev1.split_reads[0]) == 20 + assert self.count_original_reads(ev1.split_reads[1]) == 18 + assert len(ev1.spanning_reads) == 0 + assert len(set(ev1.flanking_pairs)) == 40 def test_load_evidence_small_deletion_test3(self): ev1 = self.genome_evidence( @@ -326,10 +323,10 @@ def test_load_evidence_small_deletion_test3(self): for read in sorted(ev1.spanning_reads, key=lambda x: x.query_name): print(read.query_name) - self.assertEqual(27, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(5, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(0, len(ev1.spanning_reads)) - self.assertEqual(53, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 27 + assert self.count_original_reads(ev1.split_reads[1]) == 5 + assert len(ev1.spanning_reads) == 0 + assert len(ev1.flanking_pairs) == 53 def test_load_evidence_small_deletion_test4(self): ev1 = self.genome_evidence( @@ -345,10 +342,10 @@ def test_load_evidence_small_deletion_test4(self): print(self.count_original_reads(ev1.split_reads[0])) print(self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(33, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(6, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(0, len(ev1.spanning_reads)) - self.assertEqual(77, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 33 + assert self.count_original_reads(ev1.split_reads[1]) == 6 + assert len(ev1.spanning_reads) == 0 + assert len(ev1.flanking_pairs) == 77 def test_load_evidence_small_deletion_test5(self): ev1 = self.genome_evidence( @@ -364,10 +361,10 @@ def test_load_evidence_small_deletion_test5(self): print(self.count_original_reads(ev1.split_reads[0])) print(self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(19, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(11, 
self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(0, len(ev1.spanning_reads)) - self.assertEqual(48, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 19 + assert self.count_original_reads(ev1.split_reads[1]) == 11 + assert len(ev1.spanning_reads) == 0 + assert len(ev1.flanking_pairs) == 48 def test_load_evidence_small_deletion_test6(self): ev1 = self.genome_evidence( @@ -382,9 +379,9 @@ def test_load_evidence_small_deletion_test6(self): print(self.count_original_reads(ev1.split_reads[0])) print(self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(18, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(13, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(53, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 18 + assert self.count_original_reads(ev1.split_reads[1]) == 13 + assert len(ev1.flanking_pairs) == 53 def test_load_evidence_small_deletion_test7(self): ev1 = self.genome_evidence( @@ -400,9 +397,9 @@ def test_load_evidence_small_deletion_test7(self): print(self.count_original_reads(ev1.split_reads[0])) print(self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(39, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(13, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(49, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 39 + assert self.count_original_reads(ev1.split_reads[1]) == 13 + assert len(ev1.flanking_pairs) == 49 def test_load_evidence_small_deletion_test8(self): ev1 = self.genome_evidence( @@ -418,11 +415,11 @@ def test_load_evidence_small_deletion_test8(self): print(self.count_original_reads(ev1.split_reads[0])) print(self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(59, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(8, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(59, len(ev1.flanking_pairs)) + 
assert self.count_original_reads(ev1.split_reads[0]) == 59 + assert self.count_original_reads(ev1.split_reads[1]) == 8 + assert len(ev1.flanking_pairs) == 59 - @unittest.skip('skip because too complex') + @pytest.mark.skip(reason='skip because too complex') def test_load_evidence_complex_deletion(self): ev1 = self.genome_evidence( Breakpoint('reference12', 6001, orient=ORIENT.LEFT), @@ -440,12 +437,12 @@ def test_load_evidence_complex_deletion(self): for read in sorted(ev1.spanning_reads, key=lambda x: x.query_name): print(read.query_name) - self.assertEqual(76, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(83, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(1, len(ev1.spanning_reads)) - self.assertEqual(2, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 76 + assert self.count_original_reads(ev1.split_reads[1]) == 83 + assert len(ev1.spanning_reads) == 1 + assert len(ev1.flanking_pairs) == 2 - @unittest.skip('skip because high coverage') + @pytest.mark.skip(reason='skip because high coverage') def test_load_evidence_small_insertion(self): ev1 = self.genome_evidence( Breakpoint('reference1', 2000, orient=ORIENT.LEFT), @@ -460,12 +457,12 @@ def test_load_evidence_small_insertion(self): for read in sorted(ev1.spanning_reads, key=lambda x: x.query_name): print(read.query_name) - self.assertEqual(17, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(17, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(48, len(ev1.spanning_reads)) - self.assertEqual(4, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 17 + assert self.count_original_reads(ev1.split_reads[1]) == 17 + assert len(ev1.spanning_reads) == 48 + assert len(ev1.flanking_pairs) == 4 - @unittest.skip('skip because too high coverage') + @pytest.mark.skip(reason='skip because too high coverage') def test_load_evidence_small_insertion_high_coverage(self): ev1 = 
self.genome_evidence( Breakpoint('reference9', 2000, orient=ORIENT.LEFT), @@ -480,10 +477,10 @@ def test_load_evidence_small_insertion_high_coverage(self): for read in sorted(ev1.spanning_reads, key=lambda x: x.query_name): print(read.query_name) - self.assertEqual(37, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(52, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(37, len(ev1.spanning_reads)) - self.assertEqual(9, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 37 + assert self.count_original_reads(ev1.split_reads[1]) == 52 + assert len(ev1.spanning_reads) == 37 + assert len(ev1.flanking_pairs) == 9 ev1 = self.genome_evidence( Breakpoint('reference16', 2000, orient=ORIENT.LEFT), @@ -498,10 +495,10 @@ def test_load_evidence_small_insertion_high_coverage(self): for read in sorted(ev1.spanning_reads, key=lambda x: x.query_name): print(read.query_name) - self.assertEqual(27, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(52, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(19, len(ev1.spanning_reads)) - self.assertEqual(9, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 27 + assert self.count_original_reads(ev1.split_reads[1]) == 52 + assert len(ev1.spanning_reads) == 19 + assert len(ev1.flanking_pairs) == 9 def test_load_evidence_small_duplication(self): ev1 = self.genome_evidence( @@ -517,10 +514,10 @@ def test_load_evidence_small_duplication(self): for read in sorted(ev1.spanning_reads, key=lambda x: x.query_name): print(read.query_name) - self.assertEqual(29, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(51, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(0, len(ev1.spanning_reads)) - self.assertEqual(0, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 29 + assert self.count_original_reads(ev1.split_reads[1]) == 51 + assert len(ev1.spanning_reads) == 0 
+ assert len(ev1.flanking_pairs) == 0 # Example 2 ev1 = self.genome_evidence( @@ -536,10 +533,10 @@ def test_load_evidence_small_duplication(self): for read in sorted(ev1.spanning_reads, key=lambda x: x.query_name): print(read.query_name) - self.assertEqual(25, self.count_original_reads(ev1.split_reads[0])) - self.assertEqual(56, self.count_original_reads(ev1.split_reads[1])) - self.assertEqual(3, len(ev1.spanning_reads)) - self.assertEqual(0, len(ev1.flanking_pairs)) + assert self.count_original_reads(ev1.split_reads[0]) == 25 + assert self.count_original_reads(ev1.split_reads[1]) == 56 + assert len(ev1.spanning_reads) == 3 + assert len(ev1.flanking_pairs) == 0 def test_load_evidence_low_qual_deletion(self): ev1 = self.genome_evidence( @@ -551,31 +548,32 @@ def test_load_evidence_low_qual_deletion(self): self.print_evidence(ev1) print(len(ev1.spanning_reads)) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) - self.assertEqual(0, len(ev1.split_reads[0])) - self.assertEqual(0, len(ev1.split_reads[1])) - self.assertEqual(0, len(ev1.flanking_pairs)) - - -class TestEvidenceGathering(unittest.TestCase): - def setUp(self): - # test loading of evidence for event found on reference3 1114 2187 - self.ev1 = GenomeEvidence( - Breakpoint('reference3', 1114, orient=ORIENT.RIGHT), - Breakpoint('reference3', 2187, orient=ORIENT.RIGHT), - BAM_CACHE, - REFERENCE_GENOME, - opposing_strands=True, - read_length=125, - stdev_fragment_size=100, - median_fragment_size=380, - config={ - 'validate.stdev_count_abnormal': 3, - 'validate.min_flanking_pairs_resolution': 3, - 'validate.assembly_min_edge_trim_weight': 3, - }, - ) - - def test_collect_split_read(self): + assert len(ev1.split_reads[0]) == 0 + assert len(ev1.split_reads[1]) == 0 + assert len(ev1.flanking_pairs) == 0 + + +@pytest.fixture +def ev_gathering_setup(): + return GenomeEvidence( + Breakpoint('reference3', 1114, orient=ORIENT.RIGHT), + Breakpoint('reference3', 2187, orient=ORIENT.RIGHT), + BAM_CACHE, + 
REFERENCE_GENOME, + opposing_strands=True, + read_length=125, + stdev_fragment_size=100, + median_fragment_size=380, + config={ + 'validate.stdev_count_abnormal': 3, + 'validate.min_flanking_pairs_resolution': 3, + 'validate.assembly_min_edge_trim_weight': 3, + }, + ) + + +class TestEvidenceGathering: + def test_collect_split_read(self, ev_gathering_setup): ev1_sr = MockRead( query_name='HISEQX1_11:3:1105:15351:25130:split', reference_id=1, @@ -589,10 +587,10 @@ def test_collect_split_read(self): next_reference_id=1, next_reference_start=2341, ) - self.ev1.collect_split_read(ev1_sr, True) - self.assertEqual(ev1_sr, list(self.ev1.split_reads[0])[0]) + ev_gathering_setup.collect_split_read(ev1_sr, True) + assert list(ev_gathering_setup.split_reads[0])[0] == ev1_sr - def test_collect_split_read_failure(self): + def test_collect_split_read_failure(self, ev_gathering_setup): # wrong cigar string ev1_sr = MockRead( query_name='HISEQX1_11:4:1203:3062:55280:split', @@ -607,10 +605,10 @@ def test_collect_split_read_failure(self): next_reference_id=1, next_reference_start=2550, ) - self.assertFalse(self.ev1.collect_split_read(ev1_sr, True)) + assert not ev_gathering_setup.collect_split_read(ev1_sr, True) - def test_collect_flanking_pair(self): - self.ev1.collect_flanking_pair( + def test_collect_flanking_pair(self, ev_gathering_setup): + ev_gathering_setup.collect_flanking_pair( MockRead( reference_id=1, reference_start=2214, @@ -631,47 +629,45 @@ def test_collect_flanking_pair(self): is_read1=False, ), ) - self.assertEqual(1, len(self.ev1.flanking_pairs)) + assert len(ev_gathering_setup.flanking_pairs) == 1 - def test_collect_flanking_pair_not_overlapping_evidence_window(self): + def test_collect_flanking_pair_not_overlapping_evidence_window(self, ev_gathering_setup): # first read in pair does not overlap the first evidence window # therefore this should return False and not add to the flanking_pairs pair = mock_read_pair( MockRead(reference_id=1, reference_start=1903, 
reference_end=2053, is_reverse=True), MockRead(reference_id=1, reference_start=2052, reference_end=2053, is_reverse=True), ) - self.assertFalse(self.ev1.collect_flanking_pair(*pair)) - self.assertEqual(0, len(self.ev1.flanking_pairs)) + assert not ev_gathering_setup.collect_flanking_pair(*pair) + assert len(ev_gathering_setup.flanking_pairs) == 0 - # @unittest.skip("demonstrating skipping") - def test_load_evidence(self): - print(self.ev1) - self.ev1.load_evidence() - print(self.ev1.spanning_reads) - self.assertEqual( - 2, + def test_load_evidence(self, ev_gathering_setup): + print(ev_gathering_setup) + ev_gathering_setup.load_evidence() + print(ev_gathering_setup.spanning_reads) + assert ( len( [ r - for r in self.ev1.split_reads[0] + for r in ev_gathering_setup.split_reads[0] if not r.has_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT) ] - ), + ) + == 2 ) - self.assertEqual(7, len(self.ev1.flanking_pairs)) - self.assertEqual( - 2, + assert len(ev_gathering_setup.flanking_pairs) == 7 + assert ( len( [ r - for r in self.ev1.split_reads[1] + for r in ev_gathering_setup.split_reads[1] if not r.has_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT) ] - ), + ) + == 2 ) - # @unittest.skip("demonstrating skipping") - def test_assemble_split_reads(self): + def test_assemble_split_reads(self, ev_gathering_setup): sr1 = MockRead( query_name='HISEQX1_11:3:1105:15351:25130:split', query_sequence='TCGTGAGTGGCAGGTGCCATCGTGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTG', @@ -709,23 +705,23 @@ def test_assemble_split_reads(self): query_sequence='CTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTT', flag=113, ) - self.ev1.split_reads = ( + ev_gathering_setup.split_reads = ( {sr1}, {sr1, sr3, sr7, sr9, sr12, sr15, sr19, sr24}, ) # subset needed to make a contig - # 
self.ev1.split_reads=([],[sr1,sr3,sr5,sr6,sr7,sr8,sr9,sr10,sr11,sr12,sr13,sr14,sr15,sr16,sr17,sr18,sr19,sr20,sr21,sr22,sr23,sr24]) #full set of reads produces different contig from subset. + # ev_gathering_setup.split_reads=([],[sr1,sr3,sr5,sr6,sr7,sr8,sr9,sr10,sr11,sr12,sr13,sr14,sr15,sr16,sr17,sr18,sr19,sr20,sr21,sr22,sr23,sr24]) #full set of reads produces different contig from subset. # full contig with more read support should be # CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT - self.ev1.half_mapped = (set(), {sr2}) - self.ev1.assemble_contig() - print(self.ev1.contigs) + ev_gathering_setup.half_mapped = (set(), {sr2}) + ev_gathering_setup.assemble_contig() + print(ev_gathering_setup.contigs) exp = 'CAACAATATGTAGGAAGCCATTATCTGAAGTGTAAGCAACTGCATAGTGCTATTTTAATTATGCATTGCAGGGAAACTGTGAGCAGAGCTATATATTTAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCACGACCAGGAACCAAACAGGAAAGAATC' - self.assertEqual(exp, self.ev1.contigs[0].seq) + assert ev_gathering_setup.contigs[0].seq == exp -class TestStandardizeRead(unittest.TestCase): - def setUp(self): - self.mock_evidence = MockObject( +class TestStandardizeRead: + def test_bwa_mem(self): + mock_evidence = MockObject( reference_genome={ '1': MockObject( seq=MockLongString( @@ -749,8 +745,6 @@ def setUp(self): **DEFAULTS, }, ) - - def test_bwa_mem(self): # SamRead(1:224646710-224646924, 183=12D19=, TCAGCTCTCT...) TCAGCTCTCTTAGGGCACACCCTCCAAGGTGCCTAAATGCCATCCCAGGATTGGTTCCAGTGTCTATTATCTGTTTGACTCCAAATGGCCAAACACCTGACTTCCTCTCTGGTAGCCTGGCTTTTATCTTCTAGGACATCCAGGGCCCCTCTCTTTGCCTTCCCCTCTTTCTTCCTTCTACTGCTTCAGCAGACATCATGTG # std SamRead(1:224646710-224646924, 183=12D19=, TCAGCTCTCT...) 
TCAGCTCTCTTAGGGCACACCCTCCAAGGTGCCTAAATGCCATCCCAGGATTGGTTCCAGTGTCTATTATCTGTTTGACTCCAAATGGCCAAACACCTGACTTCCTCTCTGGTAGCCTGGCTTTTATCTTCTAGGACATCCAGGGCCCCTCTCTTTGCCTTCCCCTCTTTCTTCCTTCTACTGCTTCAGCAGACATCATGTG # > BPP(Breakpoint(1:224646893L-), Breakpoint(1:224646906R-), opposing=False, seq='') @@ -762,17 +756,11 @@ def test_bwa_mem(self): read.cigar = _cigar.join(_cigar.convert_string_to_cigar('183=12D19=')) read.query_name = 'name' read.mapping_quality = NA_MAPPING_QUALITY - std_read = Evidence.standardize_read(self.mock_evidence, read) - print(SamRead.__repr__(read)) - print(SamRead.__repr__(std_read)) - self.assertEqual(_cigar.convert_string_to_cigar('186=12D16='), std_read.cigar) - self.assertEqual(read.reference_start, std_read.reference_start) + std_read = Evidence.standardize_read(mock_evidence, read) + assert std_read.cigar == _cigar.convert_string_to_cigar('186=12D16=') + assert std_read.reference_start == read.reference_start class MockEvidence: def __init__(self, ref=None): self.HUMAN_REFERENCE_GENOME = ref - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/integration/test_validate_call.py b/tests/integration/test_validate_call.py index 5be1962a..23d53e37 100644 --- a/tests/integration/test_validate_call.py +++ b/tests/integration/test_validate_call.py @@ -1,6 +1,6 @@ -import unittest from unittest import mock +import pytest from mavis.align import call_paired_read_event, select_contig_alignments from mavis.annotate.file_io import load_reference_genome from mavis.annotate.genomic import PreTranscript, Transcript @@ -15,7 +15,7 @@ from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence, TranscriptomeEvidence -from ..util import get_data +from ..util import get_data, todo from . 
import MockBamFileHandle, MockLongString, MockRead, get_example_genes, mock_read_pair REFERENCE_GENOME = None @@ -52,7 +52,7 @@ def tearDownModule(): mock.patch.stopall() -class TestCallByContig(unittest.TestCase): +class TestCallByContig: def test_EGFR_small_del_transcriptome(self): gene = get_example_genes()['EGFR'] reference_annotations = {gene.chr: [gene]} @@ -93,35 +93,14 @@ def test_EGFR_small_del_transcriptome(self): for ev in events: print(ev) print(evidence.distance(ev.break1.start, ev.break2.start)) - self.assertEqual(1, len(events)) - self.assertEqual(Breakpoint('7', 55242465, orient='L', strand='+'), events[0].break1) - self.assertEqual(Breakpoint('7', 55242481, orient='R', strand='+'), events[0].break2) + assert len(events) == 1 + assert events[0].break1 == Breakpoint('7', 55242465, orient='L', strand='+') + assert events[0].break2 == Breakpoint('7', 55242481, orient='R', strand='+') print(events[0].contig_alignment.score()) - self.assertTrue(events[0].contig_alignment.score() > 0.99) + assert events[0].contig_alignment.score() > 0.99 -class TestEventCall(unittest.TestCase): - def setUp(self): - self.ev1 = GenomeEvidence( - Breakpoint('reference3', 1114, orient=ORIENT.RIGHT), - Breakpoint('reference3', 2187, orient=ORIENT.RIGHT), - BAM_CACHE, - REFERENCE_GENOME, - opposing_strands=True, - read_length=125, - stdev_fragment_size=100, - median_fragment_size=380, - stdev_count_abnormal=3, - min_flanking_pairs_resolution=3, - ) - self.ev = call.EventCall( - Breakpoint('reference3', 1114, orient=ORIENT.RIGHT), - Breakpoint('reference3', 2187, orient=ORIENT.RIGHT), - source_evidence=self.ev1, - event_type=SVTYPE.INV, - call_method=CALL_METHOD.SPLIT, - ) - +class TestEventCall: def test_bad_deletion(self): evidence = GenomeEvidence( Breakpoint('reference3', 16, orient='L'), @@ -132,7 +111,7 @@ def test_bad_deletion(self): stdev_fragment_size=100, median_fragment_size=380, ) - with self.assertRaises(ValueError): + with pytest.raises(ValueError): call.EventCall( 
Breakpoint('reference3', 43, orient='L'), Breakpoint('reference3', 44, orient='R'), @@ -142,12 +121,49 @@ def test_bad_deletion(self): ) def test_flanking_support_empty(self): - self.assertEqual(0, len(self.ev.flanking_pairs)) + + ev = call.EventCall( + Breakpoint('reference3', 1114, orient=ORIENT.RIGHT), + Breakpoint('reference3', 2187, orient=ORIENT.RIGHT), + source_evidence=GenomeEvidence( + Breakpoint('reference3', 1114, orient=ORIENT.RIGHT), + Breakpoint('reference3', 2187, orient=ORIENT.RIGHT), + BAM_CACHE, + REFERENCE_GENOME, + opposing_strands=True, + read_length=125, + stdev_fragment_size=100, + median_fragment_size=380, + stdev_count_abnormal=3, + min_flanking_pairs_resolution=3, + ), + event_type=SVTYPE.INV, + call_method=CALL_METHOD.SPLIT, + ) + assert len(ev.flanking_pairs) == 0 def test_flanking_support(self): # 1114 ++ # 2187 ++ - self.ev.flanking_pairs.add( + ev = call.EventCall( + Breakpoint('reference3', 1114, orient=ORIENT.RIGHT), + Breakpoint('reference3', 2187, orient=ORIENT.RIGHT), + source_evidence=GenomeEvidence( + Breakpoint('reference3', 1114, orient=ORIENT.RIGHT), + Breakpoint('reference3', 2187, orient=ORIENT.RIGHT), + BAM_CACHE, + REFERENCE_GENOME, + opposing_strands=True, + read_length=125, + stdev_fragment_size=100, + median_fragment_size=380, + stdev_count_abnormal=3, + min_flanking_pairs_resolution=3, + ), + event_type=SVTYPE.INV, + call_method=CALL_METHOD.SPLIT, + ) + ev.flanking_pairs.add( mock_read_pair( MockRead( query_name='test1', @@ -160,7 +176,7 @@ def test_flanking_support(self): MockRead(reference_id=3, reference_start=2200, reference_end=2250, is_reverse=True), ) ) - self.ev.flanking_pairs.add( + ev.flanking_pairs.add( mock_read_pair( MockRead( query_name='test2', @@ -173,36 +189,53 @@ def test_flanking_support(self): MockRead(reference_id=3, reference_start=2200, reference_end=2250, is_reverse=True), ) ) - median, stdev = self.ev.flanking_metrics() - self.assertEqual(2, len(self.ev.flanking_pairs)) - 
self.assertEqual(530, median) - self.assertEqual(30, stdev) + median, stdev = ev.flanking_metrics() + assert len(ev.flanking_pairs) == 2 + assert median == 530 + assert stdev == 30 def test_split_read_support_empty(self): - self.assertEqual(0, len(self.ev.break1_split_reads) + len(self.ev.break2_split_reads)) + ev = call.EventCall( + Breakpoint('reference3', 1114, orient=ORIENT.RIGHT), + Breakpoint('reference3', 2187, orient=ORIENT.RIGHT), + source_evidence=GenomeEvidence( + Breakpoint('reference3', 1114, orient=ORIENT.RIGHT), + Breakpoint('reference3', 2187, orient=ORIENT.RIGHT), + BAM_CACHE, + REFERENCE_GENOME, + opposing_strands=True, + read_length=125, + stdev_fragment_size=100, + median_fragment_size=380, + stdev_count_abnormal=3, + min_flanking_pairs_resolution=3, + ), + event_type=SVTYPE.INV, + call_method=CALL_METHOD.SPLIT, + ) + assert len(ev.break1_split_reads) + len(ev.break2_split_reads) == 0 + @todo def test_call_by_split_delins_del_only(self): - raise unittest.SkipTest('TODO') + pass + @todo def test_call_by_split_delins_both(self): - raise unittest.SkipTest('TODO') + pass + @todo def test_call_by_split_delins_ins_only(self): # not implemented yet?? 
- raise unittest.SkipTest('TODO') - + pass -class TestPullFlankingSupport(unittest.TestCase): - def setUp(self): - self.bam_cache = BamCache(MockBamFileHandle({'1': 0, '2': 1})) - self.REFERENCE_GENOME = None +class TestPullFlankingSupport: def build_genome_evidence(self, b1, b2, opposing_strands=False): evidence = GenomeEvidence( b1, b2, - self.bam_cache, - self.REFERENCE_GENOME, + BamCache(MockBamFileHandle({'1': 0, '2': 1})), + None, opposing_strands=opposing_strands, read_length=100, median_fragment_size=500, @@ -230,7 +263,7 @@ def test_deletion(self): ) event.add_flanking_support(flanking_pairs) - self.assertEqual(1, len(event.flanking_pairs)) + assert len(event.flanking_pairs) == 1 # now test one where the read pair type is right but the positioning of the reads doesn't # support the current call @@ -241,7 +274,7 @@ def test_deletion(self): ) ) event.add_flanking_support(flanking_pairs) - self.assertEqual(1, len(event.flanking_pairs)) + assert len(event.flanking_pairs) == 1 def test_small_deletion_flanking_for_larger_deletion(self): evidence = self.build_genome_evidence( @@ -262,7 +295,7 @@ def test_small_deletion_flanking_for_larger_deletion(self): ) event.add_flanking_support(flanking_pairs) - self.assertEqual(0, len(event.flanking_pairs)) + assert len(event.flanking_pairs) == 0 def test_insertion(self): evidence = self.build_genome_evidence( @@ -283,7 +316,7 @@ def test_insertion(self): CALL_METHOD.SPLIT, ) event.add_flanking_support(flanking_pairs) - self.assertEqual(1, len(event.flanking_pairs)) + assert len(event.flanking_pairs) == 1 def test_inversion(self): evidence = self.build_genome_evidence( @@ -306,7 +339,7 @@ def test_inversion(self): ) event.add_flanking_support(flanking_pairs) - self.assertEqual(1, len(event.flanking_pairs)) + assert len(event.flanking_pairs) == 1 # test read that is the right type but the positioning does not support the current call flanking_pairs.append( @@ -316,7 +349,7 @@ def test_inversion(self): ) ) 
event.add_flanking_support(flanking_pairs) - self.assertEqual(1, len(event.flanking_pairs)) + assert len(event.flanking_pairs) == 1 def test_inverted_translocation(self): evidence = self.build_genome_evidence( @@ -338,7 +371,7 @@ def test_inverted_translocation(self): CALL_METHOD.SPLIT, ) event.add_flanking_support(flanking_pairs) - self.assertEqual(1, len(event.flanking_pairs)) + assert len(event.flanking_pairs) == 1 def test_translocation_rl(self): b1 = Breakpoint('11', 128675261, orient=ORIENT.RIGHT, strand=STRAND.POS) @@ -372,7 +405,7 @@ def test_translocation_rl(self): ), ] event.add_flanking_support(flanking_pairs) - self.assertEqual(len(flanking_pairs), len(event.flanking_pairs)) + assert len(event.flanking_pairs) == len(flanking_pairs) def test_translocation_rl_filter_nonsupporting(self): evidence = self.build_genome_evidence( @@ -393,7 +426,7 @@ def test_translocation_rl_filter_nonsupporting(self): ) event.add_flanking_support(flanking_pairs) - self.assertEqual(1, len(event.flanking_pairs)) + assert len(event.flanking_pairs) == 1 # test read that is the right type but the positioning does not support the current call # the mate is on the wrong chromosome (not sure if this would actually be added as flanking support) @@ -404,7 +437,7 @@ def test_translocation_rl_filter_nonsupporting(self): ) ) event.add_flanking_support(flanking_pairs) - self.assertEqual(1, len(event.flanking_pairs)) + assert len(event.flanking_pairs) == 1 def test_duplication(self): evidence = self.build_genome_evidence( @@ -427,23 +460,20 @@ def test_duplication(self): ) event.add_flanking_support(flanking_pairs) - self.assertEqual(1, len(event.flanking_pairs)) + assert len(event.flanking_pairs) == 1 + @todo def test_outside_call_range(self): - raise unittest.SkipTest('TODO') - + pass -class TestEvidenceConsumption(unittest.TestCase): - def setUp(self): - self.bam_cache = BamCache(MockBamFileHandle({'1': 0, '2': 1})) - self.REFERENCE_GENOME = None +class TestEvidenceConsumption: def 
build_genome_evidence(self, b1, b2, opposing_strands=False): evidence = GenomeEvidence( b1, b2, - self.bam_cache, - self.REFERENCE_GENOME, + BamCache(MockBamFileHandle({'1': 0, '2': 1})), + None, opposing_strands=opposing_strands, read_length=100, median_fragment_size=200, @@ -551,25 +581,25 @@ def test_call_all_methods(self): events = call.call_events(evidence) for ev in events: print(ev, ev.event_type, ev.call_method) - self.assertEqual(4, len(events)) - self.assertEqual('contig', events[0].call_method) - self.assertEqual(100, events[0].break1.start) - self.assertEqual(481, events[0].break2.start) - self.assertEqual('deletion', events[0].event_type) - self.assertEqual('split reads', events[1].call_method) - self.assertEqual(120, events[1].break1.start) - self.assertEqual(501, events[1].break2.start) - self.assertEqual('deletion', events[1].event_type) - self.assertEqual('flanking reads', events[2].call_method) - self.assertEqual(90, events[2].break1.start) - self.assertEqual(299, events[2].break1.end) - self.assertEqual(591, events[2].break2.start) - self.assertEqual(806, events[2].break2.end) - self.assertEqual('deletion', events[2].event_type) - self.assertEqual('split reads', events[3].call_method) - self.assertEqual(120, events[3].break1.start) - self.assertEqual(501, events[3].break2.start) - self.assertEqual('insertion', events[3].event_type) + assert len(events) == 4 + assert events[0].call_method == 'contig' + assert events[0].break1.start == 100 + assert events[0].break2.start == 481 + assert events[0].event_type == 'deletion' + assert events[1].call_method == 'split reads' + assert events[1].break1.start == 120 + assert events[1].break2.start == 501 + assert events[1].event_type == 'deletion' + assert events[2].call_method == 'flanking reads' + assert events[2].break1.start == 90 + assert events[2].break1.end == 299 + assert events[2].break2.start == 591 + assert events[2].break2.end == 806 + assert events[2].event_type == 'deletion' + assert 
events[3].call_method == 'split reads' + assert events[3].break1.start == 120 + assert events[3].break2.start == 501 + assert events[3].event_type == 'insertion' def test_call_contig_only(self): # event should only be 100L+, 501R+ deletion @@ -660,10 +690,10 @@ def test_call_contig_only(self): events = call.call_events(evidence) for ev in events: print(ev, ev.event_type, ev.call_method) - self.assertEqual(1, len(events)) - self.assertEqual(100, events[0].break1.start) - self.assertEqual(501, events[0].break2.start) - self.assertEqual('contig', events[0].call_method) + assert len(events) == 1 + assert events[0].break1.start == 100 + assert events[0].break2.start == 501 + assert events[0].call_method == 'contig' def test_call_contig_and_split(self): # contig breakpoint is 100L 501R, split reads is 120L 521R @@ -748,17 +778,17 @@ def test_call_contig_and_split(self): events = call.call_events(evidence) for ev in events: print(ev, ev.event_type, ev.call_method) - self.assertEqual(3, len(events)) - self.assertEqual(100, events[0].break1.start) - self.assertEqual(501, events[0].break2.start) - self.assertEqual('contig', events[0].call_method) - self.assertEqual('split reads', events[1].call_method) - self.assertEqual(120, events[1].break1.start) - self.assertEqual(521, events[1].break2.start) - self.assertEqual('insertion', events[2].event_type) - self.assertEqual('split reads', events[2].call_method) - self.assertEqual(120, events[2].break1.start) - self.assertEqual(521, events[2].break2.start) + assert len(events) == 3 + assert events[0].break1.start == 100 + assert events[0].break2.start == 501 + assert events[0].call_method == 'contig' + assert events[1].call_method == 'split reads' + assert events[1].break1.start == 120 + assert events[1].break2.start == 521 + assert events[2].event_type == 'insertion' + assert events[2].call_method == 'split reads' + assert events[2].break1.start == 120 + assert events[2].break2.start == 521 def test_call_split_only(self): evidence 
= self.build_genome_evidence( @@ -795,14 +825,14 @@ def test_call_split_only(self): events = call.call_events(evidence) for ev in events: print(ev, ev.event_type, ev.call_method) - self.assertEqual(2, len(events)) - self.assertEqual(170, events[0].break1.start) - self.assertEqual(871, events[0].break2.start) - self.assertEqual('split reads', events[0].call_method) - self.assertEqual('split reads', events[1].call_method) - self.assertEqual(170, events[1].break1.start) - self.assertEqual(871, events[1].break2.start) - self.assertEqual('insertion', events[1].event_type) + assert len(events) == 2 + assert events[0].break1.start == 170 + assert events[0].break2.start == 871 + assert events[0].call_method == 'split reads' + assert events[1].call_method == 'split reads' + assert events[1].break1.start == 170 + assert events[1].break2.start == 871 + assert events[1].event_type == 'insertion' def test_call_flanking_only(self): evidence = self.build_genome_evidence( @@ -833,70 +863,76 @@ def test_call_flanking_only(self): events = call.call_events(evidence) for ev in events: print(ev, ev.event_type, ev.call_method) - self.assertEqual(1, len(events)) - self.assertEqual(140, events[0].break1.start) - self.assertEqual(292, events[0].break1.end) - self.assertEqual('flanking reads', events[0].call_method) - self.assertEqual(656, events[0].break2.start) - self.assertEqual(886, events[0].break2.end) - - -class TestCallBySupportingReads(unittest.TestCase): - def setUp(self): - self.ev = GenomeEvidence( - Breakpoint('fake', 50, 150, orient=ORIENT.RIGHT), - Breakpoint('fake', 450, 550, orient=ORIENT.RIGHT), - BamCache(MockBamFileHandle()), - None, - opposing_strands=True, - read_length=40, - stdev_fragment_size=25, - median_fragment_size=100, - config={ - 'validate.stdev_count_abnormal': 2, - 'validate.min_splits_reads_resolution': 1, - 'validate.min_flanking_pairs_resolution': 1, - 'validate.min_linking_split_reads': 1, - 'validate.min_spanning_reads_resolution': 3, - 'validate. 
min_call_complexity': 0, - }, - ) - self.dup = GenomeEvidence( - Breakpoint('fake', 50, orient=ORIENT.RIGHT), - Breakpoint('fake', 90, orient=ORIENT.LEFT), - BamCache(MockBamFileHandle()), - None, - opposing_strands=False, - read_length=40, - stdev_fragment_size=25, - median_fragment_size=100, - config={ - 'validate.stdev_count_abnormal': 2, - 'validate.min_splits_reads_resolution': 1, - 'validate.min_flanking_pairs_resolution': 1, - 'validate.min_linking_split_reads': 1, - 'validate.min_spanning_reads_resolution': 3, - 'validate. min_call_complexity': 0, - }, - ) - - def test_empty(self): - with self.assertRaises(AssertionError): - bpp = call._call_by_flanking_pairs(self.ev, SVTYPE.INV)[0] - - def test_call_no_duplication_by_split_reads(self): - self.dup.split_reads[0].add( + assert len(events) == 1 + assert events[0].break1.start == 140 + assert events[0].break1.end == 292 + assert events[0].call_method == 'flanking reads' + assert events[0].break2.start == 656 + assert events[0].break2.end == 886 + + +@pytest.fixture +def duplication_ev(): + return GenomeEvidence( + Breakpoint('fake', 50, orient=ORIENT.RIGHT), + Breakpoint('fake', 90, orient=ORIENT.LEFT), + BamCache(MockBamFileHandle()), + None, + opposing_strands=False, + read_length=40, + stdev_fragment_size=25, + median_fragment_size=100, + config={ + 'validate.stdev_count_abnormal': 2, + 'validate.min_splits_reads_resolution': 1, + 'validate.min_flanking_pairs_resolution': 1, + 'validate.min_linking_split_reads': 1, + 'validate.min_spanning_reads_resolution': 3, + 'validate. 
min_call_complexity': 0, + }, + ) + + +@pytest.fixture +def inversion_evidence(): + return GenomeEvidence( + Breakpoint('fake', 50, 150, orient=ORIENT.RIGHT), + Breakpoint('fake', 450, 550, orient=ORIENT.RIGHT), + BamCache(MockBamFileHandle()), + None, + opposing_strands=True, + read_length=40, + stdev_fragment_size=25, + median_fragment_size=100, + config={ + 'validate.stdev_count_abnormal': 2, + 'validate.min_splits_reads_resolution': 1, + 'validate.min_flanking_pairs_resolution': 1, + 'validate.min_linking_split_reads': 1, + 'validate.min_spanning_reads_resolution': 3, + 'validate. min_call_complexity': 0, + }, + ) + + +class TestCallBySupportingReads: + def test_empty(self, inversion_evidence): + with pytest.raises(AssertionError): + call._call_by_flanking_pairs(inversion_evidence, SVTYPE.INV)[0] + + def test_call_no_duplication_by_split_reads(self, duplication_ev, inversion_evidence): + duplication_ev.split_reads[0].add( MockRead(query_name='t1', reference_start=30, cigar=[(CIGAR.EQ, 20), (CIGAR.S, 20)]) ) - self.dup.split_reads[1].add( + duplication_ev.split_reads[1].add( MockRead(query_name='t1', reference_start=90, cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)]) ) - bpps = call._call_by_split_reads(self.ev, SVTYPE.DUP) - self.assertEqual(0, len(bpps)) + bpps = call._call_by_split_reads(inversion_evidence, SVTYPE.DUP) + assert len(bpps) == 0 - def test_by_split_read(self): - self.ev.split_reads[0].add( + def test_by_split_read(self, inversion_evidence): + inversion_evidence.split_reads[0].add( MockRead( query_name='t1', reference_start=100, @@ -904,7 +940,7 @@ def test_by_split_read(self): query_sequence='A' * 40, ) ) - self.ev.split_reads[1].add( + inversion_evidence.split_reads[1].add( MockRead( query_name='t1', reference_start=500, @@ -912,7 +948,7 @@ def test_by_split_read(self): query_sequence='G' * 40, ) ) - self.ev.split_reads[0].add( + inversion_evidence.split_reads[0].add( MockRead( query_name='t2', reference_start=100, @@ -920,7 +956,7 @@ def 
test_by_split_read(self): query_sequence='C' * 40, ) ) - self.ev.split_reads[1].add( + inversion_evidence.split_reads[1].add( MockRead( query_name='t2', reference_start=500, @@ -929,17 +965,17 @@ def test_by_split_read(self): ) ) - events = call._call_by_split_reads(self.ev, SVTYPE.INV) - self.assertEqual(1, len(events)) + events = call._call_by_split_reads(inversion_evidence, SVTYPE.INV) + assert len(events) == 1 event = events[0] - self.assertEqual(4, len(event.support())) - self.assertEqual(101, event.break1.start) - self.assertEqual(101, event.break1.end) - self.assertEqual(501, event.break2.start) - self.assertEqual(501, event.break2.end) - - def test_call_by_split_read_low_resolution(self): - self.ev.split_reads[0].add( + assert len(event.support()) == 4 + assert event.break1.start == 101 + assert event.break1.end == 101 + assert event.break2.start == 501 + assert event.break2.end == 501 + + def test_call_by_split_read_low_resolution(self, inversion_evidence): + inversion_evidence.split_reads[0].add( MockRead( query_name='t1', reference_start=100, @@ -947,7 +983,7 @@ def test_call_by_split_read_low_resolution(self): query_sequence='A' * 40, ) ) - self.ev.split_reads[1].add( + inversion_evidence.split_reads[1].add( MockRead( query_name='t1', reference_start=500, @@ -956,17 +992,17 @@ def test_call_by_split_read_low_resolution(self): ) ) - bpp = call._call_by_split_reads(self.ev, SVTYPE.INV) - self.assertEqual(1, len(bpp)) + bpp = call._call_by_split_reads(inversion_evidence, SVTYPE.INV) + assert len(bpp) == 1 bpp = bpp[0] - self.assertEqual(101, bpp.break1.start) - self.assertEqual(101, bpp.break1.end) - self.assertEqual(501, bpp.break2.start) - self.assertEqual(501, bpp.break2.end) + assert bpp.break1.start == 101 + assert bpp.break1.end == 101 + assert bpp.break2.start == 501 + assert bpp.break2.end == 501 - def test_call_by_split_read_resolve_untemp(self): - self.ev.split_reads[0].add( + def test_call_by_split_read_resolve_untemp(self, inversion_evidence): 
+ inversion_evidence.split_reads[0].add( MockRead( query_name='t1', reference_start=100, @@ -974,7 +1010,7 @@ def test_call_by_split_read_resolve_untemp(self): query_sequence='TCGGCTCCCGTACTTGTGTATAAGGGGCTTCTGATGTTAT', ) ) - self.ev.split_reads[1].add( + inversion_evidence.split_reads[1].add( MockRead( query_name='t1', reference_start=500, @@ -984,16 +1020,16 @@ def test_call_by_split_read_resolve_untemp(self): ) ) - event = call._call_by_split_reads(self.ev, SVTYPE.INV)[0] + event = call._call_by_split_reads(inversion_evidence, SVTYPE.INV)[0] - self.assertEqual(101, event.break1.start) - self.assertEqual(101, event.break1.end) - self.assertEqual(501, event.break2.start) - self.assertEqual(501, event.break2.end) - self.assertEqual('', event.untemplated_seq) + assert event.break1.start == 101 + assert event.break1.end == 101 + assert event.break2.start == 501 + assert event.break2.end == 501 + assert event.untemplated_seq == '' - def test_call_by_split_read_resolve_untemp_exists(self): - self.ev.split_reads[0].add( + def test_call_by_split_read_resolve_untemp_exists(self, inversion_evidence): + inversion_evidence.split_reads[0].add( MockRead( query_name='t1', reference_start=100, @@ -1001,7 +1037,7 @@ def test_call_by_split_read_resolve_untemp_exists(self): query_sequence='TCGGCTCCCGTACTTGTGTATAAGGGGCTTCTGATGTTAT', ) ) - self.ev.split_reads[1].add( + inversion_evidence.split_reads[1].add( MockRead( query_name='t1', reference_start=500, @@ -1011,16 +1047,16 @@ def test_call_by_split_read_resolve_untemp_exists(self): ) ) - event = call._call_by_split_reads(self.ev, SVTYPE.INV)[0] + event = call._call_by_split_reads(inversion_evidence, SVTYPE.INV)[0] - self.assertEqual(101, event.break1.start) - self.assertEqual(101, event.break1.end) - self.assertEqual(501, event.break2.start) - self.assertEqual(501, event.break2.end) - self.assertEqual('TA', event.untemplated_seq) + assert event.break1.start == 101 + assert event.break1.end == 101 + assert event.break2.start == 501 + 
assert event.break2.end == 501 + assert event.untemplated_seq == 'TA' - def test_call_by_split_read_shift_overlap(self): - self.ev.split_reads[0].add( + def test_call_by_split_read_shift_overlap(self, inversion_evidence): + inversion_evidence.split_reads[0].add( MockRead( query_name='t1', reference_start=100, @@ -1028,7 +1064,7 @@ def test_call_by_split_read_shift_overlap(self): query_sequence='TCGGCTCCCGTACTTGTGTATAAGGGGCTTCTGATGTTAT', ) ) - self.ev.split_reads[1].add( + inversion_evidence.split_reads[1].add( MockRead( query_name='t1', reference_start=500, @@ -1038,37 +1074,37 @@ def test_call_by_split_read_shift_overlap(self): ) ) - event = call._call_by_split_reads(self.ev, SVTYPE.INV)[0] + event = call._call_by_split_reads(inversion_evidence, SVTYPE.INV)[0] - self.assertEqual(101, event.break1.start) - self.assertEqual(101, event.break1.end) - self.assertEqual(503, event.break2.start) - self.assertEqual(503, event.break2.end) - self.assertEqual('', event.untemplated_seq) + assert event.break1.start == 101 + assert event.break1.end == 101 + assert event.break2.start == 503 + assert event.break2.end == 503 + assert event.untemplated_seq == '' - def test_both_by_flanking_pairs(self): - self.ev.flanking_pairs.add( + def test_both_by_flanking_pairs(self, inversion_evidence): + inversion_evidence.flanking_pairs.add( mock_read_pair( MockRead(query_name='t1', reference_id=0, reference_start=150, reference_end=150), MockRead(query_name='t1', reference_id=0, reference_start=500, reference_end=520), ) ) - self.ev.flanking_pairs.add( + inversion_evidence.flanking_pairs.add( mock_read_pair( MockRead(query_name='t2', reference_id=0, reference_start=120, reference_end=140), MockRead(query_name='t2', reference_id=0, reference_start=520, reference_end=520), ) ) - bpp = call._call_by_flanking_pairs(self.ev, SVTYPE.INV) + bpp = call._call_by_flanking_pairs(inversion_evidence, SVTYPE.INV) # 120-149 ..... 
500-519 # max frag = 150 - 80 = 70 - self.assertEqual(42, bpp.break1.start) - self.assertEqual(120, bpp.break1.end) - self.assertEqual(412, bpp.break2.start) # 70 - 21 = 49 - self.assertEqual(500, bpp.break2.end) + assert bpp.break1.start == 42 + assert bpp.break1.end == 120 + assert bpp.break2.start == 412 # 70 - 21 = 49 + assert bpp.break2.end == 500 - def test_by_split_reads_multiple_calls(self): - self.ev.split_reads[0].add( + def test_by_split_reads_multiple_calls(self, inversion_evidence): + inversion_evidence.split_reads[0].add( MockRead( query_name='t1', reference_start=100, @@ -1076,7 +1112,7 @@ def test_by_split_reads_multiple_calls(self): query_sequence='A' * 40, ) ) - self.ev.split_reads[1].add( + inversion_evidence.split_reads[1].add( MockRead( query_name='t1', reference_start=500, @@ -1084,7 +1120,7 @@ def test_by_split_reads_multiple_calls(self): query_sequence='T' * 40, ) ) - self.ev.split_reads[0].add( + inversion_evidence.split_reads[0].add( MockRead( query_name='t2', reference_start=110, @@ -1092,7 +1128,7 @@ def test_by_split_reads_multiple_calls(self): query_sequence='T' * 40, ) ) - self.ev.split_reads[1].add( + inversion_evidence.split_reads[1].add( MockRead( query_name='t2', reference_start=520, @@ -1101,8 +1137,8 @@ def test_by_split_reads_multiple_calls(self): ) ) - evs = call._call_by_split_reads(self.ev, SVTYPE.INV) - self.assertEqual(2, len(evs)) + evs = call._call_by_split_reads(inversion_evidence, SVTYPE.INV) + assert len(evs) == 2 def test_call_by_split_reads_consume_flanking(self): evidence = GenomeEvidence( @@ -1182,36 +1218,38 @@ def test_call_by_split_reads_consume_flanking(self): events = call._call_by_split_reads(evidence, event_type=SVTYPE.INV) for ev in events: print(ev, ev.event_type, ev.call_method) - self.assertEqual(1, len(events)) + assert len(events) == 1 event = events[0] - self.assertEqual(1, len(event.flanking_pairs)) - self.assertEqual(2, len(event.break1_split_reads)) - self.assertEqual(2, 
len(event.break2_split_reads)) + assert len(event.flanking_pairs) == 1 + assert len(event.break1_split_reads) == 2 + assert len(event.break2_split_reads) == 2 b1 = set([read.query_name for read in event.break1_split_reads]) b2 = set([read.query_name for read in event.break2_split_reads]) - self.assertEqual(1, len(b1 & b2)) - - -class TestCallByFlankingReadsGenome(unittest.TestCase): - def setUp(self): - self.ev_LR = GenomeEvidence( - Breakpoint('fake', 100, orient=ORIENT.LEFT), - Breakpoint('fake', 200, orient=ORIENT.RIGHT), - BamCache(MockBamFileHandle()), - None, - opposing_strands=False, - read_length=25, - stdev_fragment_size=25, - median_fragment_size=100, - config={ - 'validate.stdev_count_abnormal': 2, - 'validate.min_flanking_pairs_resolution': 1, - 'validate.min_call_complexity': 0, - }, - ) - + assert len(b1 & b2) == 1 + + +@pytest.fixture +def left_right_ev(): + return GenomeEvidence( + Breakpoint('fake', 100, orient=ORIENT.LEFT), + Breakpoint('fake', 200, orient=ORIENT.RIGHT), + BamCache(MockBamFileHandle()), + None, + opposing_strands=False, + read_length=25, + stdev_fragment_size=25, + median_fragment_size=100, + config={ + 'validate.stdev_count_abnormal': 2, + 'validate.min_flanking_pairs_resolution': 1, + 'validate.min_call_complexity': 0, + }, + ) + + +class TestCallByFlankingReadsGenome: def test_call_coverage_too_large(self): - with self.assertRaises(AssertionError): + with pytest.raises(AssertionError): call._call_interval_by_flanking_coverage( Interval(1901459, 1902200), ORIENT.RIGHT, @@ -1221,13 +1259,13 @@ def test_call_coverage_too_large(self): Evidence.traverse, ) - def test_intrachromosomal_lr(self): + def test_intrachromosomal_lr(self, left_right_ev): # --LLL-100------------500-RRR------- # max fragment size: 100 + 2 * 25 = 150 # max distance = 150 - read_length = 125 # coverage ranges: 20->80 (61) 600->675 (76) - self.assertEqual(150, self.ev_LR.max_expected_fragment_size) - self.ev_LR.flanking_pairs.add( + assert 
left_right_ev.max_expected_fragment_size == 150 + left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=19, @@ -1244,7 +1282,7 @@ def test_intrachromosomal_lr(self): ), ) ) - self.ev_LR.flanking_pairs.add( + left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=39, @@ -1262,7 +1300,7 @@ def test_intrachromosomal_lr(self): ) ) # add a pair that will be ignored - self.ev_LR.flanking_pairs.add( + left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=39, @@ -1279,18 +1317,18 @@ def test_intrachromosomal_lr(self): ), ) ) - bpp = call._call_by_flanking_pairs(self.ev_LR, SVTYPE.DEL) + bpp = call._call_by_flanking_pairs(left_right_ev, SVTYPE.DEL) print(bpp, bpp.flanking_pairs) - self.assertEqual(80, bpp.break1.start) - self.assertEqual(80 + 125 - 45, bpp.break1.end) - self.assertEqual(600 - 125 + 75, bpp.break2.start) - self.assertEqual(600, bpp.break2.end) + assert bpp.break1.start == 80 + assert bpp.break1.end == 80 + 125 - 45 + assert bpp.break2.start == 600 - 125 + 75 + assert bpp.break2.end == 600 - def test_intrachromosomal_lr_coverage_overlaps_range(self): + def test_intrachromosomal_lr_coverage_overlaps_range(self, left_right_ev): # this test is for ensuring that if a theoretical window calculated for the # first breakpoint overlaps the actual coverage for the second breakpoint (or the reverse) # that we adjust the theoretical window accordingly - self.ev_LR.flanking_pairs.add( + left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=21, @@ -1307,7 +1345,7 @@ def test_intrachromosomal_lr_coverage_overlaps_range(self): ), ) ) - self.ev_LR.flanking_pairs.add( + left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=41, @@ -1325,7 +1363,7 @@ def test_intrachromosomal_lr_coverage_overlaps_range(self): ) ) # pair to skip - self.ev_LR.flanking_pairs.add( + left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=39, @@ -1342,14 +1380,14 @@ def 
test_intrachromosomal_lr_coverage_overlaps_range(self): ), ) ) - break1, break2 = call._call_by_flanking_pairs(self.ev_LR, SVTYPE.INS) - self.assertEqual(80, break1.start) - self.assertEqual(80, break1.end) # 119 - self.assertEqual(81, break2.start) - self.assertEqual(81, break2.end) + break1, break2 = call._call_by_flanking_pairs(left_right_ev, SVTYPE.INS) + assert break1.start == 80 + assert break1.end == 80 # 119 + assert break2.start == 81 + assert break2.end == 81 - def test_intrachromosomal_flanking_coverage_overlap_error(self): - self.ev_LR.flanking_pairs.add( + def test_intrachromosomal_flanking_coverage_overlap_error(self, left_right_ev): + left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=19, @@ -1365,7 +1403,7 @@ def test_intrachromosomal_flanking_coverage_overlap_error(self): ), ) ) - self.ev_LR.flanking_pairs.add( + left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=620, @@ -1381,11 +1419,11 @@ def test_intrachromosomal_flanking_coverage_overlap_error(self): ), ) ) - with self.assertRaises(AssertionError): - call._call_by_flanking_pairs(self.ev_LR, SVTYPE.DEL) + with pytest.raises(AssertionError): + call._call_by_flanking_pairs(left_right_ev, SVTYPE.DEL) - def test_coverage_larger_than_max_expected_variance_error(self): - self.ev_LR.flanking_pairs.add( + def test_coverage_larger_than_max_expected_variance_error(self, left_right_ev): + left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=19, @@ -1401,7 +1439,7 @@ def test_coverage_larger_than_max_expected_variance_error(self): ), ) ) - self.ev_LR.flanking_pairs.add( + left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=301, @@ -1417,10 +1455,10 @@ def test_coverage_larger_than_max_expected_variance_error(self): ), ) ) - with self.assertRaises(AssertionError): - call._call_by_flanking_pairs(self.ev_LR, SVTYPE.DEL) + with pytest.raises(AssertionError): + call._call_by_flanking_pairs(left_right_ev, SVTYPE.DEL) 
- def test_close_to_zero(self): + def test_close_to_zero(self, left_right_ev): # this test is for ensuring that if a theoretical window calculated for the # first breakpoint overlaps the actual coverage for the second breakpoint (or the reverse) # that we adjust the theoretical window accordingly @@ -1472,12 +1510,12 @@ def test_close_to_zero(self): ) break1, break2 = call._call_by_flanking_pairs(ev, SVTYPE.INV) - self.assertEqual(1, break1.start) - self.assertEqual(20, break1.end) - self.assertEqual(65, break2.start) - self.assertEqual(150, break2.end) + assert break1.start == 1 + assert break1.end == 20 + assert break2.start == 65 + assert break2.end == 150 - def test_call_with_overlapping_coverage_intervals(self): + def test_call_with_overlapping_coverage_intervals(self, left_right_ev): evidence = GenomeEvidence( Breakpoint('1', 76185710, 76186159, orient=ORIENT.RIGHT), Breakpoint('1', 76186430, 76186879, orient=ORIENT.LEFT), @@ -1505,11 +1543,11 @@ def test_call_with_overlapping_coverage_intervals(self): ), ) ) - with self.assertRaises(AssertionError): - bpp = call._call_by_flanking_pairs(evidence, SVTYPE.DUP) + with pytest.raises(AssertionError): + call._call_by_flanking_pairs(evidence, SVTYPE.DUP) -class TestCallByFlankingReadsTranscriptome(unittest.TestCase): +class TestCallByFlankingReadsTranscriptome: def build_transcriptome_evidence(self, b1, b2, opposing_strands=False): return TranscriptomeEvidence( {}, # fake the annotations @@ -1531,17 +1569,20 @@ def build_transcriptome_evidence(self, b1, b2, opposing_strands=False): }, ) + @todo def test_call_translocation(self): # transcriptome test will use exonic coordinates for the associated transcripts - raise unittest.SkipTest('TODO') + pass + @todo def test_call_inversion(self): # transcriptome test will use exonic coordinates for the associated transcripts - raise unittest.SkipTest('TODO') + pass + @todo def test_call_inversion_overlapping_breakpoint_calls(self): # transcriptome test will use exonic 
coordinates for the associated transcripts - raise unittest.SkipTest('TODO') + pass def test_call_deletion(self): # transcriptome test will use exonic coordinates for the associated transcripts @@ -1562,32 +1603,32 @@ def test_call_deletion(self): ) print(read_pair_type(pair[0])) # following help in debugging the mockup - self.assertFalse(pair[0].is_reverse) - self.assertFalse(pair[0].is_read1) - self.assertTrue(pair[0].is_read2) - self.assertTrue(pair[1].is_reverse) - self.assertTrue(pair[1].is_read1) - self.assertFalse(pair[1].is_read2) - self.assertEqual(STRAND.POS, sequenced_strand(pair[0], 2)) - self.assertEqual(STRAND.POS, evidence.decide_sequenced_strand([pair[0]])) - self.assertEqual(STRAND.POS, sequenced_strand(pair[1], 2)) - self.assertEqual(STRAND.POS, evidence.decide_sequenced_strand([pair[1]])) + assert not pair[0].is_reverse + assert not pair[0].is_read1 + assert pair[0].is_read2 + assert pair[1].is_reverse + assert pair[1].is_read1 + assert not pair[1].is_read2 + assert sequenced_strand(pair[0], 2) == STRAND.POS + assert evidence.decide_sequenced_strand([pair[0]]) == STRAND.POS + assert sequenced_strand(pair[1], 2) == STRAND.POS + assert evidence.decide_sequenced_strand([pair[1]]) == STRAND.POS print(evidence.max_expected_fragment_size, evidence.read_length) evidence.flanking_pairs.add(pair) breakpoint1, breakpoint2 = call._call_by_flanking_pairs(evidence, SVTYPE.DEL) print(breakpoint1, breakpoint2) - self.assertEqual(Breakpoint('1', 1051, 1351, 'L', '+'), breakpoint1) - self.assertEqual(Breakpoint('1', 2000, 2300, 'R', '+'), breakpoint2) + assert breakpoint1 == Breakpoint('1', 1051, 1351, 'L', '+') + assert breakpoint2 == Breakpoint('1', 2000, 2300, 'R', '+') # now add the transcript and call again evidence.overlapping_transcripts.add(pre_transcript) breakpoint1, breakpoint2 = call._call_by_flanking_pairs(evidence, SVTYPE.DEL) print(breakpoint1, breakpoint2) - self.assertEqual(Breakpoint('1', 1051, 2051, 'L', '+'), breakpoint1) - 
self.assertEqual(Breakpoint('1', 1600, 2300, 'R', '+'), breakpoint2) + assert breakpoint1 == Breakpoint('1', 1051, 2051, 'L', '+') + assert breakpoint2 == Breakpoint('1', 1600, 2300, 'R', '+') -class TestCallBySpanningReads(unittest.TestCase): +class TestCallBySpanningReads: def test_deletion(self): # ATCGATCTAGATCTAGGATAGTTCTAGCAGTCATAGCTAT ev = GenomeEvidence( @@ -1623,8 +1664,8 @@ def test_deletion(self): ] ev.spanning_reads = set(spanning_reads) calls = call._call_by_spanning_reads(ev, set()) - self.assertEqual(1, len(calls)) - self.assertEqual(2, len(calls[0].support())) + assert len(calls) == 1 + assert len(calls[0].support()) == 2 def test_insertion(self): pass @@ -1639,7 +1680,7 @@ def test_duplication(self): pass -class TestCharacterizeRepeatRegion(unittest.TestCase): +class TestCharacterizeRepeatRegion: def test_bad_deletion_call(self): reference_genome = { '19': mock.Mock( @@ -1662,7 +1703,7 @@ def test_bad_deletion_call(self): untemplated_seq='', event_type=SVTYPE.DEL, ) - self.assertEqual((0, ''), call.EventCall.characterize_repeat_region(bpp, reference_genome)) + assert call.EventCall.characterize_repeat_region(bpp, reference_genome) == (0, '') def test_homopolymer_insertion(self): bpp = BreakpointPair( @@ -1679,7 +1720,7 @@ def test_homopolymer_insertion(self): 'upto and including the first breakpoint', reference_genome['1'].seq[bpp.break1.start - 10 : bpp.break1.start], ) - self.assertEqual((4, 'T'), call.EventCall.characterize_repeat_region(bpp, reference_genome)) + assert call.EventCall.characterize_repeat_region(bpp, reference_genome) == (4, 'T') def test_homopolymer_deletion(self): bpp = BreakpointPair( @@ -1696,7 +1737,7 @@ def test_homopolymer_deletion(self): 'upto and including the first breakpoint', reference_genome['1'].seq[bpp.break1.start - 10 : bpp.break1.start], ) - self.assertEqual((4, 'T'), call.EventCall.characterize_repeat_region(bpp, reference_genome)) + assert call.EventCall.characterize_repeat_region(bpp, reference_genome) == (4, 
'T') def test_homopolymer_duplication(self): bpp = BreakpointPair( @@ -1713,7 +1754,7 @@ def test_homopolymer_duplication(self): 'upto and including the first breakpoint', reference_genome['1'].seq[bpp.break1.start - 10 : bpp.break1.start], ) - self.assertEqual((4, 'T'), call.EventCall.characterize_repeat_region(bpp, reference_genome)) + assert call.EventCall.characterize_repeat_region(bpp, reference_genome) == (4, 'T') def test_repeat_duplication(self): bpp = BreakpointPair( @@ -1732,9 +1773,7 @@ def test_repeat_duplication(self): 'upto and including the first breakpoint', reference_genome['1'].seq[bpp.break1.start - 10 : bpp.break1.start], ) - self.assertEqual( - (2, 'TAG'), call.EventCall.characterize_repeat_region(bpp, reference_genome) - ) + assert call.EventCall.characterize_repeat_region(bpp, reference_genome) == (2, 'TAG') def test_repeat_insertion(self): bpp = BreakpointPair( @@ -1753,9 +1792,7 @@ def test_repeat_insertion(self): 'upto and including the first breakpoint', reference_genome['1'].seq[bpp.break1.start - 10 : bpp.break1.start], ) - self.assertEqual( - (3, 'TAG'), call.EventCall.characterize_repeat_region(bpp, reference_genome) - ) + assert call.EventCall.characterize_repeat_region(bpp, reference_genome) == (3, 'TAG') def test_repeat_deletion(self): bpp = BreakpointPair( @@ -1774,9 +1811,7 @@ def test_repeat_deletion(self): 'upto and including the second breakpoint', reference_genome['1'].seq[bpp.break2.start - 10 : bpp.break2.start], ) - self.assertEqual( - (3, 'TAG'), call.EventCall.characterize_repeat_region(bpp, reference_genome) - ) + assert call.EventCall.characterize_repeat_region(bpp, reference_genome) == (3, 'TAG') def test_norepeat_insertion(self): bpp = BreakpointPair( @@ -1795,9 +1830,7 @@ def test_norepeat_insertion(self): 'upto and including the first breakpoint', reference_genome['1'].seq[bpp.break1.start - 10 : bpp.break1.start], ) - self.assertEqual( - (0, 'TTG'), call.EventCall.characterize_repeat_region(bpp, reference_genome) 
- ) + assert call.EventCall.characterize_repeat_region(bpp, reference_genome) == (0, 'TTG') def test_invalid_event_type(self): bpp = BreakpointPair( @@ -1806,9 +1839,5 @@ def test_invalid_event_type(self): untemplated_seq='TTG', event_type=SVTYPE.INV, ) - with self.assertRaises(ValueError): + with pytest.raises(ValueError): call.EventCall.characterize_repeat_region(bpp, None) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/integration/test_validate_evidence.py b/tests/integration/test_validate_evidence.py index d4b747af..9a53cf82 100644 --- a/tests/integration/test_validate_evidence.py +++ b/tests/integration/test_validate_evidence.py @@ -1,12 +1,13 @@ -import unittest +import argparse from functools import partial +import pytest from mavis.annotate.genomic import Gene, PreTranscript, Transcript from mavis.bam import cigar as _cigar from mavis.bam.cache import BamCache from mavis.bam.read import SamRead from mavis.breakpoint import Breakpoint, BreakpointPair -from mavis.constants import CIGAR, ORIENT, STRAND +from mavis.constants import ORIENT, STRAND from mavis.interval import Interval from mavis.schemas import DEFAULTS from mavis.validate.base import Evidence @@ -17,69 +18,73 @@ REFERENCE_GENOME = None -class TestDistance(unittest.TestCase): - def setUp(self): - self.transcript = PreTranscript( - [(1001, 1100), (1501, 1600), (2001, 2100), (2201, 2300)], strand='+' - ) - for patt in self.transcript.generate_splicing_patterns(): - self.transcript.transcripts.append(Transcript(self.transcript, patt)) - self.trans_evidence = MockObject( - annotations={}, - read_length=100, - max_expected_fragment_size=550, - call_error=11, - overlapping_transcripts={self.transcript}, - ) - setattr( - self.trans_evidence, - '_select_transcripts', - lambda *pos: self.trans_evidence.overlapping_transcripts, - ) - setattr( - self.trans_evidence, - 'distance', - partial(TranscriptomeEvidence.distance, self.trans_evidence), - ) - - def test_exonic(self): - 
self.assertEqual(Interval(149), self.trans_evidence.distance(1001, 1550)) - - def test_intergenic_exonic(self): - dist = self.trans_evidence.distance(101, 1550) - self.assertEqual(Interval(1049, 1049), dist) - - def test_intergenic_intergenic(self): - dist = self.trans_evidence.distance(101, 300) - self.assertEqual(Interval(199), dist) - - def test_aligned_intronic(self): - dist = self.trans_evidence.distance(1102, 1499) - self.assertEqual(Interval(5), dist) - - def test_indel_at_exon_boundary(self): - self.assertEqual(Interval(2), self.trans_evidence.distance(1101, 1501)) - - def test_no_annotations(self): - dist = self.trans_evidence.distance(101, 300, []) - self.assertEqual(Interval(199), dist) - - def test_intergenic_intronic(self): - dist = self.trans_evidence.distance(101, 1400) - self.assertEqual(Interval(1101), dist) - - def test_empty_intron(self): +@pytest.fixture +def distance_setup(): + n = argparse.Namespace() + n.transcript = PreTranscript( + [(1001, 1100), (1501, 1600), (2001, 2100), (2201, 2300)], strand='+' + ) + for patt in n.transcript.generate_splicing_patterns(): + n.transcript.transcripts.append(Transcript(n.transcript, patt)) + n.trans_evidence = MockObject( + annotations={}, + read_length=100, + max_expected_fragment_size=550, + call_error=11, + overlapping_transcripts={n.transcript}, + ) + setattr( + n.trans_evidence, + '_select_transcripts', + lambda *pos: n.trans_evidence.overlapping_transcripts, + ) + setattr( + n.trans_evidence, + 'distance', + partial(TranscriptomeEvidence.distance, n.trans_evidence), + ) + return n + + +class TestDistance: + def test_exonic(self, distance_setup): + assert distance_setup.trans_evidence.distance(1001, 1550) == Interval(149) + + def test_intergenic_exonic(self, distance_setup): + dist = distance_setup.trans_evidence.distance(101, 1550) + assert dist == Interval(1049, 1049) + + def test_intergenic_intergenic(self, distance_setup): + dist = distance_setup.trans_evidence.distance(101, 300) + assert dist == 
Interval(199) + + def test_aligned_intronic(self, distance_setup): + dist = distance_setup.trans_evidence.distance(1102, 1499) + assert dist == Interval(5) + + def test_indel_at_exon_boundary(self, distance_setup): + assert distance_setup.trans_evidence.distance(1101, 1501) == Interval(2) + + def test_no_annotations(self, distance_setup): + dist = distance_setup.trans_evidence.distance(101, 300, []) + assert dist == Interval(199) + + def test_intergenic_intronic(self, distance_setup): + dist = distance_setup.trans_evidence.distance(101, 1400) + assert dist == Interval(1101) + + def test_empty_intron(self, distance_setup): t2 = PreTranscript([(1001, 1100), (1501, 1600), (2001, 2200), (2201, 2300)], strand='+') for patt in t2.generate_splicing_patterns(): t2.transcripts.append(Transcript(t2, patt)) print(t2) - print(self.trans_evidence.overlapping_transcripts) - self.trans_evidence.overlapping_transcripts.add(t2) - dist = self.trans_evidence.distance(1001, 2301) - self.assertEqual(Interval(400, 400), dist) + print(distance_setup.trans_evidence.overlapping_transcripts) + distance_setup.trans_evidence.overlapping_transcripts.add(t2) + dist = distance_setup.trans_evidence.distance(1001, 2301) + assert dist == Interval(400, 400) -class TestTransStandardize(unittest.TestCase): +class TestTransStandardize: def test_shift_overaligned(self): # qwertyuiopas---kkkkk------dfghjklzxcvbnm # .......... ................ 
@@ -106,7 +111,7 @@ def test_shift_overaligned(self): ) evidence.overlapping_transcripts.add(transcript) new_read = evidence.standardize_read(read) - self.assertEqual(_cigar.convert_string_to_cigar('12=7N14='), new_read.cigar) + assert new_read.cigar == _cigar.convert_string_to_cigar('12=7N14=') def test_shift_overaligned_left(self): # qwertyuiopasdf---kkkkkdf------ghjklzxcvbnm @@ -134,7 +139,7 @@ def test_shift_overaligned_left(self): ) evidence.overlapping_transcripts.add(transcript) new_read = evidence.standardize_read(read) - self.assertEqual(_cigar.convert_string_to_cigar('14=7N12='), new_read.cigar) + assert new_read.cigar == _cigar.convert_string_to_cigar('14=7N12=') def test_shift_no_transcripts(self): read = SamRead( @@ -154,296 +159,317 @@ def test_shift_no_transcripts(self): median_fragment_size=220, ) new_cigar = evidence.exon_boundary_shift_cigar(read) - self.assertEqual(_cigar.convert_string_to_cigar('14=7D18='), new_cigar) - - -class TestComputeFragmentSizes(unittest.TestCase): - def setUp(self): - b1 = Breakpoint('1', 1051, 1051, 'L') - b2 = Breakpoint('1', 1551, 1551, 'R') - self.read_length = 50 - self.trans_ev = TranscriptomeEvidence( - {}, # fake the annotations - b1, - b2, - None, - None, # bam_cache and reference_genome - opposing_strands=False, - read_length=self.read_length, - stdev_fragment_size=100, - median_fragment_size=100, - config={'validate.stdev_count_abnormal': 1}, - ) - self.genomic_ev = GenomeEvidence( - b1, - b2, - None, - None, # bam_cache and reference_genome - opposing_strands=False, - read_length=self.read_length, - stdev_fragment_size=100, - median_fragment_size=100, - config={'validate.stdev_count_abnormal': 1}, - ) - - def test_genomic_vs_trans_no_annotations(self): + assert new_cigar == _cigar.convert_string_to_cigar('14=7D18=') + + +@pytest.fixture +def read_length(): + return 50 + + +@pytest.fixture +def trans_evidence(read_length): + return TranscriptomeEvidence( + {}, # fake the annotations + Breakpoint('1', 1051, 
1051, 'L'), + Breakpoint('1', 1551, 1551, 'R'), + None, + None, # bam_cache and reference_genome + opposing_strands=False, + read_length=read_length, + stdev_fragment_size=100, + median_fragment_size=100, + config={'validate.stdev_count_abnormal': 1}, + ) + + +@pytest.fixture +def genomic_evidence(read_length): + return GenomeEvidence( + Breakpoint('1', 1051, 1051, 'L'), + Breakpoint('1', 1551, 1551, 'R'), + None, + None, # bam_cache and reference_genome + opposing_strands=False, + read_length=read_length, + stdev_fragment_size=100, + median_fragment_size=100, + config={'validate.stdev_count_abnormal': 1}, + ) + + +class TestComputeFragmentSizes: + def test_genomic_vs_trans_no_annotations(self, genomic_evidence, read_length, trans_evidence): # should be identical read, mate = mock_read_pair( - MockRead('name', '1', 1051 - self.read_length + 1, 1051, is_reverse=False), - MockRead('name', '1', 2300, 2300 + self.read_length - 1, is_reverse=True), - ) - self.assertEqual( - self.trans_ev.compute_fragment_size(read, mate), - self.genomic_ev.compute_fragment_size(read, mate), + MockRead('name', '1', 1051 - read_length + 1, 1051, is_reverse=False), + MockRead('name', '1', 2300, 2300 + read_length - 1, is_reverse=True), ) + assert genomic_evidence.compute_fragment_size( + read, mate + ) == trans_evidence.compute_fragment_size(read, mate) - def test_reverse_reads(self): + def test_reverse_reads(self, genomic_evidence, trans_evidence): read, mate = mock_read_pair( MockRead('name', '1', 1001, 1100, is_reverse=False), MockRead('name', '1', 2201, 2301, is_reverse=True), ) - self.assertEqual(Interval(1300), self.genomic_ev.compute_fragment_size(read, mate)) - self.assertEqual(Interval(1300), self.genomic_ev.compute_fragment_size(mate, read)) - self.assertEqual(Interval(1300), self.trans_ev.compute_fragment_size(read, mate)) - self.assertEqual(Interval(1300), self.trans_ev.compute_fragment_size(mate, read)) - - -class TestTraverse(unittest.TestCase): - def setUp(self): - 
self.transcript = PreTranscript( - [(1001, 1100), (1301, 1400), (1701, 1800)], strand=STRAND.POS - ) - for patt in self.transcript.generate_splicing_patterns(): - self.transcript.transcripts.append(Transcript(self.transcript, patt)) - - self.trans_evidence = MockObject( - annotations={}, - read_length=100, - max_expected_fragment_size=550, - call_error=11, - overlapping_transcripts={self.transcript}, - ) - setattr( - self.trans_evidence, - '_select_transcripts', - lambda *pos: self.trans_evidence.overlapping_transcripts, - ) - setattr( - self.trans_evidence, - 'traverse', - partial(TranscriptomeEvidence.traverse, self.trans_evidence), - ) - - def test_left_before_transcript(self): + assert genomic_evidence.compute_fragment_size(read, mate) == Interval(1300) + assert genomic_evidence.compute_fragment_size(mate, read) == Interval(1300) + assert trans_evidence.compute_fragment_size(read, mate) == Interval(1300) + assert trans_evidence.compute_fragment_size(mate, read) == Interval(1300) + + +@pytest.fixture +def traverse_setup(): + n = argparse.Namespace() + n.transcript = PreTranscript([(1001, 1100), (1301, 1400), (1701, 1800)], strand=STRAND.POS) + for patt in n.transcript.generate_splicing_patterns(): + n.transcript.transcripts.append(Transcript(n.transcript, patt)) + + n.trans_evidence = MockObject( + annotations={}, + read_length=100, + max_expected_fragment_size=550, + call_error=11, + overlapping_transcripts={n.transcript}, + ) + setattr( + n.trans_evidence, + '_select_transcripts', + lambda *pos: n.trans_evidence.overlapping_transcripts, + ) + setattr( + n.trans_evidence, + 'traverse', + partial(TranscriptomeEvidence.traverse, n.trans_evidence), + ) + return n + + +class TestTraverse: + def test_left_before_transcript(self, traverse_setup): exp_pos = Evidence.traverse(900, 500 - 1, ORIENT.LEFT) - self.assertEqual(exp_pos, self.trans_evidence.traverse(900, 500 - 1, ORIENT.LEFT)) + assert traverse_setup.trans_evidence.traverse(900, 500 - 1, ORIENT.LEFT) == 
exp_pos - def test_left_after_transcript(self): + def test_left_after_transcript(self, traverse_setup): exp_pos = Evidence.traverse(2200, 100, ORIENT.LEFT) - self.assertEqual(exp_pos, self.trans_evidence.traverse(2200, 100, ORIENT.LEFT)) - - def test_left_at_end(self): - gpos = self.trans_evidence.traverse(1900, 500, ORIENT.LEFT) - self.assertEqual(Interval(900), gpos) - - def test_left_within_transcript_exonic(self): - gpos = self.trans_evidence.traverse(1750, 200 - 1, ORIENT.LEFT) - self.assertEqual(Interval(1051), gpos) - - def test_left_within_exon(self): - gpos = self.trans_evidence.traverse(1750, 20 - 1, ORIENT.LEFT) - self.assertEqual(1731, gpos.start) - self.assertEqual(1731, gpos.end) - - def test_left_within_transcript_intronic(self): - gpos = self.trans_evidence.traverse(1600, 150 - 1, ORIENT.LEFT) - self.assertEqual(Interval(1451), gpos) - - def test_right_before_transcript(self): - gpos = self.trans_evidence.traverse(500, 100 - 1, ORIENT.RIGHT) - self.assertEqual(Interval(599), gpos) - - def test_right_before_transcript2(self): - gpos = self.trans_evidence.traverse(901, 500 - 1, ORIENT.RIGHT) - self.assertEqual(Interval(1900), gpos) - - def test_right_after_transcript(self): - gpos = self.trans_evidence.traverse(2201, 100 - 1, ORIENT.RIGHT) - self.assertEqual(Interval(2300), gpos) - - def test_right_within_transcript(self): - gpos = self.trans_evidence.traverse(1351, 100 - 1, ORIENT.RIGHT) - self.assertEqual(Interval(1750), gpos) - - def test_right_within_exon(self): - gpos = self.trans_evidence.traverse(1351, 10 - 1, ORIENT.RIGHT) - self.assertEqual(Interval(1360), gpos) - - -class TestTraverseTransRev(unittest.TestCase): - def setUp(self): - self.transcript = PreTranscript( - [(1001, 1100), (1301, 1400), (1701, 1800)], strand=STRAND.NEG - ) - for patt in self.transcript.generate_splicing_patterns(): - self.transcript.transcripts.append(Transcript(self.transcript, patt)) - - self.trans_evidence = MockObject( - annotations={}, - read_length=100, - 
max_expected_fragment_size=550, - call_error=11, - overlapping_transcripts={self.transcript}, - ) - setattr( - self.trans_evidence, - '_select_transcripts', - lambda *pos: self.trans_evidence.overlapping_transcripts, - ) - setattr( - self.trans_evidence, - 'traverse', - partial(TranscriptomeEvidence.traverse, self.trans_evidence), - ) - - def test_left_before_transcript(self): - gpos = self.trans_evidence.traverse(900, 500 - 1, ORIENT.LEFT) - self.assertEqual(Interval(401), gpos) - self.assertEqual(gpos, GenomeEvidence.traverse(900, 500 - 1, ORIENT.LEFT)) - - def test_left_after_transcript(self): - gpos = self.trans_evidence.traverse(2200, 100, ORIENT.LEFT) - self.assertEqual(gpos, GenomeEvidence.traverse(2200, 100, ORIENT.LEFT)) - self.assertEqual(Interval(2100), gpos) - - def test_left_after_transcript2(self): - gpos = self.trans_evidence.traverse(1900, 500 - 1, ORIENT.LEFT) - self.assertEqual(Interval(901), gpos) - - def test_left_within_transcript_exonic(self): - gpos = self.trans_evidence.traverse(1750, 200 - 1, ORIENT.LEFT) - self.assertEqual(Interval(1051), gpos) - - def test_left_within_exon(self): - gpos = self.trans_evidence.traverse(1750, 20 - 1, ORIENT.LEFT) - self.assertEqual(1731, gpos.start) - self.assertEqual(1731, gpos.end) - - def test_left_within_transcript_intronic(self): - gpos = self.trans_evidence.traverse(1600, 150 - 1, ORIENT.LEFT) - self.assertEqual(Interval(1451), gpos) - - def test_right_before_transcript(self): - gpos = self.trans_evidence.traverse(500, 100 - 1, ORIENT.RIGHT) - self.assertEqual(Interval(599), gpos) - - def test_right_before_transcript2(self): - gpos = self.trans_evidence.traverse(901, 500 - 1, ORIENT.RIGHT) - self.assertEqual(Interval(1900), gpos) - - def test_right_after_transcript(self): - gpos = self.trans_evidence.traverse(2201, 100 - 1, ORIENT.RIGHT) - self.assertEqual(Interval(2300), gpos) - - def test_right_within_transcript(self): - gpos = self.trans_evidence.traverse(1351, 100 - 1, ORIENT.RIGHT) - 
self.assertEqual(Interval(1750), gpos) - - def test_right_within_exon(self): - gpos = self.trans_evidence.traverse(1351, 10 - 1, ORIENT.RIGHT) - self.assertEqual(Interval(1360), gpos) - - -class TestTranscriptomeEvidenceWindow(unittest.TestCase): - def setUp(self): - gene = Gene('1', 1, 9999, name='KRAS', strand=STRAND.POS) - self.pre_transcript = PreTranscript( - gene=gene, exons=[(1001, 1100), (1401, 1500), (1701, 1750), (3001, 4000)] - ) - gene.unspliced_transcripts.append(self.pre_transcript) - for spl in self.pre_transcript.generate_splicing_patterns(): - self.pre_transcript.transcripts.append(Transcript(self.pre_transcript, spl)) - self.annotations = {gene.chr: [gene]} - self.genome_evidence = MockObject( - annotations={}, - read_length=100, - max_expected_fragment_size=550, - config={**DEFAULTS, 'validate.call_error': 11}, - ) - self.trans_evidence = MockObject( - annotations={}, - read_length=100, - max_expected_fragment_size=550, - overlapping_transcripts={self.pre_transcript}, - config={**DEFAULTS, 'validate.call_error': 11}, - ) - setattr( - self.trans_evidence, - '_select_transcripts', - lambda *pos: self.trans_evidence.overlapping_transcripts, - ) - setattr( - self.trans_evidence, - 'traverse', - partial(TranscriptomeEvidence.traverse, self.trans_evidence), - ) - - def transcriptome_window(self, breakpoint, transcripts=None): - if transcripts: - self.trans_evidence.overlapping_transcripts.update(transcripts) - return TranscriptomeEvidence.generate_window(self.trans_evidence, breakpoint) - - def genome_window(self, breakpoint): - return GenomeEvidence.generate_window(self.genome_evidence, breakpoint) - - def test_before_start(self): + assert traverse_setup.trans_evidence.traverse(2200, 100, ORIENT.LEFT) == exp_pos + + def test_left_at_end(self, traverse_setup): + gpos = traverse_setup.trans_evidence.traverse(1900, 500, ORIENT.LEFT) + assert gpos == Interval(900) + + def test_left_within_transcript_exonic(self, traverse_setup): + gpos = 
traverse_setup.trans_evidence.traverse(1750, 200 - 1, ORIENT.LEFT) + assert gpos == Interval(1051) + + def test_left_within_exon(self, traverse_setup): + gpos = traverse_setup.trans_evidence.traverse(1750, 20 - 1, ORIENT.LEFT) + assert gpos.start == 1731 + assert gpos.end == 1731 + + def test_left_within_transcript_intronic(self, traverse_setup): + gpos = traverse_setup.trans_evidence.traverse(1600, 150 - 1, ORIENT.LEFT) + assert gpos == Interval(1451) + + def test_right_before_transcript(self, traverse_setup): + gpos = traverse_setup.trans_evidence.traverse(500, 100 - 1, ORIENT.RIGHT) + assert gpos == Interval(599) + + def test_right_before_transcript2(self, traverse_setup): + gpos = traverse_setup.trans_evidence.traverse(901, 500 - 1, ORIENT.RIGHT) + assert gpos == Interval(1900) + + def test_right_after_transcript(self, traverse_setup): + gpos = traverse_setup.trans_evidence.traverse(2201, 100 - 1, ORIENT.RIGHT) + assert gpos == Interval(2300) + + def test_right_within_transcript(self, traverse_setup): + gpos = traverse_setup.trans_evidence.traverse(1351, 100 - 1, ORIENT.RIGHT) + assert gpos == Interval(1750) + + def test_right_within_exon(self, traverse_setup): + gpos = traverse_setup.trans_evidence.traverse(1351, 10 - 1, ORIENT.RIGHT) + assert gpos == Interval(1360) + + +@pytest.fixture +def tranverse_trans_rev_setup(): + n = argparse.Namespace() + n.transcript = PreTranscript([(1001, 1100), (1301, 1400), (1701, 1800)], strand=STRAND.NEG) + for patt in n.transcript.generate_splicing_patterns(): + n.transcript.transcripts.append(Transcript(n.transcript, patt)) + + n.trans_evidence = MockObject( + annotations={}, + read_length=100, + max_expected_fragment_size=550, + call_error=11, + overlapping_transcripts={n.transcript}, + ) + setattr( + n.trans_evidence, + '_select_transcripts', + lambda *pos: n.trans_evidence.overlapping_transcripts, + ) + setattr( + n.trans_evidence, + 'traverse', + partial(TranscriptomeEvidence.traverse, n.trans_evidence), + ) + return n + 
+ +class TestTraverseTransRev: + def test_left_before_transcript(self, tranverse_trans_rev_setup): + gpos = tranverse_trans_rev_setup.trans_evidence.traverse(900, 500 - 1, ORIENT.LEFT) + assert gpos == Interval(401) + assert GenomeEvidence.traverse(900, 500 - 1, ORIENT.LEFT) == gpos + + def test_left_after_transcript(self, tranverse_trans_rev_setup): + gpos = tranverse_trans_rev_setup.trans_evidence.traverse(2200, 100, ORIENT.LEFT) + assert GenomeEvidence.traverse(2200, 100, ORIENT.LEFT) == gpos + assert gpos == Interval(2100) + + def test_left_after_transcript2(self, tranverse_trans_rev_setup): + gpos = tranverse_trans_rev_setup.trans_evidence.traverse(1900, 500 - 1, ORIENT.LEFT) + assert gpos == Interval(901) + + def test_left_within_transcript_exonic(self, tranverse_trans_rev_setup): + gpos = tranverse_trans_rev_setup.trans_evidence.traverse(1750, 200 - 1, ORIENT.LEFT) + assert gpos == Interval(1051) + + def test_left_within_exon(self, tranverse_trans_rev_setup): + gpos = tranverse_trans_rev_setup.trans_evidence.traverse(1750, 20 - 1, ORIENT.LEFT) + assert gpos.start == 1731 + assert gpos.end == 1731 + + def test_left_within_transcript_intronic(self, tranverse_trans_rev_setup): + gpos = tranverse_trans_rev_setup.trans_evidence.traverse(1600, 150 - 1, ORIENT.LEFT) + assert gpos == Interval(1451) + + def test_right_before_transcript(self, tranverse_trans_rev_setup): + gpos = tranverse_trans_rev_setup.trans_evidence.traverse(500, 100 - 1, ORIENT.RIGHT) + assert gpos == Interval(599) + + def test_right_before_transcript2(self, tranverse_trans_rev_setup): + gpos = tranverse_trans_rev_setup.trans_evidence.traverse(901, 500 - 1, ORIENT.RIGHT) + assert gpos == Interval(1900) + + def test_right_after_transcript(self, tranverse_trans_rev_setup): + gpos = tranverse_trans_rev_setup.trans_evidence.traverse(2201, 100 - 1, ORIENT.RIGHT) + assert gpos == Interval(2300) + + def test_right_within_transcript(self, tranverse_trans_rev_setup): + gpos = 
tranverse_trans_rev_setup.trans_evidence.traverse(1351, 100 - 1, ORIENT.RIGHT) + assert gpos == Interval(1750) + + def test_right_within_exon(self, tranverse_trans_rev_setup): + gpos = tranverse_trans_rev_setup.trans_evidence.traverse(1351, 10 - 1, ORIENT.RIGHT) + assert gpos == Interval(1360) + + +@pytest.fixture +def trans_window_setup(): + n = argparse.Namespace() + gene = Gene('1', 1, 9999, name='KRAS', strand=STRAND.POS) + n.pre_transcript = PreTranscript( + gene=gene, exons=[(1001, 1100), (1401, 1500), (1701, 1750), (3001, 4000)] + ) + gene.unspliced_transcripts.append(n.pre_transcript) + for spl in n.pre_transcript.generate_splicing_patterns(): + n.pre_transcript.transcripts.append(Transcript(n.pre_transcript, spl)) + n.annotations = {gene.chr: [gene]} + n.genome_evidence = MockObject( + annotations={}, + read_length=100, + max_expected_fragment_size=550, + config={**DEFAULTS, 'validate.call_error': 11}, + ) + n.trans_evidence = MockObject( + annotations={}, + read_length=100, + max_expected_fragment_size=550, + overlapping_transcripts={n.pre_transcript}, + config={**DEFAULTS, 'validate.call_error': 11}, + ) + setattr( + n.trans_evidence, + '_select_transcripts', + lambda *pos: n.trans_evidence.overlapping_transcripts, + ) + setattr( + n.trans_evidence, + 'traverse', + partial(TranscriptomeEvidence.traverse, n.trans_evidence), + ) + return n + + +def transcriptome_window(ev, breakpoint, transcripts=None): + if transcripts: + ev.overlapping_transcripts.update(transcripts) + return TranscriptomeEvidence.generate_window(ev, breakpoint) + + +class TestTranscriptomeEvidenceWindow: + def test_before_start(self, trans_window_setup): b = Breakpoint(chr='1', start=100, orient=ORIENT.RIGHT) - self.assertEqual(self.genome_window(b), self.transcriptome_window(b)) + assert transcriptome_window( + trans_window_setup.trans_evidence, b + ) == GenomeEvidence.generate_window(trans_window_setup.genome_evidence, b) b = Breakpoint(chr='1', start=500, orient=ORIENT.RIGHT) - 
self.assertEqual(self.genome_window(b), self.transcriptome_window(b)) + assert transcriptome_window( + trans_window_setup.trans_evidence, b + ) == GenomeEvidence.generate_window(trans_window_setup.genome_evidence, b) - def test_after_end(self): + def test_after_end(self, trans_window_setup): b = Breakpoint(chr='1', start=6000, orient=ORIENT.RIGHT) - self.assertEqual(self.genome_window(b), self.transcriptome_window(b)) + assert transcriptome_window( + trans_window_setup.trans_evidence, b + ) == GenomeEvidence.generate_window(trans_window_setup.genome_evidence, b) - def test_exonic_long_exon(self): + def test_exonic_long_exon(self, trans_window_setup): b = Breakpoint(chr='1', start=3200, orient=ORIENT.RIGHT) - self.assertEqual(self.genome_window(b), self.transcriptome_window(b)) + assert transcriptome_window( + trans_window_setup.trans_evidence, b + ) == GenomeEvidence.generate_window(trans_window_setup.genome_evidence, b) - def test_intronic_long_exon(self): + def test_intronic_long_exon(self, trans_window_setup): b = Breakpoint(chr='1', start=2970, orient=ORIENT.RIGHT) - self.assertEqual(self.genome_window(b), self.transcriptome_window(b)) + assert transcriptome_window( + trans_window_setup.trans_evidence, b + ) == GenomeEvidence.generate_window(trans_window_setup.genome_evidence, b) - def test_intronic_long_intron(self): + def test_intronic_long_intron(self, trans_window_setup): b = Breakpoint(chr='1', start=1800, orient=ORIENT.RIGHT) - print(self.genome_window(b)) - self.assertEqual(Interval(1490, 2360), self.transcriptome_window(b)) + assert transcriptome_window(trans_window_setup.trans_evidence, b) == Interval(1490, 2360) - def test_intronic_short_exon_right(self): + def test_intronic_short_exon_right(self, trans_window_setup): b = Breakpoint(chr='1', start=1690, orient=ORIENT.RIGHT) - print(self.genome_window(b)) - self.assertEqual(Interval(1580, 3500), self.transcriptome_window(b)) + assert transcriptome_window(trans_window_setup.trans_evidence, b) == 
Interval(1580, 3500) - def test_intronic_short_exon_left(self): + def test_intronic_short_exon_left(self, trans_window_setup): b = Breakpoint(chr='1', start=2200, orient=ORIENT.LEFT) - self.assertEqual(Interval(1440, 2310), self.transcriptome_window(b)) + assert transcriptome_window(trans_window_setup.trans_evidence, b) == Interval(1440, 2310) - def test_multiple_transcripts(self): + def test_multiple_transcripts(self, trans_window_setup): # [(1001, 1100), (1401, 1500), (1701, 1750), (3001, 4000)]) b = Breakpoint(chr='1', start=1150, orient=ORIENT.RIGHT) - gene = self.annotations['1'][0] + gene = trans_window_setup.annotations['1'][0] t2 = PreTranscript(gene=gene, exons=[(1001, 1100), (1200, 1300), (2100, 2200)]) for patt in t2.generate_splicing_patterns(): t2.transcripts.append(Transcript(t2, patt)) gene.transcripts.append(t2) # 989 - 2561 # 989 - 3411 - self.assertEqual( - Interval(1040, 3160), self.transcriptome_window(b, [self.pre_transcript, t2]) - ) + assert transcriptome_window( + trans_window_setup.trans_evidence, b, [trans_window_setup.pre_transcript, t2] + ) == Interval(1040, 3160) - def test_many_small_exons(self): + def test_many_small_exons(self, trans_window_setup): g = Gene('fake', 17271277, 17279592, strand='+') pre_transcript = PreTranscript( gene=g, @@ -463,48 +489,45 @@ def test_many_small_exons(self): for patt in pre_transcript.generate_splicing_patterns(): pre_transcript.transcripts.append(Transcript(pre_transcript, patt)) b = Breakpoint(chr='fake', start=17279591, orient=ORIENT.LEFT) - self.assertEqual( - Interval(17277321, 17279701), self.transcriptome_window(b, [pre_transcript]) - ) + assert transcriptome_window( + trans_window_setup.trans_evidence, b, [pre_transcript] + ) == Interval(17277321, 17279701) -class TestNetSizeTrans(unittest.TestCase): - def setUp(self): - self.transcript = PreTranscript( - [(1001, 1100), (1301, 1400), (1701, 1800)], strand=STRAND.POS - ) - for patt in self.transcript.generate_splicing_patterns(): - 
self.transcript.transcripts.append(Transcript(self.transcript, patt)) - self.trans_evidence = MockObject( +class TestNetSizeTrans: + def test_net_zero(self): + transcript = PreTranscript([(1001, 1100), (1301, 1400), (1701, 1800)], strand=STRAND.POS) + for patt in transcript.generate_splicing_patterns(): + transcript.transcripts.append(Transcript(transcript, patt)) + trans_evidence = MockObject( annotations={}, read_length=100, max_expected_fragment_size=550, call_error=11, - overlapping_transcripts={self.transcript}, + overlapping_transcripts={transcript}, ) setattr( - self.trans_evidence, + trans_evidence, '_select_transcripts', - lambda *pos: self.trans_evidence.overlapping_transcripts, + lambda *pos: trans_evidence.overlapping_transcripts, ) setattr( - self.trans_evidence, + trans_evidence, 'distance', - partial(TranscriptomeEvidence.distance, self.trans_evidence), + partial(TranscriptomeEvidence.distance, trans_evidence), ) - def test_net_zero(self): bpp = BreakpointPair( Breakpoint('1', 1099, orient=ORIENT.LEFT), Breakpoint('1', 1302, orient=ORIENT.RIGHT), untemplated_seq='TT', ) - dist = partial(TranscriptomeEvidence.distance, self.trans_evidence) - self.assertEqual(Interval(-200), bpp.net_size()) - self.assertEqual(Interval(0), bpp.net_size(dist)) + dist = partial(TranscriptomeEvidence.distance, trans_evidence) + assert bpp.net_size() == Interval(-200) + assert bpp.net_size(dist) == Interval(0) -class TestGenomeEvidenceWindow(unittest.TestCase): +class TestGenomeEvidenceWindow: def test_orient_ns(self): bpp = Breakpoint(chr='1', start=1000, end=1000, orient=ORIENT.NS) window = GenomeEvidence.generate_window( @@ -515,9 +538,9 @@ def test_orient_ns(self): ), bpp, ) - self.assertEqual(440, window.start) - self.assertEqual(1560, window.end) - self.assertEqual(1121, len(window)) + assert window.start == 440 + assert window.end == 1560 + assert len(window) == 1121 def test_orient_left(self): bpp = Breakpoint(chr='1', start=1000, end=1000, orient=ORIENT.LEFT) @@ 
-529,9 +552,9 @@ def test_orient_left(self): ), bpp, ) - self.assertEqual(440, window.start) - self.assertEqual(1110, window.end) - self.assertEqual(671, len(window)) + assert window.start == 440 + assert window.end == 1110 + assert len(window) == 671 def test_orient_right(self): bpp = Breakpoint(chr='1', start=1000, end=1000, orient=ORIENT.RIGHT) @@ -543,9 +566,9 @@ def test_orient_right(self): ), bpp, ) - self.assertEqual(890, window.start) - self.assertEqual(1560, window.end) - self.assertEqual(671, len(window)) + assert window.start == 890 + assert window.end == 1560 + assert len(window) == 671 def test_window_accessors(self): ge = GenomeEvidence( @@ -559,87 +582,89 @@ def test_window_accessors(self): median_fragment_size=100, config={'validate.stdev_count_abnormal': 1, 'validate.call_error': 0}, ) - self.assertEqual(901, ge.outer_window1.start) - self.assertEqual(1649, ge.outer_window1.end) - self.assertEqual(6600, ge.outer_window2.end) - self.assertEqual(5852, ge.outer_window2.start) - - self.assertEqual(1351, ge.inner_window1.start) - self.assertEqual(1649, ge.inner_window1.end) - self.assertEqual(6150, ge.inner_window2.end) - self.assertEqual(5852, ge.inner_window2.start) - - -class TestGenomeEvidenceAddReads(unittest.TestCase): - def setUp(self): - self.ge = GenomeEvidence( - Breakpoint('1', 1500, orient=ORIENT.LEFT), - Breakpoint('1', 6001, orient=ORIENT.RIGHT), - BamCache(MockBamFileHandle({'1': 0})), - None, # reference_genome - opposing_strands=False, - read_length=150, - stdev_fragment_size=500, - median_fragment_size=100, - config={'validate.stdev_count_abnormal': 1, 'validate.call_error': 0}, - ) - # outer windows (901, 1649) (5852, 6600) - # inner windows (1351, 1649) (5852, 6150) - - def test_collect_flanking_pair_error_unmapped_read(self): + assert ge.outer_window1.start == 901 + assert ge.outer_window1.end == 1649 + assert ge.outer_window2.end == 6600 + assert ge.outer_window2.start == 5852 + + assert ge.inner_window1.start == 1351 + assert 
ge.inner_window1.end == 1649 + assert ge.inner_window2.end == 6150 + assert ge.inner_window2.start == 5852 + + +@pytest.fixture +def flanking_ge(read_length): + return GenomeEvidence( + Breakpoint('1', 1500, orient=ORIENT.LEFT), + Breakpoint('1', 6001, orient=ORIENT.RIGHT), + BamCache(MockBamFileHandle({'1': 0})), + None, # reference_genome + opposing_strands=False, + read_length=150, + stdev_fragment_size=500, + median_fragment_size=100, + config={'validate.stdev_count_abnormal': 1, 'validate.call_error': 0}, + ) + # outer windows (901, 1649) (5852, 6600) + # inner windows (1351, 1649) (5852, 6150) + + +class TestGenomeEvidenceAddReads: + def test_collect_flanking_pair_error_unmapped_read(self, flanking_ge): read, mate = mock_read_pair( MockRead('test', 0, 900, 1000, is_reverse=False), MockRead('test', 0, 6000, 6099, is_reverse=True), ) read.is_unmapped = True - with self.assertRaises(ValueError): - self.ge.collect_flanking_pair(read, mate) + with pytest.raises(ValueError): + flanking_ge.collect_flanking_pair(read, mate) - def test_collect_flanking_pair_error_mate_unmapped(self): + def test_collect_flanking_pair_error_mate_unmapped(self, flanking_ge): read, mate = mock_read_pair( MockRead('test', 0, 900, 1000, is_reverse=False), MockRead('test', 0, 6000, 6099, is_reverse=True), ) mate.is_unmapped = True - with self.assertRaises(ValueError): - self.ge.collect_flanking_pair(read, mate) + with pytest.raises(ValueError): + flanking_ge.collect_flanking_pair(read, mate) - def test_collect_flanking_pair_error_query_names_dont_match(self): + def test_collect_flanking_pair_error_query_names_dont_match(self, flanking_ge): read, mate = mock_read_pair( MockRead('test1', 0, 900, 1000, is_reverse=False), MockRead('test', 0, 6000, 6099, is_reverse=True), ) - with self.assertRaises(ValueError): - self.ge.collect_flanking_pair(read, mate) + with pytest.raises(ValueError): + flanking_ge.collect_flanking_pair(read, mate) - def 
test_collect_flanking_pair_error_template_lengths_dont_match(self): + def test_collect_flanking_pair_error_template_lengths_dont_match(self, flanking_ge): read, mate = mock_read_pair( MockRead('test', 0, 900, 1000, is_reverse=False, template_length=50), MockRead('test', 0, 6000, 6099, is_reverse=True), ) mate.template_length = 55 - with self.assertRaises(ValueError): - self.ge.collect_flanking_pair(read, mate) + with pytest.raises(ValueError): + flanking_ge.collect_flanking_pair(read, mate) - def test_collect_flanking_pair_read_low_mq(self): + def test_collect_flanking_pair_read_low_mq(self, flanking_ge): read, mate = mock_read_pair( MockRead('test', 0, 900, 1000, is_reverse=False), MockRead('test', 0, 6000, 6099, is_reverse=True), ) read.mapping_quality = 0 - self.assertFalse(self.ge.collect_flanking_pair(read, mate)) + assert not flanking_ge.collect_flanking_pair(read, mate) - def test_collect_flanking_pair_mate_low_mq(self): + def test_collect_flanking_pair_mate_low_mq(self, flanking_ge): read, mate = mock_read_pair( MockRead('test', 0, 900, 1000, is_reverse=False), MockRead('test', 0, 6000, 6099, is_reverse=True), ) mate.mapping_quality = 0 - self.assertFalse(self.ge.collect_flanking_pair(read, mate)) + assert not flanking_ge.collect_flanking_pair(read, mate) - def test_collect_flanking_pair_interchromosomal(self): + def test_collect_flanking_pair_interchromosomal(self, flanking_ge): read, mate = mock_read_pair( MockRead('test', 1, 900, 1000, is_reverse=False), MockRead('test', 0, 6000, 6099, is_reverse=True), ) - self.assertFalse(self.ge.collect_flanking_pair(read, mate)) + assert not flanking_ge.collect_flanking_pair(read, mate) diff --git a/tests/unit/test_annotate.py b/tests/unit/test_annotate.py index 0a6bea71..339d86ed 100644 --- a/tests/unit/test_annotate.py +++ b/tests/unit/test_annotate.py @@ -1,18 +1,16 @@ import itertools import os -import unittest +import pytest +import timeout_decorator from mavis.annotate.base import ReferenceName -from 
mavis.annotate.protein import calculate_orf, Domain, DomainRegion +from mavis.annotate.protein import Domain, DomainRegion, calculate_orf from mavis.annotate.variant import IndelCall -import timeout_decorator - -from .mock import Mock, MockFunction DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') -class TestDomainAlignSeq(unittest.TestCase): +class TestDomainAlignSeq: def test_large_combinations_finishes_with_error(self): input_seq = ( 'MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDYEQSETSKPALAQPALAQPASAKPVERRKVIRKKVDPSK' @@ -273,100 +271,98 @@ def test_large_combinations_finishes_with_error(self): regions.append(DomainRegion(p, p + len(seq) - 1, seq=seq)) p += len(seq) d = Domain('name', regions=regions) - with self.assertRaises(UserWarning): + with pytest.raises(UserWarning): d.align_seq(input_seq) -class TestCalculateORF(unittest.TestCase): - def setUp(self): - # load the sequence - with open(os.path.join(DATA_DIR, 'calc_orf_test_sequence.fa'), 'r') as fh: - self.seq = fh.readlines()[0].strip() - +class TestCalculateORF: @timeout_decorator.timeout(20) def test_very_long(self): - calculate_orf(self.seq, 300) + # load the sequence + with open(os.path.join(DATA_DIR, 'calc_orf_test_sequence.fa'), 'r') as fh: + seq = fh.readlines()[0].strip() + calculate_orf(seq, 300) -class TestReferenceName(unittest.TestCase): +class TestReferenceName: def test_naked_vs_naked_str(self): - self.assertEqual('1', ReferenceName('1')) - self.assertNotEqual('2', ReferenceName('1')) - self.assertTrue(ReferenceName('1') == '1') - self.assertTrue(ReferenceName('1') != '2') + assert ReferenceName('1') == '1' + assert ReferenceName('1') != '2' + assert ReferenceName('1') == '1' + assert ReferenceName('1') != '2' def test_naked_vs_prefixed_str(self): - self.assertEqual('chr1', ReferenceName('1')) - self.assertNotEqual('chr2', ReferenceName('1')) - self.assertTrue(ReferenceName('1') == 'chr1') - self.assertTrue(ReferenceName('1') != 'chr2') + assert ReferenceName('1') == 'chr1' + assert 
ReferenceName('1') != 'chr2' + assert ReferenceName('1') == 'chr1' + assert ReferenceName('1') != 'chr2' def test_prefixed_vs_prefixed_str(self): - self.assertEqual('chr1', ReferenceName('chr1')) - self.assertNotEqual('chr2', ReferenceName('chr1')) - self.assertTrue(ReferenceName('chr1') == 'chr1') - self.assertTrue(ReferenceName('chr1') != 'chr2') + assert ReferenceName('chr1') == 'chr1' + assert ReferenceName('chr1') != 'chr2' + assert ReferenceName('chr1') == 'chr1' + assert ReferenceName('chr1') != 'chr2' def test_prefixed_vs_naked_str(self): - self.assertEqual('1', ReferenceName('chr1')) - self.assertNotEqual('2', ReferenceName('chr1')) - self.assertTrue(ReferenceName('chr1') == '1') + assert ReferenceName('chr1') == '1' + assert ReferenceName('chr1') != '2' + assert ReferenceName('chr1') == '1' def test_obj_comparison(self): r = ReferenceName('1') rprefix = ReferenceName('chr1') r2 = ReferenceName('2') r2prefix = ReferenceName('chr2') - self.assertEqual(r, rprefix) - self.assertEqual(rprefix, r) - self.assertEqual(rprefix, ReferenceName('chr1')) - self.assertEqual(r, ReferenceName('1')) - self.assertNotEqual(r2, rprefix) - self.assertNotEqual(r2prefix, rprefix) - self.assertNotEqual(r2, r) - self.assertNotEqual(r2prefix, r) - self.assertTrue(r == rprefix) - self.assertTrue(r != r2prefix) - self.assertFalse(r != rprefix) + assert rprefix == r + assert r == rprefix + assert ReferenceName('chr1') == rprefix + assert ReferenceName('1') == r + assert rprefix != r2 + assert rprefix != r2prefix + assert r != r2 + assert r != r2prefix + assert r == rprefix + assert r != r2prefix + assert not r != rprefix def test_lt(self): r = ReferenceName('1') rprefix = ReferenceName('chr1') r2 = ReferenceName('2') r2prefix = ReferenceName('chr2') - self.assertTrue(r <= rprefix) - self.assertFalse(r < rprefix) - self.assertFalse(rprefix < r) - self.assertTrue(rprefix <= r) + assert r <= rprefix + assert not r < rprefix + assert not rprefix < r + assert rprefix <= r for chr1, chr2 
in itertools.product([r, rprefix], [r2, r2prefix]): - self.assertTrue(chr1 < chr2) - self.assertTrue(chr1 <= chr2) + assert chr1 < chr2 + assert chr1 <= chr2 def test_alpha_sort(self): - self.assertTrue(ReferenceName('10') < ReferenceName('3')) - self.assertTrue(ReferenceName('10') < ReferenceName('chr3')) - self.assertTrue(ReferenceName('chr10') < ReferenceName('3')) - self.assertTrue(ReferenceName('chr10') < ReferenceName('chr3')) + assert ReferenceName('10') < ReferenceName('3') + assert ReferenceName('10') < ReferenceName('chr3') + assert ReferenceName('chr10') < ReferenceName('3') + assert ReferenceName('chr10') < ReferenceName('chr3') def test_gt(self): r = ReferenceName('1') rprefix = ReferenceName('chr1') r2 = ReferenceName('2') r2prefix = ReferenceName('chr2') - self.assertTrue(rprefix >= r) - self.assertTrue(r >= rprefix) - self.assertFalse(r > rprefix) - self.assertFalse(rprefix > r) + assert rprefix >= r + assert r >= rprefix + assert not r > rprefix + assert not rprefix > r for chr1, chr2 in itertools.product([r, rprefix], [r2, r2prefix]): - self.assertTrue(chr2 > chr1) - self.assertTrue(chr2 >= chr1) + assert chr2 > chr1 + assert chr2 >= chr1 def test_hash(self): - self.assertTrue(ReferenceName('3') in {ReferenceName('3')}) - self.assertTrue(ReferenceName('3') in {ReferenceName('chr3')}) + assert ReferenceName('3') in {ReferenceName('3')} + assert ReferenceName('3') in {ReferenceName('chr3')} -class TestIndelCall(unittest.TestCase): +class TestIndelCall: def test_duplication_in_repeat(self): ref = 'ASFHGHGSFSFSLLLLLL' 'FLLLLSFSLMVPWSFKW' mut = 'ASFHGHGSFSFSLLLLLLL' 'FLLLLSFSLMVPWSFKW' @@ -374,11 +370,11 @@ def test_duplication_in_repeat(self): call = IndelCall(ref, mut) print(call) - self.assertEqual(18, call.nterm_aligned) - self.assertEqual(len(ref) - 13 + 1, call.cterm_aligned) - self.assertTrue(call.is_dup) + assert call.nterm_aligned == 18 + assert call.cterm_aligned == len(ref) - 13 + 1 + assert call.is_dup - self.assertEqual('p.L18dupL', 
call.hgvs_protein_notation()) + assert call.hgvs_protein_notation() == 'p.L18dupL' def test_nterminal_extension(self): @@ -387,13 +383,13 @@ def test_nterminal_extension(self): call = IndelCall(ref, mut) print(call) - self.assertFalse(call.nterm_aligned) - self.assertEqual(len(call.ref_seq) - 1 + 1, call.cterm_aligned) - self.assertFalse(call.is_dup) - self.assertEqual('MAF', call.ins_seq) - self.assertEqual('', call.del_seq) + assert not call.nterm_aligned + assert call.cterm_aligned == len(call.ref_seq) - 1 + 1 + assert not call.is_dup + assert call.ins_seq == 'MAF' + assert call.del_seq == '' - self.assertEqual('p.M1ext-3', call.hgvs_protein_notation()) + assert call.hgvs_protein_notation() == 'p.M1ext-3' def test_nterminal_deletion(self): ref = 'MABCDEFGH' @@ -401,13 +397,13 @@ def test_nterminal_deletion(self): call = IndelCall(ref, mut) print(call) - self.assertFalse(call.nterm_aligned) - self.assertEqual(len(call.ref_seq) - 4 + 1, call.cterm_aligned) - self.assertFalse(call.is_dup) - self.assertEqual('', call.ins_seq) - self.assertEqual('MAB', call.del_seq) + assert not call.nterm_aligned + assert call.cterm_aligned == len(call.ref_seq) - 4 + 1 + assert not call.is_dup + assert call.ins_seq == '' + assert call.del_seq == 'MAB' - self.assertEqual('p.M1_B3delMAB', call.hgvs_protein_notation()) + assert call.hgvs_protein_notation() == 'p.M1_B3delMAB' def test_cterminal_deletion(self): ref = 'MABCDEFGH' @@ -415,13 +411,13 @@ def test_cterminal_deletion(self): call = IndelCall(ref, mut) print(call) - self.assertEqual(6, call.nterm_aligned) - self.assertFalse(call.cterm_aligned) - self.assertFalse(call.is_dup) - self.assertEqual('', call.ins_seq) - self.assertEqual('FGH', call.del_seq) + assert call.nterm_aligned == 6 + assert not call.cterm_aligned + assert not call.is_dup + assert call.ins_seq == '' + assert call.del_seq == 'FGH' - self.assertEqual('p.F7_H9delFGH', call.hgvs_protein_notation()) + assert call.hgvs_protein_notation() == 'p.F7_H9delFGH' def 
test_cterminal_extension(self): ref = 'MABCDEFGH' @@ -429,13 +425,13 @@ def test_cterminal_extension(self): call = IndelCall(ref, mut) print(call) - self.assertEqual(9, call.nterm_aligned) - self.assertFalse(call.cterm_aligned) - self.assertFalse(call.is_dup) - self.assertEqual('IJK', call.ins_seq) - self.assertEqual('', call.del_seq) + assert call.nterm_aligned == 9 + assert not call.cterm_aligned + assert not call.is_dup + assert call.ins_seq == 'IJK' + assert call.del_seq == '' - self.assertEqual('p.H9ext3', call.hgvs_protein_notation()) + assert call.hgvs_protein_notation() == 'p.H9ext3' def test_cterminal_stop_extension(self): ref = 'MABCDEFGH*' @@ -443,13 +439,13 @@ def test_cterminal_stop_extension(self): call = IndelCall(ref, mut) print(call) - self.assertEqual(9, call.nterm_aligned) - self.assertFalse(call.cterm_aligned) - self.assertFalse(call.is_dup) - self.assertEqual('IJK', call.ins_seq) - self.assertEqual('', call.del_seq) + assert call.nterm_aligned == 9 + assert not call.cterm_aligned + assert not call.is_dup + assert call.ins_seq == 'IJK' + assert call.del_seq == '' - self.assertEqual('p.*10ext*3', call.hgvs_protein_notation()) + assert call.hgvs_protein_notation() == 'p.*10ext*3' def test_cterminal_no_orf_ext(self): ref = 'MABCDEFGH' @@ -457,13 +453,13 @@ def test_cterminal_no_orf_ext(self): call = IndelCall(ref, mut) print(call) - self.assertEqual(9, call.nterm_aligned) - self.assertFalse(call.cterm_aligned) - self.assertFalse(call.is_dup) - self.assertEqual('IJK*', call.ins_seq) - self.assertEqual('', call.del_seq) + assert call.nterm_aligned == 9 + assert not call.cterm_aligned + assert not call.is_dup + assert call.ins_seq == 'IJK*' + assert call.del_seq == '' - self.assertEqual('p.H9ext*4', call.hgvs_protein_notation()) + assert call.hgvs_protein_notation() == 'p.H9ext*4' def test_single_aa_insertion(self): ref = 'MABCDEFGH' @@ -471,13 +467,13 @@ def test_single_aa_insertion(self): call = IndelCall(ref, mut) print(call) - self.assertEqual(4, 
call.nterm_aligned) - self.assertEqual(len(call.ref_seq) - 5 + 1, call.cterm_aligned) - self.assertFalse(call.is_dup) - self.assertEqual('K', call.ins_seq) - self.assertEqual('', call.del_seq) + assert call.nterm_aligned == 4 + assert call.cterm_aligned == len(call.ref_seq) - 5 + 1 + assert not call.is_dup + assert call.ins_seq == 'K' + assert call.del_seq == '' - self.assertEqual('p.C4_D5insK', call.hgvs_protein_notation()) + assert call.hgvs_protein_notation() == 'p.C4_D5insK' def test_insertion(self): ref = 'MABCDEFGH' @@ -485,13 +481,13 @@ def test_insertion(self): call = IndelCall(ref, mut) print(call) - self.assertEqual(4, call.nterm_aligned) - self.assertEqual(len(call.ref_seq) - 5 + 1, call.cterm_aligned) - self.assertFalse(call.is_dup) - self.assertEqual('KA', call.ins_seq) - self.assertEqual('', call.del_seq) + assert call.nterm_aligned == 4 + assert call.cterm_aligned == len(call.ref_seq) - 5 + 1 + assert not call.is_dup + assert call.ins_seq == 'KA' + assert call.del_seq == '' - self.assertEqual('p.C4_D5insKA', call.hgvs_protein_notation()) + assert call.hgvs_protein_notation() == 'p.C4_D5insKA' def test_single_aa_deletion(self): ref = 'MABCDEFGH' @@ -499,13 +495,13 @@ def test_single_aa_deletion(self): call = IndelCall(ref, mut) print(call) - self.assertEqual(4, call.nterm_aligned) - self.assertEqual(len(call.ref_seq) - 6 + 1, call.cterm_aligned) - self.assertFalse(call.is_dup) - self.assertEqual('', call.ins_seq) - self.assertEqual('D', call.del_seq) + assert call.nterm_aligned == 4 + assert call.cterm_aligned == len(call.ref_seq) - 6 + 1 + assert not call.is_dup + assert call.ins_seq == '' + assert call.del_seq == 'D' - self.assertEqual('p.D5delD', call.hgvs_protein_notation()) + assert call.hgvs_protein_notation() == 'p.D5delD' def test_deletion(self): ref = 'MABCDEFGH' @@ -513,13 +509,13 @@ def test_deletion(self): call = IndelCall(ref, mut) print(call) - self.assertEqual(4, call.nterm_aligned) - self.assertEqual(len(call.ref_seq) - 7 + 1, 
call.cterm_aligned) - self.assertFalse(call.is_dup) - self.assertEqual('', call.ins_seq) - self.assertEqual('DE', call.del_seq) + assert call.nterm_aligned == 4 + assert call.cterm_aligned == len(call.ref_seq) - 7 + 1 + assert not call.is_dup + assert call.ins_seq == '' + assert call.del_seq == 'DE' - self.assertEqual('p.D5_E6delDE', call.hgvs_protein_notation()) + assert call.hgvs_protein_notation() == 'p.D5_E6delDE' def test_deletion_in_repeat(self): ref = 'MABCDEEEEEEFGH' @@ -527,13 +523,13 @@ def test_deletion_in_repeat(self): call = IndelCall(ref, mut) print(call) - self.assertEqual(9, call.nterm_aligned) - self.assertEqual(len(call.ref_seq) - 8 + 1, call.cterm_aligned) - self.assertFalse(call.is_dup) - self.assertEqual('', call.ins_seq) - self.assertEqual('EE', call.del_seq) + assert call.nterm_aligned == 9 + assert call.cterm_aligned == len(call.ref_seq) - 8 + 1 + assert not call.is_dup + assert call.ins_seq == '' + assert call.del_seq == 'EE' - self.assertEqual('p.E10_E11delEE', call.hgvs_protein_notation()) + assert call.hgvs_protein_notation() == 'p.E10_E11delEE' def test_insertion_in_repeat(self): ref = 'MABCDEEEEFGH' @@ -541,10 +537,10 @@ def test_insertion_in_repeat(self): call = IndelCall(ref, mut) print(call) - self.assertEqual(9, call.nterm_aligned) - self.assertEqual(len(call.ref_seq) - 6 + 1, call.cterm_aligned) - self.assertTrue(call.is_dup) - self.assertEqual('EE', call.ins_seq) - self.assertEqual('', call.del_seq) + assert call.nterm_aligned == 9 + assert call.cterm_aligned == len(call.ref_seq) - 6 + 1 + assert call.is_dup + assert call.ins_seq == 'EE' + assert call.del_seq == '' - self.assertEqual('p.E8_E9dupEE', call.hgvs_protein_notation()) + assert call.hgvs_protein_notation() == 'p.E8_E9dupEE' diff --git a/tests/unit/test_assemble.py b/tests/unit/test_assemble.py index 3aa18592..fbd5d0cb 100644 --- a/tests/unit/test_assemble.py +++ b/tests/unit/test_assemble.py @@ -1,81 +1,82 @@ import itertools -import random import os -import unittest 
-import pytest +import random -from mavis.assemble import assemble, Contig, DeBruijnGraph, filter_contigs, kmers +import pytest +from mavis.assemble import Contig, DeBruijnGraph, assemble, filter_contigs, kmers from mavis.constants import DNA_ALPHABET +from ..util import long_running_test + DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') -class TestModule(unittest.TestCase): +class TestModule: """ test class for functions in the validate namespace that are not associated with a class """ def test_alphabet_matching(self): - self.assertTrue(DNA_ALPHABET.match('N', 'A')) - self.assertTrue(DNA_ALPHABET.match('A', 'N')) + assert DNA_ALPHABET.match('N', 'A') + assert DNA_ALPHABET.match('A', 'N') def test_kmers(self): k = kmers('ABCDEFG', 2) - self.assertEqual(['AB', 'BC', 'CD', 'DE', 'EF', 'FG'], k) + assert k == ['AB', 'BC', 'CD', 'DE', 'EF', 'FG'] k = kmers('ABCDEFG', 3) - self.assertEqual(['ABC', 'BCD', 'CDE', 'DEF', 'EFG'], k) + assert k == ['ABC', 'BCD', 'CDE', 'DEF', 'EFG'] def test_assemble(self): sequences = ['ABCD', 'BCDE', 'CDEF', 'ABCDE', 'DEFG'] c = assemble(sequences, 3, min_edge_trim_weight=1, remap_min_exact_match=1) - self.assertEqual(1, len(c)) - self.assertEqual('ABCDEFG', c[0].seq) - self.assertEqual(5, c[0].remap_score()) + assert len(c) == 1 + assert c[0].seq == 'ABCDEFG' + assert c[0].remap_score() == 5 def test_assemble_empty_list(self): - self.assertEqual([], assemble([], 1)) + assert assemble([], 1) == [] def test_repeat_region_assembly(self): rep = 'ABCDEF' seqs = kmers(rep + rep, len(rep)) contigs = assemble(seqs, len(rep) - 1, remap_min_exact_match=1) - self.assertEqual(0, len(contigs)) + assert len(contigs) == 0 -class TestFilterContigs(unittest.TestCase): +class TestFilterContigs: def test_drop_reverse_complement(self): c1 = Contig('atcgatcgatcgatcgatcgatcgatatagggcatcagc', 1) c2 = Contig('gctgatgccctatatcgatcgatcgatcgatcgatcgat', 1) result = filter_contigs([c2, c1], 0.10) - self.assertEqual(1, len(result)) - 
self.assertEqual(c1.seq, result[0].seq) + assert len(result) == 1 + assert result[0].seq == c1.seq def test_drop_alt_allele_alphabetically(self): c1 = Contig('atcgatcgatcgatcgatcgatcgatatagggcatcagc', 1) c2 = Contig('atcgatcgatcgatcgatctatcgatatagggcatcagc', 1) result = filter_contigs([c2, c1], 0.10) - self.assertEqual(1, len(result)) - self.assertEqual(c1.seq, result[0].seq) + assert len(result) == 1 + assert result[0].seq == c1.seq def test_drop_alt_allele_by_score(self): c1 = Contig('atcgatcgatcgatcgatcgatcgatatagggcatcagc', 2) c2 = Contig('atcgatcgatcgatcgatctatcgatatagggcatcagc', 1) result = filter_contigs([c2, c1], 0.10) - self.assertEqual(1, len(result)) - self.assertEqual(c1.seq, result[0].seq) + assert len(result) == 1 + assert result[0].seq == c1.seq def test_retain_disimilar(self): c1 = Contig('atcgatcgatcgatcgatcgatcgatatagggcatcagc', 2) c2 = Contig('atcgadatcgatcgatcgatctgtdstcgatatagggca', 1) result = filter_contigs([c2, c1], 0.10) - self.assertEqual(2, len(result)) + assert len(result) == 2 def test_retain_disimilar_different_lengths(self): c1 = Contig('atcgatcgatcgatcgatcgatcgatatagggcatcagc', 2) c2 = Contig('atcgatcgatcgatcgatcgatcccgtgatatagggcatcagc', 1) result = filter_contigs([c2, c1], 0.10) - self.assertEqual(2, len(result)) + assert len(result) == 2 def test_drop_similar_different_lengths(self): c1 = Contig( @@ -87,12 +88,12 @@ def test_drop_similar_different_lengths(self): 1, ) result = filter_contigs([c2, c1], 0.10) - self.assertEqual(1, len(result)) - self.assertEqual(c1.seq, result[0].seq) + assert len(result) == 1 + assert result[0].seq == c1.seq -class TestDeBruijnGraph(unittest.TestCase): - @pytest.mark.skipif(os.environ.get('RUN_FULL', '0') != '1', reason='running short tests only') +class TestDeBruijnGraph: + @long_running_test def test_trim_tails_by_freq_forks(self): g = DeBruijnGraph() for s, t in itertools.combinations([1, 2, 3, 4, 5, 6], 2): @@ -104,7 +105,7 @@ def test_trim_tails_by_freq_forks(self): g.add_edge(8, 7) 
g.add_edge(9, 8) g.trim_tails_by_freq(2) - self.assertEqual([1, 2, 3, 4, 5, 6], sorted(g.nodes())) + assert sorted(g.nodes()) == [1, 2, 3, 4, 5, 6] g = DeBruijnGraph() for s, t in itertools.combinations([1, 2, 3, 4, 5, 6], 2): @@ -116,7 +117,7 @@ def test_trim_tails_by_freq_forks(self): g.add_edge(8, 7) g.add_edge(9, 8) g.trim_tails_by_freq(2) - self.assertEqual([1, 2, 3, 4, 5, 6, 7, 8], sorted(g.nodes())) + assert sorted(g.nodes()) == [1, 2, 3, 4, 5, 6, 7, 8] g = DeBruijnGraph() for s, t in itertools.combinations([1, 2, 3, 4, 5, 6], 2): @@ -127,16 +128,16 @@ def test_trim_tails_by_freq_forks(self): g.add_edge(7, 8) g.add_edge(9, 8) g.trim_tails_by_freq(2) - self.assertEqual([1, 2, 3, 4, 5, 6], sorted(g.nodes())) + assert sorted(g.nodes()) == [1, 2, 3, 4, 5, 6] def test_add_edge(self): g = DeBruijnGraph() g.add_edge(1, 2) - self.assertEqual(1, g.get_edge_freq(1, 2)) + assert g.get_edge_freq(1, 2) == 1 g.add_edge(1, 2) - self.assertEqual(2, g.get_edge_freq(1, 2)) + assert g.get_edge_freq(1, 2) == 2 g.add_edge(1, 2, 5) - self.assertEqual(7, g.get_edge_freq(1, 2)) + assert g.get_edge_freq(1, 2) == 7 def test_trim_noncutting_paths_by_freq_degree_stop(self): g = DeBruijnGraph() @@ -150,7 +151,7 @@ def test_trim_noncutting_paths_by_freq_degree_stop(self): for edge in g.edges(): print(edge) g.trim_noncutting_paths_by_freq(3) - self.assertEqual(list(range(1, 9)) + path1[1:-1], g.nodes()) + assert g.nodes() == list(range(1, 9)) + path1[1:-1] # add an equal weight path to force namesorting path2 = [5, 13, 14, 15, 16, 1] @@ -158,14 +159,14 @@ def test_trim_noncutting_paths_by_freq_degree_stop(self): g.add_edge(s, t) g.trim_noncutting_paths_by_freq(3) - self.assertEqual(list(range(1, 9)) + path2[1:-1], g.nodes()) + assert g.nodes() == list(range(1, 9)) + path2[1:-1] # add back the original path with a higher (but still low) weight for s, t in zip(path1, path1[1:]): g.add_edge(s, t, freq=2) g.trim_noncutting_paths_by_freq(3) - self.assertEqual(list(range(1, 9)) + path1[1:-1], 
g.nodes()) + assert g.nodes() == list(range(1, 9)) + path1[1:-1] # add the second path with 1 high weight edge path2 = [5, 13, 14, 15, 16, 1] @@ -174,28 +175,31 @@ def test_trim_noncutting_paths_by_freq_degree_stop(self): g.add_edge(14, 15, freq=6) g.trim_noncutting_paths_by_freq(3) - self.assertEqual(list(range(1, 9)) + path2[1:-1], g.nodes()) + assert g.nodes() == list(range(1, 9)) + path2[1:-1] + +@pytest.fixture +def assembly_sequences(): + # load the sequences + with open(os.path.join(DATA_DIR, 'test_assembly_sequences.txt')) as fh: + seq = [i.strip() for i in fh.readlines()] + return seq -class TestFullAssemly(unittest.TestCase): - def setUp(self): - # load the sequences - with open(os.path.join(DATA_DIR, 'test_assembly_sequences.txt')) as fh: - self.seq = [i.strip() for i in fh.readlines()] - @pytest.mark.skipif(os.environ.get('RUN_FULL', '0') != '1', reason='running short tests only') - def test_deterministic_assembly(self): +class TestFullAssemly: + @long_running_test + def test_deterministic_assembly(self, assembly_sequences): contig_sequences = set() for i in range(20): - random.shuffle(self.seq) + random.shuffle(assembly_sequences) contigs = assemble( - self.seq, + assembly_sequences, 111, min_edge_trim_weight=3, assembly_max_paths=8, assembly_min_uniq=0.1, min_complexity=0.1, ) - self.assertEqual(1, len(contigs)) + assert len(contigs) == 1 contig_sequences.add(contigs[0].seq) - self.assertEqual(1, len(contig_sequences)) + assert len(contig_sequences) == 1 diff --git a/tests/unit/test_bam.py b/tests/unit/test_bam.py index c4b0005e..bbcb0ab7 100644 --- a/tests/unit/test_bam.py +++ b/tests/unit/test_bam.py @@ -1,5 +1,4 @@ -import unittest - +import pytest from mavis.bam import cigar as _cigar from mavis.bam import read as _read from mavis.constants import CIGAR, ORIENT @@ -7,7 +6,7 @@ from .mock import Mock, MockFunction -class TestPileUp(unittest.TestCase): +class TestPileUp: def mock_read(self, positions, **kwargs): return 
Mock(get_reference_positions=MockFunction(positions), **kwargs) @@ -15,7 +14,7 @@ def test_sparse_coverage(self): reads = [self.mock_read(range(0, 5)), self.mock_read(range(20, 25))] pileup = _read.pileup(reads) expected = [(r, 1) for r in range(1, 6)] + [(r, 1) for r in range(21, 26)] - self.assertEqual(expected, pileup) + assert pileup == expected def test_dense_coverage(self): reads = [ @@ -28,7 +27,7 @@ def test_dense_coverage(self): ] pileup = _read.pileup(reads) expected = list(zip(range(1, 9), [2, 4, 5, 6, 6, 4, 3, 2])) - self.assertEqual(expected, pileup) + assert pileup == expected def test_filter_reads(self): reads = [ @@ -41,14 +40,14 @@ def test_filter_reads(self): ] pileup = _read.pileup(reads, filter_func=lambda x: True if x.mapping_quality < 1 else False) expected = list(zip(range(2, 9), [1, 1, 2, 2, 2, 2, 2])) - self.assertEqual(expected, pileup) + assert pileup == expected -class TestConvertEventsToSoftclipping(unittest.TestCase): +class TestConvertEventsToSoftclipping: def test_left_large_deletion(self): read = Mock(cigar=[(CIGAR.EQ, 10), (CIGAR.D, 10), (CIGAR.EQ, 40)], query_sequence='A' * 50) converted = _read.convert_events_to_softclipping(read, ORIENT.LEFT, 5, 5) - self.assertEqual([(CIGAR.EQ, 10), (CIGAR.S, 40)], converted.cigar) + assert converted.cigar == [(CIGAR.EQ, 10), (CIGAR.S, 40)] def test_left_anchor_after_event(self): read = Mock( @@ -56,14 +55,12 @@ def test_left_anchor_after_event(self): query_sequence='A' * 50, ) converted = _read.convert_events_to_softclipping(read, ORIENT.LEFT, 5, 5) - self.assertEqual( - [(CIGAR.EQ, 4), (CIGAR.D, 10), (CIGAR.EQ, 40), (CIGAR.S, 6)], converted.cigar - ) + assert converted.cigar == [(CIGAR.EQ, 4), (CIGAR.D, 10), (CIGAR.EQ, 40), (CIGAR.S, 6)] def test_left_all_mismatch_error(self): read = Mock(cigar=[(CIGAR.X, 10), (CIGAR.D, 10), (CIGAR.X, 40)], query_sequence='A' * 50) converted = _read.convert_events_to_softclipping(read, ORIENT.LEFT, 5, 5) - self.assertEqual(read, converted) + assert converted 
== read def test_left_combined_small_events(self): read = Mock( @@ -71,7 +68,7 @@ def test_left_combined_small_events(self): query_sequence='A' * 50, ) converted = _read.convert_events_to_softclipping(read, ORIENT.LEFT, 10, 10) - self.assertEqual([(CIGAR.EQ, 10), (CIGAR.S, 40)], converted.cigar) + assert converted.cigar == [(CIGAR.EQ, 10), (CIGAR.S, 40)] def test_right_large_deletion(self): read = Mock( @@ -80,8 +77,8 @@ def test_right_large_deletion(self): reference_start=100, ) converted = _read.convert_events_to_softclipping(read, ORIENT.RIGHT, 5, 5) - self.assertEqual([(CIGAR.S, 10), (CIGAR.EQ, 40)], converted.cigar) - self.assertEqual(read.reference_start + 20, converted.reference_start) + assert converted.cigar == [(CIGAR.S, 10), (CIGAR.EQ, 40)] + assert converted.reference_start == read.reference_start + 20 def test_right_anchor_after_event(self): read = Mock( @@ -90,10 +87,8 @@ def test_right_anchor_after_event(self): reference_start=100, ) converted = _read.convert_events_to_softclipping(read, ORIENT.RIGHT, 5, 5) - self.assertEqual( - [(CIGAR.S, 6), (CIGAR.EQ, 40), (CIGAR.D, 10), (CIGAR.EQ, 4)], converted.cigar - ) - self.assertEqual(read.reference_start + 16, converted.reference_start) + assert converted.cigar == [(CIGAR.S, 6), (CIGAR.EQ, 40), (CIGAR.D, 10), (CIGAR.EQ, 4)] + assert converted.reference_start == read.reference_start + 16 def test_complex_alignment(self): cigar = [ @@ -112,15 +107,15 @@ def test_complex_alignment(self): ] read = Mock(cigar=cigar, query_sequence='A' * 365, reference_start=88217410) - with self.assertRaises(NotImplementedError): + with pytest.raises(NotImplementedError): _read.convert_events_to_softclipping(read, ORIENT.LEFT, 50, 50) read.cigar = [(CIGAR.EQ if x == CIGAR.M else x, y) for x, y in read.cigar] converted = _read.convert_events_to_softclipping(read, ORIENT.LEFT, 50, 50) - self.assertEqual([(CIGAR.EQ, 137), (CIGAR.S, 365 - 137)], converted.cigar) + assert converted.cigar == [(CIGAR.EQ, 137), (CIGAR.S, 365 - 137)] 
converted = _read.convert_events_to_softclipping(read, ORIENT.RIGHT, 50, 100) - self.assertEqual(read.cigar, converted.cigar) + assert converted.cigar == read.cigar def test_multiple_events(self): cigar = [ @@ -139,7 +134,7 @@ def test_multiple_events(self): read = Mock(cigar=cigar, query_sequence=('N' * qlen), reference_start=1000) converted = _read.convert_events_to_softclipping(read, ORIENT.RIGHT, 50, 50) exp = [(CIGAR.S, 59), (CIGAR.EQ, 28), (CIGAR.D, 2), (CIGAR.EQ, 27), (CIGAR.S, 77)] - self.assertEqual(exp, converted.cigar) + assert converted.cigar == exp def test_multiple_left_with_ins(self): cigar = [ @@ -180,82 +175,82 @@ def test_multiple_left_with_ins(self): qlen = sum([v for c, v in cigar if c in _cigar.QUERY_ALIGNED_STATES]) read = Mock(cigar=cigar, query_sequence=('N' * qlen), reference_start=1000) converted = _read.convert_events_to_softclipping(read, ORIENT.LEFT, 50, 50) - self.assertEqual(exp, converted.cigar) + assert converted.cigar == exp -class TestMergeIndels(unittest.TestCase): +class TestMergeIndels: def test_no_events(self): c = [(CIGAR.EQ, 1)] - self.assertEqual(c, _cigar.merge_indels(c)) + assert _cigar.merge_indels(c) == c c = [(CIGAR.EQ, 1), (CIGAR.X, 3), (CIGAR.EQ, 10)] - self.assertEqual(c, _cigar.merge_indels(c)) + assert _cigar.merge_indels(c) == c def test_del_before_ins(self): c = [(CIGAR.EQ, 1), (CIGAR.D, 1), (CIGAR.I, 2), (CIGAR.EQ, 2)] exp = [(CIGAR.EQ, 1), (CIGAR.I, 2), (CIGAR.D, 1), (CIGAR.EQ, 2)] - self.assertEqual(exp, _cigar.merge_indels(c)) + assert _cigar.merge_indels(c) == exp def test_ins_before_del(self): exp = [(CIGAR.EQ, 1), (CIGAR.I, 2), (CIGAR.D, 1), (CIGAR.EQ, 2)] - self.assertEqual(exp, _cigar.merge_indels(exp)) + assert _cigar.merge_indels(exp) == exp def test_mixed(self): c = [(CIGAR.EQ, 1), (CIGAR.I, 2), (CIGAR.D, 1), (CIGAR.I, 2), (CIGAR.D, 1), (CIGAR.EQ, 2)] exp = [(CIGAR.EQ, 1), (CIGAR.I, 4), (CIGAR.D, 2), (CIGAR.EQ, 2)] - self.assertEqual(exp, _cigar.merge_indels(c)) + assert _cigar.merge_indels(c) == exp 
-class TestMergeInternalEvents(unittest.TestCase): +class TestMergeInternalEvents: def test_mismatch_and_deletion(self): c = [(CIGAR.EQ, 10), (CIGAR.X, 2), (CIGAR.EQ, 5), (CIGAR.D, 2), (CIGAR.EQ, 10)] exp = [(CIGAR.EQ, 10), (CIGAR.I, 7), (CIGAR.D, 9), (CIGAR.EQ, 10)] - self.assertEqual(c, _cigar.merge_internal_events(c, 5)) - self.assertEqual(exp, _cigar.merge_internal_events(c, 6)) + assert _cigar.merge_internal_events(c, 5) == c + assert _cigar.merge_internal_events(c, 6) == exp def test_mismatch_and_insertion(self): c = [(CIGAR.EQ, 10), (CIGAR.X, 2), (CIGAR.EQ, 5), (CIGAR.I, 2), (CIGAR.EQ, 10)] exp = [(CIGAR.EQ, 10), (CIGAR.I, 9), (CIGAR.D, 7), (CIGAR.EQ, 10)] - self.assertEqual(c, _cigar.merge_internal_events(c, 5)) - self.assertEqual(exp, _cigar.merge_internal_events(c, 6)) + assert _cigar.merge_internal_events(c, 5) == c + assert _cigar.merge_internal_events(c, 6) == exp def test_insertions(self): c = [(CIGAR.EQ, 10), (CIGAR.I, 2), (CIGAR.EQ, 5), (CIGAR.I, 2), (CIGAR.EQ, 10)] exp = [(CIGAR.EQ, 10), (CIGAR.I, 9), (CIGAR.D, 5), (CIGAR.EQ, 10)] - self.assertEqual(c, _cigar.merge_internal_events(c, 5)) - self.assertEqual(exp, _cigar.merge_internal_events(c, 6)) + assert _cigar.merge_internal_events(c, 5) == c + assert _cigar.merge_internal_events(c, 6) == exp def test_deletions(self): c = [(CIGAR.EQ, 10), (CIGAR.D, 2), (CIGAR.EQ, 5), (CIGAR.D, 2), (CIGAR.EQ, 10)] exp = [(CIGAR.EQ, 10), (CIGAR.I, 5), (CIGAR.D, 9), (CIGAR.EQ, 10)] - self.assertEqual(c, _cigar.merge_internal_events(c, 5)) - self.assertEqual(exp, _cigar.merge_internal_events(c, 6)) + assert _cigar.merge_internal_events(c, 5) == c + assert _cigar.merge_internal_events(c, 6) == exp def test_insertion_and_deletion(self): c = [(CIGAR.EQ, 10), (CIGAR.I, 2), (CIGAR.EQ, 5), (CIGAR.D, 2), (CIGAR.EQ, 10)] exp = [(CIGAR.EQ, 10), (CIGAR.I, 7), (CIGAR.D, 7), (CIGAR.EQ, 10)] - self.assertEqual(c, _cigar.merge_internal_events(c, 5)) - self.assertEqual(exp, _cigar.merge_internal_events(c, 6)) + assert 
_cigar.merge_internal_events(c, 5) == c + assert _cigar.merge_internal_events(c, 6) == exp def test_no_internal_events(self): c = [(CIGAR.EQ, 10), (CIGAR.EQ, 10)] exp = [(CIGAR.EQ, 20)] - self.assertEqual(exp, _cigar.merge_internal_events(c, 10)) + assert _cigar.merge_internal_events(c, 10) == exp c = [(CIGAR.X, 10), (CIGAR.EQ, 10)] - self.assertEqual(c, _cigar.merge_internal_events(c, 10)) + assert _cigar.merge_internal_events(c, 10) == c def test_single_internal_event(self): c = [(CIGAR.EQ, 10), (CIGAR.X, 5), (CIGAR.EQ, 10)] - self.assertEqual(c, _cigar.merge_internal_events(c, 10)) + assert _cigar.merge_internal_events(c, 10) == c def test_long_suffix_and_prefix(self): c = [ @@ -301,55 +296,52 @@ def test_long_suffix_and_prefix(self): actual = _cigar.merge_internal_events(c, 20, 15) print(c) print(actual) - self.assertEqual(exp, actual) + assert actual == exp def test_mismatch_only(self): exp = _cigar.convert_string_to_cigar('39=1X16=1X71=22S') - self.assertEqual(exp, _cigar.merge_internal_events(exp, 20, 15)) + assert _cigar.merge_internal_events(exp, 20, 15) == exp -class TestExtendSoftclipping(unittest.TestCase): +class TestExtendSoftclipping: def test_simple(self): - self.assertEqual( - ([(CIGAR.S, 10), (CIGAR.M, 10)], 0), - _cigar.extend_softclipping([(CIGAR.S, 10), (CIGAR.M, 10)], 1), + assert _cigar.extend_softclipping([(CIGAR.S, 10), (CIGAR.M, 10)], 1) == ( + [(CIGAR.S, 10), (CIGAR.M, 10)], + 0, ) def test_deletions(self): - self.assertEqual( - ([(CIGAR.S, 10), (CIGAR.M, 10)], 1), - _cigar.extend_softclipping([(CIGAR.I, 10), (CIGAR.D, 1), (CIGAR.M, 10)], 1), + assert _cigar.extend_softclipping([(CIGAR.I, 10), (CIGAR.D, 1), (CIGAR.M, 10)], 1) == ( + [(CIGAR.S, 10), (CIGAR.M, 10)], + 1, ) def test_mismatch(self): - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): _cigar.extend_softclipping([(CIGAR.X, 10), (CIGAR.M, 20), (CIGAR.X, 10)], 30) def test_insert(self): - self.assertEqual( - ([(CIGAR.S, 17), (CIGAR.M, 10), (CIGAR.S, 
5)], 2), - _cigar.extend_softclipping( - [(CIGAR.S, 10), (CIGAR.M, 2), (CIGAR.I, 5), (CIGAR.M, 10), (CIGAR.I, 5)], 5 - ), - ) + assert _cigar.extend_softclipping( + [(CIGAR.S, 10), (CIGAR.M, 2), (CIGAR.I, 5), (CIGAR.M, 10), (CIGAR.I, 5)], 5 + ) == ([(CIGAR.S, 17), (CIGAR.M, 10), (CIGAR.S, 5)], 2) def test_hardclipping(self): c = [(CIGAR.H, 10), (CIGAR.EQ, 10)] cnew, prefix = _cigar.extend_softclipping(c, 1) - self.assertEqual(0, prefix) - self.assertEqual(c, cnew) + assert prefix == 0 + assert cnew == c def test_hardclipping_right(self): c = [(CIGAR.EQ, 30), (CIGAR.H, 120)] cnew, prefix = _cigar.extend_softclipping(c, 6) - self.assertEqual(0, prefix) - self.assertEqual(c, cnew) + assert prefix == 0 + assert cnew == c -class TestSequenceComplexity(unittest.TestCase): +class TestSequenceComplexity: def test_low_at(self): seq = 'TATATATAAATATATATTTATATATACATTATTTATATATAAATATATATTTATACATTATTTATATATAAATATATATTTATATATACATTATGTATATATAAAT' - self.assertEqual(0.04, round(_read.sequence_complexity(seq), 2)) + assert round(_read.sequence_complexity(seq), 2) == 0.04 def test_empty(self): - self.assertEqual(0, _read.sequence_complexity('')) + assert _read.sequence_complexity('') == 0 diff --git a/tests/unit/test_blat.py b/tests/unit/test_blat.py index 749b0611..488f14d7 100644 --- a/tests/unit/test_blat.py +++ b/tests/unit/test_blat.py @@ -1,12 +1,11 @@ -import unittest - +import pytest from mavis.blat import Blat from mavis.constants import CIGAR, reverse_complement from .mock import Mock, MockFunction, MockLongString -class TestConvertPslxToPysam(unittest.TestCase): +class TestConvertPslxToPysam: def test_simple(self): row = { 'match': 142, @@ -50,10 +49,10 @@ def test_simple(self): } cache = Mock(reference_id=MockFunction(16)) read = Blat.pslx_row_to_pysam(row, cache, refseq) - self.assertEqual(16, read.reference_id) - self.assertEqual('17', read.reference_name) - self.assertEqual(row['qseq_full'], reverse_complement(read.query_sequence)) - self.assertEqual([(CIGAR.S, 62), 
(CIGAR.EQ, 142)], read.cigar) + assert read.reference_id == 16 + assert read.reference_name == '17' + assert reverse_complement(read.query_sequence) == row['qseq_full'] + assert read.cigar == [(CIGAR.S, 62), (CIGAR.EQ, 142)] def test_overlapping_blat_blocks_error(self): row = { @@ -72,5 +71,5 @@ def test_overlapping_blat_blocks_error(self): ), } cache = Mock(reference_id=MockFunction(6)) - with self.assertRaises(AssertionError): + with pytest.raises(AssertionError): Blat.pslx_row_to_pysam(row, cache, None) diff --git a/tests/unit/test_breakpoint.py b/tests/unit/test_breakpoint.py index 56bdb9da..9eeb8347 100644 --- a/tests/unit/test_breakpoint.py +++ b/tests/unit/test_breakpoint.py @@ -1,17 +1,16 @@ -import unittest from unittest.mock import Mock +import pytest from mavis.breakpoint import Breakpoint, BreakpointPair -from mavis.constants import COLUMNS, ORIENT, STRAND, SVTYPE +from mavis.constants import ORIENT, STRAND, SVTYPE from mavis.error import InvalidRearrangement, NotSpecifiedError from mavis.interval import Interval -from mavis.util import read_bpp_from_input_file -class TestBreakpoint(unittest.TestCase): +class TestBreakpoint: def test___eq__(self): - self.assertNotEqual(Breakpoint('1', 1), None) - self.assertEqual(Breakpoint('1', 1), Breakpoint('1', 1)) + assert Breakpoint('1', 1) != None # noqa: E711 + assert Breakpoint('1', 1) == Breakpoint('1', 1) def test___hash__(self): b = Breakpoint('1', 1, 2) @@ -22,44 +21,44 @@ def test___hash__(self): temp.add(b) temp.add(c) temp.add(d) - self.assertEqual(2, len(temp)) + assert len(temp) == 2 temp = dict() temp[b] = None temp[c] = None temp[d] = None - self.assertEqual(2, len(temp.keys())) + assert len(temp.keys()) == 2 def test___len__(self): - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): Breakpoint('11', 87042760, 87041922, orient=ORIENT.LEFT, strand=STRAND.NS) def test_inherited_interval_methods(self): b = Breakpoint('1', 1, 10) - self.assertEqual(1, b[0]) - 
self.assertEqual(10, b[1]) - self.assertEqual(10, len(b)) + assert b[0] == 1 + assert b[1] == 10 + assert len(b) == 10 def test_breakpoint_constructor(self): b = Breakpoint('1', 10, 50) - self.assertEqual(10, b[0]) - self.assertEqual(50, b[1]) - self.assertTrue(Interval.overlaps((1, 10), b)) - self.assertTrue(Interval.overlaps((50, 55), b)) - self.assertFalse(Interval.overlaps((1, 9), b)) + assert b[0] == 10 + assert b[1] == 50 + assert Interval.overlaps((1, 10), b) + assert Interval.overlaps((50, 55), b) + assert not Interval.overlaps((1, 9), b) -class TestBreakpointPair(unittest.TestCase): +class TestBreakpointPair: def test___eq__(self): b = BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 3), opposing_strands=True) c = BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 3), opposing_strands=True) - self.assertFalse(b is c) - self.assertEqual(b, c) + assert b is not c + assert c == b d = BreakpointPair( Breakpoint('1', 1), Breakpoint('1', 3), opposing_strands=True, untemplated_seq='' ) - self.assertNotEqual(b, d) - self.assertNotEqual(b, None) + assert d != b + assert None != b # noqa: E711 def test___hash__(self): b = BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 3), opposing_strands=True) @@ -67,31 +66,31 @@ def test___hash__(self): d = BreakpointPair( Breakpoint('1', 1), Breakpoint('1', 3), opposing_strands=True, untemplated_seq='' ) - self.assertFalse(b is c) + assert b is not c temp = dict() temp[b] = None temp[d] = None temp[c] = None - self.assertEqual(2, len(temp.keys())) + assert len(temp.keys()) == 2 temp = set() temp.add(b) temp.add(c) temp.add(d) - self.assertEqual(2, len(temp)) + assert len(temp) == 2 def test___init__swap_break_order(self): b1 = Breakpoint('1', 1) b2 = Breakpoint('1', 50) bpp = BreakpointPair(b1, b2, opposing_strands=True) - self.assertEqual(bpp.break1, b1) - self.assertEqual(bpp.break2, b2) + assert b1 == bpp.break1 + assert b2 == bpp.break2 bpp = BreakpointPair(b2, b1, opposing_strands=True) - self.assertEqual(bpp.break1, 
b1) - self.assertEqual(bpp.break2, b2) + assert b1 == bpp.break1 + assert b2 == bpp.break2 def test___init__opstrand_conflict(self): - with self.assertRaises(AssertionError): + with pytest.raises(AssertionError): BreakpointPair( Breakpoint('1', 1, strand=STRAND.POS), Breakpoint('1', 2, strand=STRAND.POS), @@ -100,16 +99,16 @@ def test___init__opstrand_conflict(self): def test___init__opstrand_indv_not_specified(self): bpp = BreakpointPair(Breakpoint('test', 1), Breakpoint('test', 10), opposing_strands=True) - self.assertTrue(bpp.opposing_strands) + assert bpp.opposing_strands bpp = BreakpointPair(Breakpoint('test', 1), Breakpoint('test', 10), opposing_strands=False) - self.assertFalse(bpp.opposing_strands) + assert not bpp.opposing_strands def test___init__opstrand_not_specified(self): - with self.assertRaises(NotSpecifiedError): + with pytest.raises(NotSpecifiedError): BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 2)) def test___init__stranded(self): - with self.assertRaises(NotSpecifiedError): + with pytest.raises(NotSpecifiedError): BreakpointPair( Breakpoint('1', 1), Breakpoint('1', 2), stranded=True, opposing_strands=True ) @@ -118,25 +117,25 @@ def test___get_item__(self): bp1 = Breakpoint(1, 1, 2, ORIENT.LEFT) bp2 = Breakpoint(2, 1, 2, ORIENT.LEFT) bpp = BreakpointPair(bp1, bp2, opposing_strands=True) - self.assertEqual(bpp[0], bp1) - self.assertEqual(bpp[1], bp2) - with self.assertRaises(IndexError): + assert bp1 == bpp[0] + assert bp2 == bpp[1] + with pytest.raises(IndexError): bpp['?'] - with self.assertRaises(IndexError): + with pytest.raises(IndexError): bpp[2] def test_interchromosomal(self): bp1 = Breakpoint(1, 1, 2, ORIENT.LEFT) bp2 = Breakpoint(2, 1, 2, ORIENT.LEFT) bpp = BreakpointPair(bp1, bp2, opposing_strands=True) - self.assertTrue(bpp.interchromosomal) + assert bpp.interchromosomal bp1 = Breakpoint(1, 1, 2, ORIENT.LEFT) bp2 = Breakpoint(1, 7, 8, ORIENT.LEFT) bpp = BreakpointPair(bp1, bp2, opposing_strands=True) - 
self.assertFalse(bpp.interchromosomal) + assert not bpp.interchromosomal def test___init__invalid_intra_rprp(self): - with self.assertRaises(InvalidRearrangement): + with pytest.raises(InvalidRearrangement): BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.RIGHT), @@ -144,7 +143,7 @@ def test___init__invalid_intra_rprp(self): ) def test___init__invalid_intra_rnrn(self): - with self.assertRaises(InvalidRearrangement): + with pytest.raises(InvalidRearrangement): BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.RIGHT), @@ -152,7 +151,7 @@ def test___init__invalid_intra_rnrn(self): ) def test___init__invalid_intra_rpln(self): - with self.assertRaises(InvalidRearrangement): + with pytest.raises(InvalidRearrangement): BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.LEFT), @@ -160,7 +159,7 @@ def test___init__invalid_intra_rpln(self): ) def test___init__invalid_intra_lprn(self): - with self.assertRaises(InvalidRearrangement): + with pytest.raises(InvalidRearrangement): BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.RIGHT), @@ -168,7 +167,7 @@ def test___init__invalid_intra_lprn(self): ) def test___init__invalid_intra_rnlp(self): - with self.assertRaises(InvalidRearrangement): + with pytest.raises(InvalidRearrangement): BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.LEFT), @@ -176,7 +175,7 @@ def test___init__invalid_intra_rnlp(self): ) def test___init__invalid_intra_lnrp(self): - with self.assertRaises(InvalidRearrangement): + with pytest.raises(InvalidRearrangement): BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.LEFT), Breakpoint(1, 10, 
11, strand=STRAND.POS, orient=ORIENT.RIGHT), @@ -184,7 +183,7 @@ def test___init__invalid_intra_lnrp(self): ) def test___init__invalid_inter_rl_opp(self): - with self.assertRaises(InvalidRearrangement): + with pytest.raises(InvalidRearrangement): BreakpointPair( Breakpoint(1, 1, 2, ORIENT.RIGHT), Breakpoint(2, 1, 2, ORIENT.LEFT), @@ -192,7 +191,7 @@ def test___init__invalid_inter_rl_opp(self): ) def test___init__invalid_inter_lr_opp(self): - with self.assertRaises(InvalidRearrangement): + with pytest.raises(InvalidRearrangement): BreakpointPair( Breakpoint(1, 1, 2, ORIENT.LEFT), Breakpoint(2, 1, 2, ORIENT.RIGHT), @@ -200,7 +199,7 @@ def test___init__invalid_inter_lr_opp(self): ) -class TestClassifyBreakpointPair(unittest.TestCase): +class TestClassifyBreakpointPair: def test_inverted_translocation(self): b = BreakpointPair( Breakpoint(1, 1, 2, ORIENT.LEFT), @@ -222,116 +221,116 @@ def test_inversion(self): Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.RIGHT), ) - self.assertEqual({SVTYPE.INV}, BreakpointPair.classify(b)) + assert BreakpointPair.classify(b) == {SVTYPE.INV} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.RIGHT), ) - self.assertEqual({SVTYPE.INV}, BreakpointPair.classify(b)) + assert BreakpointPair.classify(b) == {SVTYPE.INV} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.NS), ) - self.assertEqual({SVTYPE.INV}, BreakpointPair.classify(b)) + assert BreakpointPair.classify(b) == {SVTYPE.INV} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.NS), ) - self.assertEqual({SVTYPE.INV}, BreakpointPair.classify(b)) + assert BreakpointPair.classify(b) == {SVTYPE.INV} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, 
orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.LEFT), ) - self.assertEqual({SVTYPE.INV}, BreakpointPair.classify(b)) + assert BreakpointPair.classify(b) == {SVTYPE.INV} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.LEFT), ) - self.assertEqual({SVTYPE.INV}, BreakpointPair.classify(b)) + assert BreakpointPair.classify(b) == {SVTYPE.INV} def test_duplication(self): b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.LEFT), ) - self.assertEqual({SVTYPE.DUP}, BreakpointPair.classify(b)) + assert BreakpointPair.classify(b) == {SVTYPE.DUP} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.LEFT), ) - self.assertEqual({SVTYPE.DUP}, BreakpointPair.classify(b)) + assert BreakpointPair.classify(b) == {SVTYPE.DUP} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.LEFT), ) - self.assertEqual({SVTYPE.DUP}, BreakpointPair.classify(b)) + assert BreakpointPair.classify(b) == {SVTYPE.DUP} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.NS), ) - self.assertEqual({SVTYPE.DUP}, BreakpointPair.classify(b)) + assert BreakpointPair.classify(b) == {SVTYPE.DUP} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.NS), ) - self.assertEqual({SVTYPE.DUP}, BreakpointPair.classify(b)) + assert BreakpointPair.classify(b) == {SVTYPE.DUP} def test_deletion_or_insertion(self): b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.RIGHT), ) - self.assertEqual(sorted([SVTYPE.DEL, 
SVTYPE.INS]), sorted(BreakpointPair.classify(b))) + assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NS, orient=ORIENT.RIGHT), opposing_strands=False, ) - self.assertEqual(sorted([SVTYPE.DEL, SVTYPE.INS]), sorted(BreakpointPair.classify(b))) + assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.RIGHT), ) - self.assertEqual(sorted([SVTYPE.DEL, SVTYPE.INS]), sorted(BreakpointPair.classify(b))) + assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NS, orient=ORIENT.RIGHT), opposing_strands=False, ) - self.assertEqual(sorted([SVTYPE.DEL, SVTYPE.INS]), sorted(BreakpointPair.classify(b))) + assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.RIGHT), opposing_strands=False, ) - self.assertEqual(sorted([SVTYPE.DEL, SVTYPE.INS]), sorted(BreakpointPair.classify(b))) + assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.RIGHT), opposing_strands=False, ) - self.assertEqual(sorted([SVTYPE.DEL, SVTYPE.INS]), sorted(BreakpointPair.classify(b))) + assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NS, orient=ORIENT.RIGHT), opposing_strands=False, ) - self.assertEqual(sorted([SVTYPE.DEL, 
SVTYPE.INS]), sorted(BreakpointPair.classify(b))) + assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) def test_insertion(self): b = BreakpointPair( @@ -339,7 +338,7 @@ def test_insertion(self): Breakpoint(1, 2, 2, strand=STRAND.NS, orient=ORIENT.RIGHT), opposing_strands=False, ) - self.assertEqual(sorted([SVTYPE.INS]), sorted(BreakpointPair.classify(b))) + assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.INS]) def test_no_type(self): b = BreakpointPair( @@ -348,7 +347,7 @@ def test_no_type(self): opposing_strands=False, untemplated_seq='', ) - self.assertEqual(set(), BreakpointPair.classify(b)) + assert BreakpointPair.classify(b) == set() def test_deletion(self): b = BreakpointPair( @@ -357,7 +356,7 @@ def test_deletion(self): opposing_strands=False, untemplated_seq='', ) - self.assertEqual(sorted([SVTYPE.DEL]), sorted(BreakpointPair.classify(b))) + assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL]) def test_deletion_with_useq(self): bpp = BreakpointPair( @@ -366,30 +365,30 @@ def test_deletion_with_useq(self): opposing=False, untemplated_seq='CCCT', ) - self.assertEqual(sorted([SVTYPE.DEL, SVTYPE.INS]), sorted(BreakpointPair.classify(bpp))) + assert sorted(BreakpointPair.classify(bpp)) == sorted([SVTYPE.DEL, SVTYPE.INS]) def distance(x, y): return Interval(abs(x - y)) net_size = BreakpointPair.net_size(bpp, distance) - self.assertEqual(Interval(-71), net_size) - self.assertEqual(sorted([SVTYPE.DEL]), sorted(BreakpointPair.classify(bpp, distance))) + assert net_size == Interval(-71) + assert sorted(BreakpointPair.classify(bpp, distance)) == sorted([SVTYPE.DEL]) def test_deletion_no_distance_error(self): bpp = BreakpointPair( Breakpoint('1', 7039, orient='L'), Breakpoint('1', 7040, orient='R'), opposing=False ) - self.assertEqual(sorted([SVTYPE.INS]), sorted(BreakpointPair.classify(bpp))) + assert sorted(BreakpointPair.classify(bpp)) == sorted([SVTYPE.INS]) -class TestNetSize(unittest.TestCase): +class 
TestNetSize: def test_indel(self): bpp = BreakpointPair( Breakpoint('1', 13, orient=ORIENT.RIGHT), Breakpoint('1', 10, orient=ORIENT.LEFT), untemplated_seq='TTT', ) - self.assertEqual(Interval(1), bpp.net_size()) + assert bpp.net_size() == Interval(1) def test_large_indel(self): bpp = BreakpointPair( @@ -397,7 +396,7 @@ def test_large_indel(self): Breakpoint('1', 101, orient=ORIENT.RIGHT), untemplated_seq='TTT', ) - self.assertEqual(Interval(-87), bpp.net_size()) + assert bpp.net_size() == Interval(-87) def test_insertion(self): bpp = BreakpointPair( @@ -405,14 +404,14 @@ def test_insertion(self): Breakpoint('1', 10, orient=ORIENT.LEFT), untemplated_seq='T', ) - self.assertEqual(Interval(1), bpp.net_size()) + assert bpp.net_size() == Interval(1) bpp = BreakpointPair( Breakpoint('1', 11, orient=ORIENT.RIGHT), Breakpoint('1', 10, orient=ORIENT.LEFT), untemplated_seq='TT', ) - self.assertEqual(Interval(2), bpp.net_size()) + assert bpp.net_size() == Interval(2) def test_duplication_with_insertion(self): bpp = BreakpointPair( @@ -420,7 +419,7 @@ def test_duplication_with_insertion(self): Breakpoint('1', 15, orient=ORIENT.LEFT), untemplated_seq='TTT', ) - self.assertEqual(Interval(9), bpp.net_size()) + assert bpp.net_size() == Interval(9) def test_deletion(self): bpp = BreakpointPair( @@ -428,7 +427,7 @@ def test_deletion(self): Breakpoint('1', 15, orient=ORIENT.RIGHT), untemplated_seq='', ) - self.assertEqual(Interval(-4), bpp.net_size()) + assert bpp.net_size() == Interval(-4) def test_inversion(self): bpp = BreakpointPair( @@ -436,7 +435,7 @@ def test_inversion(self): Breakpoint('1', 15, orient=ORIENT.LEFT), untemplated_seq='', ) - self.assertEqual(Interval(0), bpp.net_size()) + assert bpp.net_size() == Interval(0) def test_inversion_insertion(self): bpp = BreakpointPair( @@ -444,10 +443,10 @@ def test_inversion_insertion(self): Breakpoint('1', 15, orient=ORIENT.LEFT), untemplated_seq='TT', ) - self.assertEqual(Interval(2), bpp.net_size()) + assert bpp.net_size() == 
Interval(2) -class TestUntemplatedShift(unittest.TestCase): +class TestUntemplatedShift: def test_indel(self): ref = { '1': Mock( @@ -461,4 +460,4 @@ def test_indel(self): ) result = bpp.untemplated_shift(ref) print(result) - self.assertEqual((0, 1), result) + assert result == (0, 1) diff --git a/tests/unit/test_call_indels.py b/tests/unit/test_call_indels.py index f897113d..840947df 100644 --- a/tests/unit/test_call_indels.py +++ b/tests/unit/test_call_indels.py @@ -1,134 +1,133 @@ -import unittest - +import pytest from mavis.annotate.variant import IndelCall, call_protein_indel from .mock import Mock, MockFunction -class TestIndelCall(unittest.TestCase): +class TestIndelCall: def test_deletion(self): refseq = 'asdfghjkl' mutseq = 'asdfkl' indel = IndelCall(refseq, mutseq) - self.assertEqual(4, indel.nterm_aligned) - self.assertEqual(len(indel.ref_seq) - 8 + 1, indel.cterm_aligned) - self.assertEqual('ghj', indel.del_seq) - self.assertEqual('', indel.ins_seq) - self.assertFalse(indel.is_dup) + assert indel.nterm_aligned == 4 + assert indel.cterm_aligned == len(indel.ref_seq) - 8 + 1 + assert indel.del_seq == 'ghj' + assert indel.ins_seq == '' + assert not indel.is_dup def test_insertion(self): refseq = 'asdfghjkl' mutseq = 'asdfmmmghjkl' indel = IndelCall(refseq, mutseq) - self.assertEqual(4, indel.nterm_aligned) - self.assertEqual(len(indel.ref_seq) - 5 + 1, indel.cterm_aligned) - self.assertEqual('', indel.del_seq) - self.assertEqual('mmm', indel.ins_seq) - self.assertFalse(indel.is_dup) + assert indel.nterm_aligned == 4 + assert indel.cterm_aligned == len(indel.ref_seq) - 5 + 1 + assert indel.del_seq == '' + assert indel.ins_seq == 'mmm' + assert not indel.is_dup def test_dup(self): refseq = 'asdfghjkl' mutseq = 'asdfsdfghjkl' indel = IndelCall(refseq, mutseq) print(indel) - self.assertEqual(4, indel.nterm_aligned) - self.assertEqual(len(indel.ref_seq) - 2 + 1, indel.cterm_aligned) - self.assertEqual('', indel.del_seq) - self.assertEqual('sdf', indel.ins_seq) - 
self.assertTrue(indel.is_dup) + assert indel.nterm_aligned == 4 + assert indel.cterm_aligned == len(indel.ref_seq) - 2 + 1 + assert indel.del_seq == '' + assert indel.ins_seq == 'sdf' + assert indel.is_dup def test_delins(self): refseq = 'asdfghjkl' mutseq = 'asdfmmmkl' indel = IndelCall(refseq, mutseq) - self.assertEqual(4, indel.nterm_aligned) - self.assertEqual(len(indel.ref_seq) - 8 + 1, indel.cterm_aligned) - self.assertEqual('ghj', indel.del_seq) - self.assertEqual('mmm', indel.ins_seq) - self.assertFalse(indel.is_dup) + assert indel.nterm_aligned == 4 + assert indel.cterm_aligned == len(indel.ref_seq) - 8 + 1 + assert indel.del_seq == 'ghj' + assert indel.ins_seq == 'mmm' + assert not indel.is_dup def test_delete_start(self): refseq = 'asdfghjkl' mutseq = 'fghjkl' indel = IndelCall(refseq, mutseq) - self.assertEqual(0, indel.nterm_aligned) - self.assertEqual(6, indel.cterm_aligned) - self.assertEqual('asd', indel.del_seq) - self.assertEqual('', indel.ins_seq) - self.assertFalse(indel.is_dup) + assert indel.nterm_aligned == 0 + assert indel.cterm_aligned == 6 + assert indel.del_seq == 'asd' + assert indel.ins_seq == '' + assert not indel.is_dup def test_delete_start_repetition(self): refseq = 'asdafghjkl' mutseq = 'afghjkl' indel = IndelCall(refseq, mutseq) - self.assertEqual(0, indel.nterm_aligned) - self.assertEqual(7, indel.cterm_aligned) - self.assertEqual('asd', indel.del_seq) - self.assertEqual('', indel.ins_seq) - self.assertFalse(indel.is_dup) + assert indel.nterm_aligned == 0 + assert indel.cterm_aligned == 7 + assert indel.del_seq == 'asd' + assert indel.ins_seq == '' + assert not indel.is_dup def test_delete_end(self): refseq = 'asdfghjkl' mutseq = 'asdfgh' indel = IndelCall(refseq, mutseq) - self.assertEqual(6, indel.nterm_aligned) - self.assertEqual(0, indel.cterm_aligned) - self.assertEqual('jkl', indel.del_seq) - self.assertEqual('', indel.ins_seq) - self.assertFalse(indel.is_dup) + assert indel.nterm_aligned == 6 + assert indel.cterm_aligned 
== 0 + assert indel.del_seq == 'jkl' + assert indel.ins_seq == '' + assert not indel.is_dup def test_ins_start(self): refseq = 'asdfghjkl' mutseq = 'mmasdfghjkl' indel = IndelCall(refseq, mutseq) - self.assertEqual(0, indel.nterm_aligned) - self.assertEqual(9, indel.cterm_aligned) - self.assertEqual('', indel.del_seq) - self.assertEqual('mm', indel.ins_seq) - self.assertFalse(indel.is_dup) + assert indel.nterm_aligned == 0 + assert indel.cterm_aligned == 9 + assert indel.del_seq == '' + assert indel.ins_seq == 'mm' + assert not indel.is_dup def test_ins_end(self): refseq = 'asdfghjkl' mutseq = 'asdfghjklmmm' indel = IndelCall(refseq, mutseq) - self.assertEqual(9, indel.nterm_aligned) - self.assertEqual(0, indel.cterm_aligned) - self.assertEqual('', indel.del_seq) - self.assertEqual('mmm', indel.ins_seq) - self.assertFalse(indel.is_dup) + assert indel.nterm_aligned == 9 + assert indel.cterm_aligned == 0 + assert indel.del_seq == '' + assert indel.ins_seq == 'mmm' + assert not indel.is_dup def test_delins_start(self): refseq = 'asdfghjkl' mutseq = 'mmfghjkl' indel = IndelCall(refseq, mutseq) - self.assertEqual(0, indel.nterm_aligned) - self.assertEqual(6, indel.cterm_aligned) - self.assertEqual('asd', indel.del_seq) - self.assertEqual('mm', indel.ins_seq) - self.assertFalse(indel.is_dup) + assert indel.nterm_aligned == 0 + assert indel.cterm_aligned == 6 + assert indel.del_seq == 'asd' + assert indel.ins_seq == 'mm' + assert not indel.is_dup def test_delins_end(self): refseq = 'asdfghjkl' mutseq = 'asdfghjmmm' indel = IndelCall(refseq, mutseq) - self.assertEqual(7, indel.nterm_aligned) - self.assertEqual(0, indel.cterm_aligned) - self.assertEqual('kl', indel.del_seq) - self.assertEqual('mmm', indel.ins_seq) - self.assertFalse(indel.is_dup) + assert indel.nterm_aligned == 7 + assert indel.cterm_aligned == 0 + assert indel.del_seq == 'kl' + assert indel.ins_seq == 'mmm' + assert not indel.is_dup -class TestHgvsProteinNotation(unittest.TestCase): +class 
TestHgvsProteinNotation: def test_homopolymer(self): indel = IndelCall('ASDFGHJKKLQWERTYUIOP', 'ASDFGHJKKKKLQWERTYUIOP').hgvs_protein_notation() - self.assertEqual('p.K8_K9dupKK', indel) + assert indel == 'p.K8_K9dupKK' def test_dup(self): indel = IndelCall('ASDFGHJKL', 'ASDFSDFGHJKL').hgvs_protein_notation() - self.assertEqual('p.S2_F4dupSDF', indel) + assert indel == 'p.S2_F4dupSDF' -class TestCallProteinIndel(unittest.TestCase): +class TestCallProteinIndel: def test_large_start_deletion(self): ref_translation = Mock( get_aa_seq=MockFunction( @@ -159,44 +158,43 @@ def test_large_start_deletion(self): ) ) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual( + assert notation == ( 'ref:p.M1_Y227del' 'MGLKAAQKTLFPLRSIDDVVRLFAAELGREEPDLVLLSLVLGFVEHFLAVNRVIPTNVPE' 'LTFQPSPAPDPPGGLTYFPVADLSIIAALYARFTAQIRGAVDLSLYPREGGVSSRELVKK' 'VSDVIWNSLSRSYFKDRAHIQSLFSFITGTKLDSSGVAFAVVGACQALGLRDVHLALSED' - 'HAWVVFGPNGEQTAEVTWHGKGNEDRRGQTVNAGVAERSWLYLKGSY', - notation, + 'HAWVVFGPNGEQTAEVTWHGKGNEDRRGQTVNAGVAERSWLYLKGSY' ) def test_deletion_rep_at_breaks(self): ref_translation = Mock(get_aa_seq=MockFunction('ABCDEFKJFEDAGFLKJ'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('ABCDE' 'AGFLKJ')) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual('ref:p.F6_D11delFKJFED', notation) + assert notation == 'ref:p.F6_D11delFKJFED' def test_insertion(self): ref_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLQWERTYUIOP'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKIIILQWERTYUIOP')) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual('ref:p.K8_L9insIII', notation) + assert notation == 'ref:p.K8_L9insIII' def test_deletion(self): ref_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLQWERTYUIOP'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('ASDFGHJQWERTYUIOP')) notation = call_protein_indel(ref_translation, mut_translation) - 
self.assertEqual('ref:p.K8_L9delKL', notation) + assert notation == 'ref:p.K8_L9delKL' def test_synonymous(self): ref_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLQWERTYUIOP'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLQWERTYUIOP')) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual(None, notation) + assert notation is None def test_delins(self): ref_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLQWERTYUIOP'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('ASDFGHJIIIQWERTYUIOP')) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual('ref:p.K8_L9delKLinsIII', notation) + assert notation == 'ref:p.K8_L9delKLinsIII' def test_transcript_name(self): ref_translation = Mock( @@ -206,77 +204,77 @@ def test_transcript_name(self): ) mut_translation = Mock(get_aa_seq=MockFunction('ASDFGHJIIIQWERTYUIOP')) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual('reft:p.K8_L9delKLinsIII', notation) + assert notation == 'reft:p.K8_L9delKLinsIII' def test_delete_start(self): ref_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLQWERTYUIOP'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('FGHJKLQWERTYUIOP')) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual('ref:p.A1_D3delASD', notation) + assert notation == 'ref:p.A1_D3delASD' def test_delete_single_aa_start(self): ref_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLQWERTYUIOP'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('SDFGHJKLQWERTYUIOP')) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual('ref:p.A1delA', notation) + assert notation == 'ref:p.A1delA' def test_delete_end(self): ref_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLQWERTYUIOP'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLQWERTYU')) notation = call_protein_indel(ref_translation, 
mut_translation) - self.assertEqual('ref:p.I17_P19delIOP', notation) + assert notation == 'ref:p.I17_P19delIOP' def test_delete_single_aa_end(self): ref_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLQWERTYUIOP'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLQWERTYUIO')) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual('ref:p.P19delP', notation) + assert notation == 'ref:p.P19delP' def test_ins_start(self): ref_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLQWERTYUIOP'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('IIASDFGHJKLQWERTYUIOP')) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual('ref:p.A1ext-2', notation) + assert notation == 'ref:p.A1ext-2' def test_ins_end(self): ref_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLQWERTYUIOP'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLQWERTYUIOPII')) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual('ref:p.P19ext2', notation) + assert notation == 'ref:p.P19ext2' def test_no_reference_obj(self): ref_translation = Mock( get_aa_seq=MockFunction('ASDFGHJKLQWERTYUIOP'), name=None, reference_object='thing' ) mut_translation = Mock(get_aa_seq=MockFunction('ASDFGHJIIIQWERTYUIOP')) - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): call_protein_indel(ref_translation, mut_translation) def test_fs(self): ref_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKL'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('ASDFGHJMMM')) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual('ref:p.K8Mfs', notation) + assert notation == 'ref:p.K8Mfs' ref_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKL'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('ASDFGHJCMMEF')) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual('ref:p.K8Cfs', notation) + 
assert notation == 'ref:p.K8Cfs' def test_fs_with_stops(self): ref_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLT*'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('ASDFGHJMMMHGFTTSBF*TUHG*')) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual('ref:p.K8Mfs*12', notation) + assert notation == 'ref:p.K8Mfs*12' def test_fs_immeadiate_stop(self): ref_translation = Mock(get_aa_seq=MockFunction('ASDFGHJKLT*'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('ASDFGHJMMMHGFTTSBF*')) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual('ref:p.K8Mfs*12', notation) + assert notation == 'ref:p.K8Mfs*12' def test_delete_start_with_rep(self): ref_translation = Mock(get_aa_seq=MockFunction('ASDAFGHJKL'), name='ref') mut_translation = Mock(get_aa_seq=MockFunction('AFGHJKL')) notation = call_protein_indel(ref_translation, mut_translation) - self.assertEqual('ref:p.A1_D3delASD', notation) + assert notation == 'ref:p.A1_D3delASD' diff --git a/tests/unit/test_cluster.py b/tests/unit/test_cluster.py index a7cb6c2d..43172ea9 100644 --- a/tests/unit/test_cluster.py +++ b/tests/unit/test_cluster.py @@ -1,29 +1,30 @@ import unittest +import pytest from mavis.cluster.cluster import merge_integer_intervals from mavis.interval import Interval -class TestMergeIntegerIntervals(unittest.TestCase): +class TestMergeIntegerIntervals: def test_varying_lengths(self): m = merge_integer_intervals((1, 2), (1, 9), (2, 10), weight_adjustment=0) - self.assertEqual(Interval(1, 4), m) + assert m == Interval(1, 4) def test_same_length(self): m = merge_integer_intervals((1, 1), (10, 10)) - self.assertEqual(Interval(6), m) + assert m == Interval(6) def test_empty_list_error(self): - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): merge_integer_intervals() def test_identical_even_length(self): m = merge_integer_intervals((1, 2), (1, 2), (1, 2)) - self.assertEqual(Interval(1, 2), m) + assert 
m == Interval(1, 2) def test_identical_odd_length(self): m = merge_integer_intervals((1, 3), (1, 3), (1, 3)) - self.assertEqual(Interval(1, 3), m) + assert m == Interval(1, 3) if __name__ == '__main__': diff --git a/tests/unit/test_constants.py b/tests/unit/test_constants.py index b69571db..c602c7ee 100644 --- a/tests/unit/test_constants.py +++ b/tests/unit/test_constants.py @@ -1,54 +1,47 @@ -import unittest +from mavis.constants import COLUMNS, ORIENT, STRAND, reverse_complement, sort_columns, translate -from mavis.constants import ( - COLUMNS, - ORIENT, - STRAND, - MavisNamespace, - reverse_complement, - sort_columns, - translate, -) - -class TestConstants(unittest.TestCase): +class TestConstants: def test_strand_compare(self): - self.assertTrue(STRAND.compare(STRAND.NS, STRAND.POS)) - self.assertTrue(STRAND.compare(STRAND.NS, STRAND.NEG)) - self.assertTrue(STRAND.compare(STRAND.POS, STRAND.POS)) - self.assertTrue(STRAND.compare(STRAND.NEG, STRAND.NEG)) - self.assertFalse(STRAND.compare(STRAND.POS, STRAND.NEG)) - self.assertFalse(STRAND.compare(STRAND.NEG, STRAND.POS)) + assert STRAND.compare(STRAND.NS, STRAND.POS) + assert STRAND.compare(STRAND.NS, STRAND.NEG) + assert STRAND.compare(STRAND.POS, STRAND.POS) + assert STRAND.compare(STRAND.NEG, STRAND.NEG) + assert not STRAND.compare(STRAND.POS, STRAND.NEG) + assert not STRAND.compare(STRAND.NEG, STRAND.POS) def test_orient_compare(self): - self.assertTrue(ORIENT.compare(ORIENT.NS, ORIENT.RIGHT)) - self.assertTrue(ORIENT.compare(ORIENT.NS, ORIENT.LEFT)) - self.assertTrue(ORIENT.compare(ORIENT.RIGHT, ORIENT.RIGHT)) - self.assertTrue(ORIENT.compare(ORIENT.LEFT, ORIENT.LEFT)) - self.assertFalse(ORIENT.compare(ORIENT.RIGHT, ORIENT.LEFT)) - self.assertFalse(ORIENT.compare(ORIENT.LEFT, ORIENT.RIGHT)) + assert ORIENT.compare(ORIENT.NS, ORIENT.RIGHT) + assert ORIENT.compare(ORIENT.NS, ORIENT.LEFT) + assert ORIENT.compare(ORIENT.RIGHT, ORIENT.RIGHT) + assert ORIENT.compare(ORIENT.LEFT, ORIENT.LEFT) + assert not 
ORIENT.compare(ORIENT.RIGHT, ORIENT.LEFT) + assert not ORIENT.compare(ORIENT.LEFT, ORIENT.RIGHT) def test_reverse_complement(self): - self.assertEqual('ATCG', reverse_complement('CGAT')) - self.assertEqual('', reverse_complement('')) + assert reverse_complement('CGAT') == 'ATCG' + assert reverse_complement('') == '' def test_translate(self): seq = 'ATG' 'AAT' 'TCT' 'GGA' 'TGA' translated_seq = translate(seq, 0) - self.assertEqual('MNSG*', translated_seq) # ATG AAT TCT GGA TGA + assert translated_seq == 'MNSG*' # ATG AAT TCT GGA TGA translated_seq = translate(seq, 1) - self.assertEqual('*ILD', translated_seq) # A TGA ATT CTG GAT GA + assert translated_seq == '*ILD' # A TGA ATT CTG GAT GA translated_seq = translate(seq, 2) - self.assertEqual('EFWM', translated_seq) # AT GAA TTC TGG ATG A + assert translated_seq == 'EFWM' # AT GAA TTC TGG ATG A def test_sort_columns(self): temp = ['NEW', 'NEW2', COLUMNS.break1_seq, COLUMNS.break2_seq, COLUMNS.break1_chromosome] - self.assertEqual( - [COLUMNS.break1_chromosome, COLUMNS.break1_seq, COLUMNS.break2_seq, 'NEW', 'NEW2'], - sort_columns(temp), - ) + assert sort_columns(temp) == [ + COLUMNS.break1_chromosome, + COLUMNS.break1_seq, + COLUMNS.break2_seq, + 'NEW', + 'NEW2', + ] def test_column_matches_column_name(self): - self.assertEqual(COLUMNS.library, COLUMNS.library) + assert COLUMNS.library == COLUMNS.library s = set([COLUMNS.library, COLUMNS.library]) - self.assertEqual(1, len(s)) + assert len(s) == 1 diff --git a/tests/unit/test_illustrate.py b/tests/unit/test_illustrate.py index 9968d292..db5f77f2 100644 --- a/tests/unit/test_illustrate.py +++ b/tests/unit/test_illustrate.py @@ -1,9 +1,8 @@ -import unittest from mavis.illustrate.util import generate_interval_mapping from mavis.interval import Interval -class TestGenerateIntervalMapping(unittest.TestCase): +class TestGenerateIntervalMapping: def test_single_bp_window(self): regions = [ Interval(4222347, 4222347), @@ -20,7 +19,7 @@ def test_single_bp_window(self): mapping 
= generate_interval_mapping( regions, target, ratio, min_width, buffer_, start, end, min_inter ) - self.assertEqual(7, len(mapping.keys())) + assert len(mapping.keys()) == 7 def test_no_input_intervals(self): target = 911.9921875 @@ -33,4 +32,4 @@ def test_no_input_intervals(self): mapping = generate_interval_mapping( [], target, ratio, min_width, buffer_, start, end, min_inter ) - self.assertEqual(1, len(mapping.keys())) + assert len(mapping.keys()) == 1 diff --git a/tests/unit/test_interval.py b/tests/unit/test_interval.py index ae6d0ff0..1b130201 100644 --- a/tests/unit/test_interval.py +++ b/tests/unit/test_interval.py @@ -1,175 +1,173 @@ -import unittest +import pytest from mavis.interval import Interval, IntervalMapping -class TestInterval(unittest.TestCase): +class TestInterval: def test___init__error(self): - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): Interval(4, 3) - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): Interval(3, 4, 0) def test___contains__(self): - self.assertTrue(Interval(1, 2) in Interval(1, 7)) - self.assertFalse(Interval(1, 7) in Interval(1, 2)) - self.assertTrue(Interval(1.0, 2) in Interval(1.0, 7)) - self.assertFalse(Interval(1, 7) in Interval(1, 2)) - self.assertTrue(1 in Interval(1, 7)) - self.assertFalse(0 in Interval(1, 7)) + assert Interval(1, 2) in Interval(1, 7) + assert not Interval(1, 7) in Interval(1, 2) + assert Interval(1.0, 2) in Interval(1.0, 7) + assert not Interval(1, 7) in Interval(1, 2) + assert 1 in Interval(1, 7) + assert 0 not in Interval(1, 7) def test_eq(self): - self.assertEqual(Interval(1, 2), Interval(1, 2)) - self.assertEqual(Interval(1, 2), Interval(1, 2)) + assert Interval(1, 2) == Interval(1, 2) + assert Interval(1, 2) == Interval(1, 2) def test_ne(self): - self.assertNotEqual(Interval(1, 2), Interval(1, 3)) - self.assertNotEqual(Interval(1, 2), Interval(1, 3)) + assert Interval(1, 2) != Interval(1, 3) + assert Interval(1, 2) != Interval(1, 3) 
def test___get_item__(self): temp = Interval(1, 2, 3) - self.assertEqual(1, temp[0]) - self.assertEqual(2, temp[1]) - with self.assertRaises(IndexError): + assert temp[0] == 1 + assert temp[1] == 2 + with pytest.raises(IndexError): temp[3] - with self.assertRaises(IndexError): + with pytest.raises(IndexError): temp[-1] - with self.assertRaises(IndexError): + with pytest.raises(IndexError): temp['1b'] def test___gt__(self): - self.assertTrue(Interval(10) > Interval(1)) - self.assertFalse(Interval(1) > Interval(10)) - self.assertTrue(Interval(10) > Interval(1)) - self.assertFalse(Interval(1) > Interval(1.01)) + assert Interval(10) > Interval(1) + assert not Interval(1) > Interval(10) + assert Interval(10) > Interval(1) + assert not Interval(1) > Interval(1.01) def test_overlaps(self): left = Interval(-4, 1) middle = Interval(0, 10) right = Interval(5, 12) - self.assertFalse(Interval.overlaps(left, right)) - self.assertFalse(Interval.overlaps(right, left)) - self.assertTrue(Interval.overlaps(left, middle)) - self.assertTrue(Interval.overlaps(right, middle)) - self.assertTrue(Interval.overlaps(middle, left)) - self.assertTrue(Interval.overlaps(middle, right)) - self.assertTrue(Interval.overlaps((1, 2), (2, 5))) + assert not Interval.overlaps(left, right) + assert not Interval.overlaps(right, left) + assert Interval.overlaps(left, middle) + assert Interval.overlaps(right, middle) + assert Interval.overlaps(middle, left) + assert Interval.overlaps(middle, right) + assert Interval.overlaps((1, 2), (2, 5)) left = Interval(1148432, 1149343) right = Interval(1149493, 1150024) - self.assertFalse(Interval.overlaps(left, right)) + assert not Interval.overlaps(left, right) left = Interval(-4, 0.1) middle = Interval(0, 10) right = Interval(0.11, 12) - self.assertFalse(Interval.overlaps(left, right)) - self.assertFalse(Interval.overlaps(right, left)) - self.assertTrue(Interval.overlaps(left, middle)) - self.assertTrue(Interval.overlaps(right, middle)) - 
self.assertTrue(Interval.overlaps(middle, left)) - self.assertTrue(Interval.overlaps(middle, right)) + assert not Interval.overlaps(left, right) + assert not Interval.overlaps(right, left) + assert Interval.overlaps(left, middle) + assert Interval.overlaps(right, middle) + assert Interval.overlaps(middle, left) + assert Interval.overlaps(middle, right) def test___len__(self): - self.assertEqual(5, len(Interval(1, 5))) - with self.assertRaises(TypeError): + assert len(Interval(1, 5)) == 5 + with pytest.raises(TypeError): len(Interval(1, 5.0)) - self.assertEqual(4.0, Interval(1, 5.0).length()) + assert Interval(1, 5.0).length() == 4.0 def test___lt__(self): - self.assertTrue(Interval(1) < Interval(10)) - self.assertFalse(Interval(10) < Interval(1)) + assert Interval(1) < Interval(10) + assert not Interval(10) < Interval(1) def test___and__(self): - self.assertEqual(None, Interval(1, 1) & Interval(2)) + assert Interval(1, 1) & Interval(2) is None def test___sub__(self): # x in y - self.assertEqual([Interval(0, 4), Interval(7, 10)], Interval(0, 10) - Interval(5, 6)) + assert Interval(0, 10) - Interval(5, 6) == [Interval(0, 4), Interval(7, 10)] # x overlaps the start of y - self.assertEqual([Interval(7, 10)], Interval(0, 10) - Interval(-1, 6)) + assert Interval(0, 10) - Interval(-1, 6) == [Interval(7, 10)] # x overlaps the end of y - self.assertEqual([Interval(0, 4)], Interval(0, 10) - Interval(5, 11)) + assert Interval(0, 10) - Interval(5, 11) == [Interval(0, 4)] # x overlaps all of y - self.assertEqual([], Interval(0, 10) - Interval(-1, 11)) + assert Interval(0, 10) - Interval(-1, 11) == [] # x does not overlap y - self.assertEqual([Interval(0, 10)], Interval(0, 10) - Interval(11, 15)) + assert Interval(0, 10) - Interval(11, 15) == [Interval(0, 10)] def test___xor__(self): # x in y - self.assertEqual([], Interval(0, 10) ^ Interval(0, 10)) + assert Interval(0, 10) ^ Interval(0, 10) == [] # x overlaps the start of y - self.assertEqual([Interval(7, 10), Interval(-1, 
-1)], Interval(0, 10) ^ Interval(-1, 6)) + assert Interval(0, 10) ^ Interval(-1, 6) == [Interval(7, 10), Interval(-1, -1)] # x overlaps the end of y - self.assertEqual([Interval(0, 4), Interval(11, 11)], Interval(0, 10) ^ Interval(5, 11)) + assert Interval(0, 10) ^ Interval(5, 11) == [Interval(0, 4), Interval(11, 11)] # x overlaps all of y - self.assertEqual([Interval(-1, -1), Interval(11, 11)], Interval(0, 10) ^ Interval(-1, 11)) + assert Interval(0, 10) ^ Interval(-1, 11) == [Interval(-1, -1), Interval(11, 11)] # x does not overlap y - self.assertEqual([Interval(0, 10), Interval(11, 15)], Interval(0, 10) ^ Interval(11, 15)) + assert Interval(0, 10) ^ Interval(11, 15) == [Interval(0, 10), Interval(11, 15)] def test_center(self): - self.assertEqual(3, Interval(1, 5).center) - self.assertEqual(3.5, Interval(2, 5).center) + assert Interval(1, 5).center == 3 + assert Interval(2, 5).center == 3.5 def test_position_in_range(self): pos = (12, 12) - self.assertEqual((2, False), Interval.position_in_range([(1, 2), (3, 6), (7, 15)], pos)) - self.assertEqual( - (3, True), Interval.position_in_range([(1, 2), (3, 6), (7, 10), (14, 16)], pos) - ) - self.assertEqual((3, False), Interval.position_in_range([(1, 2), (3, 6), (7, 10)], pos)) - self.assertEqual((0, True), Interval.position_in_range([(15, 16), (17, 19)], pos)) + assert Interval.position_in_range([(1, 2), (3, 6), (7, 15)], pos) == (2, False) + assert Interval.position_in_range([(1, 2), (3, 6), (7, 10), (14, 16)], pos) == (3, True) + assert Interval.position_in_range([(1, 2), (3, 6), (7, 10)], pos) == (3, False) + assert Interval.position_in_range([(15, 16), (17, 19)], pos) == (0, True) - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): Interval.position_in_range([], 1) def test_convert_pos(self): mapping = {(1, 10): (101, 110), (21, 30): (201, 210), (41, 50): (301, 310)} - self.assertEqual(105, Interval.convert_pos(mapping, 5)) - self.assertEqual(101, Interval.convert_pos(mapping, 1)) - 
self.assertEqual(310, Interval.convert_pos(mapping, 50)) + assert Interval.convert_pos(mapping, 5) == 105 + assert Interval.convert_pos(mapping, 1) == 101 + assert Interval.convert_pos(mapping, 50) == 310 - with self.assertRaises(IndexError): + with pytest.raises(IndexError): Interval.convert_pos(mapping, 15) - with self.assertRaises(IndexError): + with pytest.raises(IndexError): Interval.convert_pos(mapping, 0) - with self.assertRaises(IndexError): + with pytest.raises(IndexError): Interval.convert_pos(mapping, 80) def test_convert_pos_forward_to_reverse(self): mapping = {(41, 50): (101, 110), (21, 30): (201, 210), (1, 10): (301, 310)} - self.assertEqual(306, Interval.convert_pos(mapping, 5)) - self.assertEqual(110, Interval.convert_pos(mapping, 41)) - self.assertEqual(210, Interval.convert_pos(mapping, 21)) - self.assertEqual(310, Interval.convert_pos(mapping, 1)) - self.assertEqual(309, Interval.convert_pos(mapping, 2)) + assert Interval.convert_pos(mapping, 5) == 306 + assert Interval.convert_pos(mapping, 41) == 110 + assert Interval.convert_pos(mapping, 21) == 210 + assert Interval.convert_pos(mapping, 1) == 310 + assert Interval.convert_pos(mapping, 2) == 309 - with self.assertRaises(IndexError): + with pytest.raises(IndexError): Interval.convert_pos(mapping, 15) - with self.assertRaises(IndexError): + with pytest.raises(IndexError): Interval.convert_pos(mapping, 51) - with self.assertRaises(IndexError): + with pytest.raises(IndexError): Interval.convert_pos(mapping, 0) - with self.assertRaises(IndexError): + with pytest.raises(IndexError): Interval.convert_pos(mapping, 31) def test_convert_pos_input_errors(self): # test input errors - with self.assertRaises(AttributeError): # unequal length + with pytest.raises(AttributeError): # unequal length Interval.convert_pos({(1, 10): (4, 5)}, 3) - with self.assertRaises(AttributeError): # overlapping ranges + with pytest.raises(AttributeError): # overlapping ranges Interval.convert_pos({(1, 10): (11, 20), (5, 14): 
(21, 30)}, 6) - with self.assertRaises(AttributeError): # range not increasing or decreasing + with pytest.raises(AttributeError): # range not increasing or decreasing mapping = {(1, 2): (1, 2), (3, 4): (4, 5), (5, 6): (3, 3)} Interval.convert_pos(mapping, 10) - with self.assertRaises(AttributeError): # range not increasing or decreasing + with pytest.raises(AttributeError): # range not increasing or decreasing mapping = {(1, 2): (4, 5), (3, 4): (1, 2), (5, 6): (3, 3)} Interval.convert_pos(mapping, 10) @@ -179,7 +177,7 @@ def test_convert_pos_one_to_one(self): s = x * 10 + 1 mapping[Interval(s, s + 9)] = Interval(s, s + 9) for pos in range(1, 101): - self.assertEqual(pos, Interval.convert_pos(mapping, pos)) + assert Interval.convert_pos(mapping, pos) == pos def test_convert_pos_ratioed_intervals(self): mapping = { @@ -189,59 +187,59 @@ def test_convert_pos_ratioed_intervals(self): (601.0, 900): (52, 57.0), (901.0, 1100): (58.0, 100), } - self.assertEqual(Interval(1), Interval.convert_ratioed_pos(mapping, 1)) - self.assertEqual(Interval(20), Interval.convert_ratioed_pos(mapping, 100)) - self.assertEqual(Interval(100, 100), Interval.convert_ratioed_pos(mapping, 1100)) + assert Interval.convert_ratioed_pos(mapping, 1) == Interval(1) + assert Interval.convert_ratioed_pos(mapping, 100) == Interval(20) + assert Interval.convert_ratioed_pos(mapping, 1100) == Interval(100, 100) mapping = {(1, 100): (1, 1), (101, 500): (21, 30)} - self.assertEqual(Interval(1, 1), Interval.convert_ratioed_pos(mapping, 1)) - self.assertEqual(Interval(1, 1), Interval.convert_ratioed_pos(mapping, 100)) + assert Interval.convert_ratioed_pos(mapping, 1) == Interval(1, 1) + assert Interval.convert_ratioed_pos(mapping, 100) == Interval(1, 1) mapping = {(1, 100.0): (20.0, 30), (100.1, 500): (1.0, 1.0)} - self.assertEqual(Interval(1, 1), Interval.convert_ratioed_pos(mapping, 101)) - self.assertEqual(Interval(1, 1), Interval.convert_ratioed_pos(mapping, 500)) - self.assertEqual(Interval(25, 25), 
Interval.convert_ratioed_pos(mapping, 50)) + assert Interval.convert_ratioed_pos(mapping, 101) == Interval(1, 1) + assert Interval.convert_ratioed_pos(mapping, 500) == Interval(1, 1) + assert Interval.convert_ratioed_pos(mapping, 50) == Interval(25, 25) def test_union(self): interval_list = [Interval(1, 10), Interval(5, 7), Interval(7)] - self.assertEqual(Interval(1, 10), Interval.union(*interval_list)) + assert Interval.union(*interval_list) == Interval(1, 10) m = interval_list + [Interval(11)] - self.assertEqual(Interval(1, 11), Interval.union(*m)) + assert Interval.union(*m) == Interval(1, 11) - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): Interval.union() def test_intersection(self): interval_list = [Interval(1, 10), Interval(5, 7), Interval(7)] - self.assertEqual(Interval(7), Interval.intersection(*interval_list)) + assert Interval.intersection(*interval_list) == Interval(7) interval_list.append(Interval(11)) - self.assertEqual(None, Interval.intersection(*interval_list)) + assert Interval.intersection(*interval_list) is None - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): Interval.intersection() def test_dist(self): x = Interval(1, 4) y = Interval(-1, 0) z = Interval(0, 3) - self.assertEqual(1, Interval.dist(x, y)) - self.assertEqual(-1, Interval.dist(y, x)) - self.assertEqual(0, Interval.dist(x, z)) - self.assertEqual(0, Interval.dist(z, x)) - self.assertEqual(0, Interval.dist(y, z)) - self.assertEqual(0, Interval.dist(z, y)) - self.assertEqual(-6, Interval.dist((1, 4), (10, 12))) + assert Interval.dist(x, y) == 1 + assert Interval.dist(y, x) == -1 + assert Interval.dist(x, z) == 0 + assert Interval.dist(z, x) == 0 + assert Interval.dist(y, z) == 0 + assert Interval.dist(z, y) == 0 + assert Interval.dist((1, 4), (10, 12)) == -6 def test_min_nonoverlapping(self): r = Interval.min_nonoverlapping(Interval(1, 2), Interval(4, 7), Interval(8, 9)) - self.assertEqual(3, len(r)) + assert len(r) == 3 r = 
Interval.min_nonoverlapping(Interval(1, 5), Interval(4, 7), Interval(8, 9)) - self.assertEqual(2, len(r)) + assert len(r) == 2 r = Interval.min_nonoverlapping(Interval(1, 5), Interval(4, 7), Interval(7, 9)) - self.assertEqual([Interval(1, 9)], r) + assert r == [Interval(1, 9)] r = Interval.min_nonoverlapping((1, 2), (2, 4)) - self.assertEqual([Interval(1, 4)], r) - self.assertEqual([], Interval.min_nonoverlapping()) + assert r == [Interval(1, 4)] + assert Interval.min_nonoverlapping() == [] def test_split_overlapping_no_weight(self): input_intervals = [Interval(1, 10), Interval(2, 11), Interval(4, 5), Interval(4, 8)] @@ -257,7 +255,7 @@ def test_split_overlapping_no_weight(self): result = Interval.split_overlap(*input_intervals) result = sorted(result) print('found', result) - self.assertEqual(exp, result) + assert result == exp def test_split_overlapping_weighted(self): input_intervals = [Interval(1, 10), Interval(2, 11), Interval(4, 5), Interval(4, 8)] @@ -271,12 +269,12 @@ def test_split_overlapping_weighted(self): Interval(10, 11): 4, } result = Interval.split_overlap(*input_intervals, weight_mapping=weights) - self.assertEqual(sorted(exp), sorted(result)) + assert sorted(result) == sorted(exp) for itvl in exp: - self.assertEqual(exp[itvl], result[itvl]) + assert result[itvl] == exp[itvl] -class TestIntervalMapping(unittest.TestCase): +class TestIntervalMapping: def test_convert_pos_ratioed(self): mapping = IntervalMapping( { @@ -287,37 +285,37 @@ def test_convert_pos_ratioed(self): (901.0, 1100): (58.0, 100), } ) - self.assertEqual(1, mapping.convert_pos(1)) - self.assertEqual(1, mapping.convert_ratioed_pos(1).start) - self.assertAlmostEqual(1.191919191919, mapping.convert_ratioed_pos(1).end) - self.assertEqual(20, mapping.convert_pos(100)) - self.assertEqual(20, mapping.convert_ratioed_pos(100).start) - self.assertEqual(100, mapping.convert_pos(1100)) - self.assertEqual(100, mapping.convert_ratioed_pos(1100).start) + assert mapping.convert_pos(1) == 1 + 
assert mapping.convert_ratioed_pos(1).start == 1 + assert pytest.approx(mapping.convert_ratioed_pos(1).end) == 1.191919191919 + assert mapping.convert_pos(100) == 20 + assert mapping.convert_ratioed_pos(100).start == 20 + assert mapping.convert_pos(1100) == 100 + assert mapping.convert_ratioed_pos(1100).start == 100 mapping = IntervalMapping({(1, 100): (1, 1.0), (101, 500): (21.0, 30)}) - self.assertEqual(1, mapping.convert_pos(1)) - self.assertEqual(1, mapping.convert_pos(100)) + assert mapping.convert_pos(1) == 1 + assert mapping.convert_pos(100) == 1 mapping = IntervalMapping({(1, 100.0): (20.0, 30), (100.1, 500): (1.0, 1.0)}) - self.assertEqual(1, mapping.convert_pos(101)) - self.assertEqual(1, mapping.convert_pos(500)) - self.assertEqual(25, mapping.convert_pos(50)) + assert mapping.convert_pos(101) == 1 + assert mapping.convert_pos(500) == 1 + assert mapping.convert_pos(50) == 25 def test_convert_pos(self): mapping = IntervalMapping({(1, 10): (101, 110), (21, 30): (201, 210), (41, 50): (301, 310)}) - self.assertEqual(105, mapping.convert_pos(5)) - self.assertEqual(101, mapping.convert_pos(1)) - self.assertEqual(310, mapping.convert_pos(50)) + assert mapping.convert_pos(5) == 105 + assert mapping.convert_pos(1) == 101 + assert mapping.convert_pos(50) == 310 - with self.assertRaises(IndexError): + with pytest.raises(IndexError): mapping.convert_pos(15) - with self.assertRaises(IndexError): + with pytest.raises(IndexError): mapping.convert_pos(0) - with self.assertRaises(IndexError): + with pytest.raises(IndexError): mapping.convert_pos(80) def test_convert_pos_forward_to_reverse(self): @@ -326,22 +324,22 @@ def test_convert_pos_forward_to_reverse(self): opposing=[(41, 50), (21, 30), (1, 10)], ) - self.assertEqual(306, mapping.convert_pos(5)) - self.assertEqual(110, mapping.convert_pos(41)) - self.assertEqual(210, mapping.convert_pos(21)) - self.assertEqual(310, mapping.convert_pos(1)) - self.assertEqual(309, mapping.convert_pos(2)) + assert 
mapping.convert_pos(5) == 306 + assert mapping.convert_pos(41) == 110 + assert mapping.convert_pos(21) == 210 + assert mapping.convert_pos(1) == 310 + assert mapping.convert_pos(2) == 309 - with self.assertRaises(IndexError): + with pytest.raises(IndexError): mapping.convert_pos(15) - with self.assertRaises(IndexError): + with pytest.raises(IndexError): mapping.convert_pos(51) - with self.assertRaises(IndexError): + with pytest.raises(IndexError): mapping.convert_pos(0) - with self.assertRaises(IndexError): + with pytest.raises(IndexError): mapping.convert_pos(31) def test_convert_pos_one_to_one(self): @@ -351,4 +349,4 @@ def test_convert_pos_one_to_one(self): mapping[Interval(s, s + 9)] = Interval(s, s + 9) mapping = IntervalMapping(mapping) for pos in range(1, 101): - self.assertEqual(pos, mapping.convert_pos(pos)) + assert mapping.convert_pos(pos) == pos diff --git a/tests/unit/test_summary.py b/tests/unit/test_summary.py index f2a81ef8..3e2a9efc 100644 --- a/tests/unit/test_summary.py +++ b/tests/unit/test_summary.py @@ -1,146 +1,176 @@ -import unittest - +import pytest from mavis.breakpoint import Breakpoint, BreakpointPair from mavis.constants import CALL_METHOD, COLUMNS, PROTOCOL, STRAND, SVTYPE from mavis.summary.summary import filter_by_annotations - -class TestFilterByAnnotations(unittest.TestCase): - def setUp(self): - self.gev1 = BreakpointPair( - Breakpoint('1', 1), - Breakpoint('1', 10), - opposing_strands=True, - **{ - COLUMNS.event_type: SVTYPE.DEL, - COLUMNS.call_method: CALL_METHOD.CONTIG, - COLUMNS.fusion_sequence_fasta_id: None, - COLUMNS.protocol: PROTOCOL.GENOME, - COLUMNS.fusion_cdna_coding_end: None, - COLUMNS.fusion_cdna_coding_start: None, - } - ) - self.gev2 = BreakpointPair( - Breakpoint('1', 1), - Breakpoint('1', 100), - opposing_strands=True, - **{ - COLUMNS.event_type: SVTYPE.DEL, - COLUMNS.call_method: CALL_METHOD.CONTIG, - COLUMNS.fusion_sequence_fasta_id: None, - COLUMNS.protocol: PROTOCOL.GENOME, - 
COLUMNS.fusion_cdna_coding_start: None, - COLUMNS.fusion_cdna_coding_end: None, - } - ) - self.best_transcripts = {'ABCA': True, 'ABCD': True} - - def test_filter_by_annotations_two_best_transcripts(self): - self.gev1.data[COLUMNS.gene1] = 'ABC' - self.gev1.data[COLUMNS.gene2] = 'ABC' - self.gev1.data[COLUMNS.transcript1] = 'ABCA' - self.gev1.data[COLUMNS.transcript2] = 'ABCA' - self.gev2.data[COLUMNS.gene1] = 'ABC' - self.gev2.data[COLUMNS.gene2] = 'ABC' - self.gev2.data[COLUMNS.transcript1] = 'ABCD' - self.gev2.data[COLUMNS.transcript2] = 'ABCD' - result, removed = filter_by_annotations([self.gev1, self.gev2], self.best_transcripts) +from ..util import todo + + +@pytest.fixture +def genomic_event1(): + return BreakpointPair( + Breakpoint('1', 1), + Breakpoint('1', 10), + opposing_strands=True, + **{ + COLUMNS.event_type: SVTYPE.DEL, + COLUMNS.call_method: CALL_METHOD.CONTIG, + COLUMNS.fusion_sequence_fasta_id: None, + COLUMNS.protocol: PROTOCOL.GENOME, + COLUMNS.fusion_cdna_coding_end: None, + COLUMNS.fusion_cdna_coding_start: None, + } + ) + + +@pytest.fixture +def genomic_event2(): + return BreakpointPair( + Breakpoint('1', 1), + Breakpoint('1', 100), + opposing_strands=True, + **{ + COLUMNS.event_type: SVTYPE.DEL, + COLUMNS.call_method: CALL_METHOD.CONTIG, + COLUMNS.fusion_sequence_fasta_id: None, + COLUMNS.protocol: PROTOCOL.GENOME, + COLUMNS.fusion_cdna_coding_start: None, + COLUMNS.fusion_cdna_coding_end: None, + } + ) + + +@pytest.fixture +def best_transcripts(): + return {'ABCA': True, 'ABCD': True} + + +class TestFilterByAnnotations: + def test_filter_by_annotations_two_best_transcripts( + self, genomic_event1, genomic_event2, best_transcripts + ): + genomic_event1.data[COLUMNS.gene1] = 'ABC' + genomic_event1.data[COLUMNS.gene2] = 'ABC' + genomic_event1.data[COLUMNS.transcript1] = 'ABCA' + genomic_event1.data[COLUMNS.transcript2] = 'ABCA' + genomic_event2.data[COLUMNS.gene1] = 'ABC' + genomic_event2.data[COLUMNS.gene2] = 'ABC' + 
genomic_event2.data[COLUMNS.transcript1] = 'ABCD' + genomic_event2.data[COLUMNS.transcript2] = 'ABCD' + result, removed = filter_by_annotations([genomic_event1, genomic_event2], best_transcripts) bpp = result[0] print(bpp.data) - self.assertEqual(self.gev1, bpp) - self.assertEqual('ABCA', bpp.data[COLUMNS.transcript1]) - - def test_filter_by_annotations_two_transcripts(self): - self.gev1.data[COLUMNS.gene1] = 'XYZ' - self.gev1.data[COLUMNS.gene2] = 'XYS' - self.gev1.data[COLUMNS.transcript1] = 'XYZB' - self.gev1.data[COLUMNS.transcript2] = 'XYSZ' - self.gev2.data[COLUMNS.gene1] = 'XYZ' - self.gev2.data[COLUMNS.gene2] = 'XYS' - self.gev2.data[COLUMNS.transcript1] = 'XYZA' - self.gev2.data[COLUMNS.transcript2] = 'XYSB' - bpps, removed = filter_by_annotations([self.gev1, self.gev2], self.best_transcripts) + assert bpp == genomic_event1 + assert bpp.data[COLUMNS.transcript1] == 'ABCA' + + def test_filter_by_annotations_two_transcripts( + self, genomic_event1, genomic_event2, best_transcripts + ): + genomic_event1.data[COLUMNS.gene1] = 'XYZ' + genomic_event1.data[COLUMNS.gene2] = 'XYS' + genomic_event1.data[COLUMNS.transcript1] = 'XYZB' + genomic_event1.data[COLUMNS.transcript2] = 'XYSZ' + genomic_event2.data[COLUMNS.gene1] = 'XYZ' + genomic_event2.data[COLUMNS.gene2] = 'XYS' + genomic_event2.data[COLUMNS.transcript1] = 'XYZA' + genomic_event2.data[COLUMNS.transcript2] = 'XYSB' + bpps, removed = filter_by_annotations([genomic_event1, genomic_event2], best_transcripts) print(bpps) bpp = bpps[0] print(bpp, bpp.data) - self.assertEqual(self.gev2, bpp) - self.assertEqual('XYZA', bpp.data[COLUMNS.transcript1]) - - def test_filter_by_annotations_two_fusion_cdna(self): - self.gev1.data[COLUMNS.gene1] = 'XYZ' - self.gev1.data[COLUMNS.gene2] = 'XYS' - self.gev1.data[COLUMNS.transcript1] = 'XYZB' - self.gev1.data[COLUMNS.transcript2] = 'XYSZ' - self.gev2.data[COLUMNS.gene1] = 'XYZ' - self.gev2.data[COLUMNS.gene2] = 'XYS' - self.gev2.data[COLUMNS.transcript1] = 'XYZB' - 
self.gev2.data[COLUMNS.transcript2] = 'XYSZ' - self.gev1.data[COLUMNS.fusion_cdna_coding_start] = 1 - self.gev1.data[COLUMNS.fusion_cdna_coding_end] = 20 - self.gev2.data[COLUMNS.fusion_cdna_coding_start] = 1 - self.gev2.data[COLUMNS.fusion_cdna_coding_end] = 40 - result, removed = filter_by_annotations([self.gev1, self.gev2], self.best_transcripts) + assert bpp == genomic_event2 + assert bpp.data[COLUMNS.transcript1] == 'XYZA' + + def test_filter_by_annotations_two_fusion_cdna( + self, genomic_event1, genomic_event2, best_transcripts + ): + genomic_event1.data[COLUMNS.gene1] = 'XYZ' + genomic_event1.data[COLUMNS.gene2] = 'XYS' + genomic_event1.data[COLUMNS.transcript1] = 'XYZB' + genomic_event1.data[COLUMNS.transcript2] = 'XYSZ' + genomic_event2.data[COLUMNS.gene1] = 'XYZ' + genomic_event2.data[COLUMNS.gene2] = 'XYS' + genomic_event2.data[COLUMNS.transcript1] = 'XYZB' + genomic_event2.data[COLUMNS.transcript2] = 'XYSZ' + genomic_event1.data[COLUMNS.fusion_cdna_coding_start] = 1 + genomic_event1.data[COLUMNS.fusion_cdna_coding_end] = 20 + genomic_event2.data[COLUMNS.fusion_cdna_coding_start] = 1 + genomic_event2.data[COLUMNS.fusion_cdna_coding_end] = 40 + result, removed = filter_by_annotations([genomic_event1, genomic_event2], best_transcripts) bpp = result[0] - self.assertEqual(self.gev2, bpp) - - def test_filter_by_annotations_one_transcript(self): - self.gev1.data[COLUMNS.gene1] = None - self.gev1.data[COLUMNS.gene2] = 'XYS' - self.gev1.data[COLUMNS.transcript1] = None - self.gev1.data[COLUMNS.transcript2] = 'XYSZ' - self.gev2.data[COLUMNS.gene1] = 'XYZ' - self.gev2.data[COLUMNS.gene2] = 'XYS' - self.gev2.data[COLUMNS.transcript1] = 'XYZA' - self.gev2.data[COLUMNS.transcript2] = 'XYSB' - result, removed = filter_by_annotations([self.gev1, self.gev2], self.best_transcripts) + assert bpp == genomic_event2 + + def test_filter_by_annotations_one_transcript( + self, genomic_event1, genomic_event2, best_transcripts + ): + genomic_event1.data[COLUMNS.gene1] = None + 
genomic_event1.data[COLUMNS.gene2] = 'XYS' + genomic_event1.data[COLUMNS.transcript1] = None + genomic_event1.data[COLUMNS.transcript2] = 'XYSZ' + genomic_event2.data[COLUMNS.gene1] = 'XYZ' + genomic_event2.data[COLUMNS.gene2] = 'XYS' + genomic_event2.data[COLUMNS.transcript1] = 'XYZA' + genomic_event2.data[COLUMNS.transcript2] = 'XYSB' + result, removed = filter_by_annotations([genomic_event1, genomic_event2], best_transcripts) bpp = result[0] - self.assertEqual(self.gev2, bpp) - - def test_filter_by_annotations_one_best_transcripts(self): - self.gev1.data[COLUMNS.gene1] = 'XYZ' - self.gev1.data[COLUMNS.gene2] = 'ABC' - self.gev1.data[COLUMNS.transcript1] = 'XYZB' - self.gev1.data[COLUMNS.transcript2] = 'ABCA' - self.gev2.data[COLUMNS.gene1] = 'XYZ' - self.gev2.data[COLUMNS.gene2] = 'ABC' - self.gev2.data[COLUMNS.transcript1] = 'XYZA' - self.gev2.data[COLUMNS.transcript2] = 'ABCB' - result, removed = filter_by_annotations([self.gev1, self.gev2], self.best_transcripts) + assert bpp == genomic_event2 + + def test_filter_by_annotations_one_best_transcripts( + self, genomic_event1, genomic_event2, best_transcripts + ): + genomic_event1.data[COLUMNS.gene1] = 'XYZ' + genomic_event1.data[COLUMNS.gene2] = 'ABC' + genomic_event1.data[COLUMNS.transcript1] = 'XYZB' + genomic_event1.data[COLUMNS.transcript2] = 'ABCA' + genomic_event2.data[COLUMNS.gene1] = 'XYZ' + genomic_event2.data[COLUMNS.gene2] = 'ABC' + genomic_event2.data[COLUMNS.transcript1] = 'XYZA' + genomic_event2.data[COLUMNS.transcript2] = 'ABCB' + result, removed = filter_by_annotations([genomic_event1, genomic_event2], best_transcripts) bpp = result[0] - self.assertEqual(self.gev1, bpp) - self.assertEqual('XYZB', bpp.data[COLUMNS.transcript1]) - - def test_filter_by_annotations_no_transcripts(self): - self.gev1.data[COLUMNS.gene1] = None - self.gev1.data[COLUMNS.gene2] = None - self.gev1.data[COLUMNS.transcript1] = None - self.gev1.data[COLUMNS.transcript2] = None - self.gev2.data[COLUMNS.gene1] = None - 
self.gev2.data[COLUMNS.gene2] = None - self.gev2.data[COLUMNS.transcript1] = None - self.gev2.data[COLUMNS.transcript2] = None - self.gev1.break1.strand = STRAND.POS - result, removed = filter_by_annotations([self.gev1, self.gev2], self.best_transcripts) + assert bpp == genomic_event1 + assert bpp.data[COLUMNS.transcript1] == 'XYZB' + + def test_filter_by_annotations_no_transcripts( + self, genomic_event1, genomic_event2, best_transcripts + ): + genomic_event1.data[COLUMNS.gene1] = None + genomic_event1.data[COLUMNS.gene2] = None + genomic_event1.data[COLUMNS.transcript1] = None + genomic_event1.data[COLUMNS.transcript2] = None + genomic_event2.data[COLUMNS.gene1] = None + genomic_event2.data[COLUMNS.gene2] = None + genomic_event2.data[COLUMNS.transcript1] = None + genomic_event2.data[COLUMNS.transcript2] = None + genomic_event1.break1.strand = STRAND.POS + result, removed = filter_by_annotations([genomic_event1, genomic_event2], best_transcripts) bpp = result[0] - self.assertEqual(None, bpp.data[COLUMNS.transcript1]) + assert bpp.data[COLUMNS.transcript1] is None + @todo def test_combine_events(self): - raise unittest.SkipTest('TODO') + pass + @todo def test_filtering_events_contigs(self): - raise unittest.SkipTest('TODO') + pass + @todo def test_filtering_events_none(self): - raise unittest.SkipTest('TODO') + pass + @todo def test_filtering_events_flanking(self): - raise unittest.SkipTest('TODO') + pass + @todo def test_filtering_events_spanning(self): - raise unittest.SkipTest('TODO') + pass + @todo def test_filtering_events_split(self): - raise unittest.SkipTest('TODO') + pass + @todo def test_get_pairing_state(self): - raise unittest.SkipTest('TODO') + pass diff --git a/tests/unit/test_tool.py b/tests/unit/test_tool.py index 72c0d2e9..96531588 100644 --- a/tests/unit/test_tool.py +++ b/tests/unit/test_tool.py @@ -39,14 +39,14 @@ def test_convert_insertion(self): assert bpp.break2.strand == STRAND.NS assert bpp.break2.chr == '1' assert bpp.event_type == 
SVTYPE.INS - assert bpp.untemplated_seq == None + assert bpp.untemplated_seq is None bpp_list = _convert_tool_row( _parse_vcf_record(row)[0], SUPPORTED_TOOL.DELLY, False, assume_no_untemplated=True ) assert len(bpp_list) == 1 bpp = bpp_list[0] - assert bpp.untemplated_seq == None + assert bpp.untemplated_seq is None assert bpp.untemplated_seq != '' def test_convert_convert_translocation(self): @@ -118,8 +118,8 @@ def test_convert_standard_event(self): assert bpp.break2.chr == 'chr13' assert bpp.break1.start == 114529969 assert bpp.break2.start == 114751269 - assert bpp.opposing_strands == False - assert bpp.stranded == True + assert bpp.opposing_strands is False + assert bpp.stranded is True def test_convert_translocation(self): row = { @@ -135,8 +135,8 @@ def test_convert_translocation(self): assert bpp.break2.chr == 'chr20' assert bpp.break1.start == 59445688 assert bpp.break2.start == 49411710 - assert bpp.opposing_strands == False - assert bpp.stranded == True + assert bpp.opposing_strands is False + assert bpp.stranded is True def test_malformed(self): row = {'FusionName': 'BCAS4--BCAS3', 'LeftBreakpoint': '', 'RightBreakpoint': None} @@ -163,8 +163,8 @@ def test_convert_stranded_indel_insertion(self): assert bpp.break1.start == 10015 assert bpp.break2.start == 10016 assert bpp.event_type == SVTYPE.INS - assert bpp.opposing_strands == False - assert bpp.stranded == True + assert bpp.opposing_strands is False + assert bpp.stranded is True assert bpp.untemplated_seq == 'AAT' def test_convert_indel_deletion(self): @@ -206,8 +206,8 @@ def test_convert_indel_unstranded_insertion(self): assert bpp.event_type == SVTYPE.INS assert bpp.break1.strand == STRAND.NS assert bpp.break2.strand == STRAND.NS - assert bpp.stranded == False - assert bpp.opposing_strands == False + assert bpp.stranded is False + assert bpp.opposing_strands is False assert bpp.untemplated_seq == 'TT' def test_convert_indel_duplication(self): @@ -229,8 +229,8 @@ def 
test_convert_indel_duplication(self): assert bpp.event_type == SVTYPE.DUP assert bpp.break1.strand == STRAND.NS assert bpp.break2.strand == STRAND.NS - assert bpp.stranded == False - assert bpp.opposing_strands == False + assert bpp.stranded is False + assert bpp.opposing_strands is False assert bpp.untemplated_seq == '' def test_convert_translocation(self): @@ -379,11 +379,11 @@ def test_convert_inverted_translocation(self): assert bpp.break2.chr == 'X' assert bpp.break1.start == 50294136 assert bpp.break2.start == 153063989 - assert bpp.event_type == None - assert bpp.opposing_strands == False + assert bpp.event_type is None + assert bpp.opposing_strands is False assert bpp.break1.orient == ORIENT.RIGHT assert bpp.break2.orient == ORIENT.LEFT - assert bpp.stranded == False + assert bpp.stranded is False assert bpp.data['tracking_id'] == 'defuse-1' def test_convert_translocation(self): @@ -403,11 +403,11 @@ def test_convert_translocation(self): assert bpp.break2.chr == 'X' assert bpp.break1.start == 50294136 assert bpp.break2.start == 153063989 - assert bpp.event_type == None - assert bpp.opposing_strands == True + assert bpp.event_type is None + assert bpp.opposing_strands is True assert bpp.break1.orient == ORIENT.LEFT assert bpp.break2.orient == ORIENT.LEFT - assert bpp.stranded == False + assert bpp.stranded is False assert bpp.data['tracking_id'] == 'defuse-1' def test_convert_indel(self): @@ -427,11 +427,11 @@ def test_convert_indel(self): assert bpp.break2.chr == '1' assert bpp.break1.start == 1663681 assert bpp.break2.start == 151732089 - assert bpp.event_type == None - assert bpp.opposing_strands == False + assert bpp.event_type is None + assert bpp.opposing_strands is False assert bpp.break1.orient == ORIENT.LEFT assert bpp.break2.orient == ORIENT.RIGHT - assert bpp.stranded == False + assert bpp.stranded is False assert bpp.data['tracking_id'] == 'defuse-1' def test_convert_inversion(self): @@ -451,11 +451,11 @@ def test_convert_inversion(self): assert 
bpp.break2.chr == '1' assert bpp.break1.start == 144898348 assert bpp.break2.start == 235294748 - assert bpp.event_type == None - assert bpp.opposing_strands == True + assert bpp.event_type is None + assert bpp.opposing_strands is True assert bpp.break1.orient == ORIENT.LEFT assert bpp.break2.orient == ORIENT.LEFT - assert bpp.stranded == False + assert bpp.stranded is False assert bpp.data['tracking_id'] == 'defuse-1' @@ -480,10 +480,10 @@ def test_convert_pos_pos(self): print(bpp) assert bpp.break1.start == int(row['end5p']) assert bpp.break2.start == int(row['start3p']) - assert bpp.opposing_strands == False + assert bpp.opposing_strands is False assert bpp.break1.orient == ORIENT.LEFT assert bpp.break2.orient == ORIENT.RIGHT - assert bpp.stranded == False + assert bpp.stranded is False def test_convert_pos_neg(self): row = { @@ -505,10 +505,10 @@ def test_convert_pos_neg(self): print(bpp) assert bpp.break1.start == int(row['end5p']) assert bpp.break2.start == int(row['end3p']) - assert bpp.opposing_strands == True + assert bpp.opposing_strands is True assert bpp.break1.orient == ORIENT.LEFT assert bpp.break2.orient == ORIENT.LEFT - assert bpp.stranded == False + assert bpp.stranded is False def test_convert_neg_pos(self): row = { @@ -530,10 +530,10 @@ def test_convert_neg_pos(self): print(bpp) assert bpp.break1.start == int(row['start5p']) assert bpp.break2.start == int(row['start3p']) - assert bpp.opposing_strands == True + assert bpp.opposing_strands is True assert bpp.break1.orient == ORIENT.RIGHT assert bpp.break2.orient == ORIENT.RIGHT - assert bpp.stranded == False + assert bpp.stranded is False def test_convert_neg_neg(self): row = { @@ -555,10 +555,10 @@ def test_convert_neg_neg(self): print(bpp) assert bpp.break1.start == int(row['start5p']) assert bpp.break2.start == int(row['end3p']) - assert bpp.opposing_strands == False + assert bpp.opposing_strands is False assert bpp.break1.orient == ORIENT.RIGHT assert bpp.break2.orient == ORIENT.LEFT - assert 
bpp.stranded == False + assert bpp.stranded is False class TestPindel: @@ -578,8 +578,8 @@ def test_convert_deletion(self): assert bpp.break1.strand == STRAND.NS assert bpp.break2.orient == ORIENT.RIGHT assert bpp.break2.strand == STRAND.NS - assert bpp.stranded == False - assert bpp.opposing_strands == False + assert bpp.stranded is False + assert bpp.opposing_strands is False def test_convert_insertion(self): row = Mock(chrom='21', pos=9412306, info={'SVTYPE': 'INS'}, stop=9412400, id=None, alts=[]) @@ -597,8 +597,8 @@ def test_convert_insertion(self): assert bpp.break1.strand == STRAND.NS assert bpp.break2.orient == ORIENT.RIGHT assert bpp.break2.strand == STRAND.NS - assert bpp.stranded == False - assert bpp.opposing_strands == False + assert bpp.stranded is False + assert bpp.opposing_strands is False def test_convert_inversion(self): row = Mock(chrom='21', pos=9412306, info={'SVTYPE': 'INV'}, stop=9412400, id=None, alts=[]) @@ -616,8 +616,8 @@ def test_convert_inversion(self): assert bpp.break1.strand == STRAND.NS assert bpp.break2.orient == ORIENT.LEFT assert bpp.break2.strand == STRAND.NS - assert bpp.stranded == False - assert bpp.opposing_strands == True + assert bpp.stranded is False + assert bpp.opposing_strands is True class TestParseBndAlt: @@ -726,7 +726,7 @@ def test_itx(self): assert bpps[0].break2.start == 10546 assert bpps[0].break2.end == 10546 assert bpps[0].break2.orient == ORIENT.LEFT - assert bpps[0].opposing_strands == False + assert bpps[0].opposing_strands is False def test_deletion(self): row = { @@ -750,7 +750,7 @@ def test_deletion(self): assert bpps[0].break2.start == 870225 assert bpps[0].break2.end == 870225 assert bpps[0].break2.orient == ORIENT.RIGHT - assert bpps[0].opposing_strands == False + assert bpps[0].opposing_strands is False def test_inversion(self): row = { @@ -774,7 +774,7 @@ def test_inversion(self): assert bpps[0].break2.start == 13218683 assert bpps[0].break2.end == 13218683 assert bpps[0].break2.orient == 
ORIENT.LEFT - assert bpps[0].opposing_strands == True + assert bpps[0].opposing_strands is True assert bpps[1].event_type == SVTYPE.INV assert bpps[1].break1.start == 13143396 @@ -783,7 +783,7 @@ def test_inversion(self): assert bpps[1].break2.start == 13218683 assert bpps[1].break2.end == 13218683 assert bpps[1].break2.orient == ORIENT.RIGHT - assert bpps[1].opposing_strands == True + assert bpps[1].opposing_strands is True def test_insertion(self): row = { @@ -807,7 +807,7 @@ def test_insertion(self): assert bpps[0].break2.start == 20218060 assert bpps[0].break2.end == 20218060 assert bpps[0].break2.orient == ORIENT.RIGHT - assert bpps[0].opposing_strands == False + assert bpps[0].opposing_strands is False class TestStrelka: diff --git a/tests/unit/test_util.py b/tests/unit/test_util.py index e163349e..4d2ccffa 100644 --- a/tests/unit/test_util.py +++ b/tests/unit/test_util.py @@ -1,17 +1,9 @@ -import os - import pytest from mavis.constants import COLUMNS, ORIENT, STRAND from mavis.error import NotSpecifiedError -from mavis.util import ( - ENV_VAR_PREFIX, - cast, - get_connected_components, - get_env_variable, - read_bpp_from_input_file, -) +from mavis.util import cast, get_connected_components, read_bpp_from_input_file -from .mock import Mock +from ..util import todo class TestGetConnectedComponents: @@ -39,12 +31,12 @@ def test_multiple_components(self): class TestCast: def test_float(self): - assert type(cast('1', float)) == type(1.0) - assert type(cast('1', int)) != type(1.0) + assert type(cast('1', float)) == type(1.0) # noqa: E721 + assert type(cast('1', int)) != type(1.0) # noqa: E721 def test_boolean(self): - assert type(cast('f', bool)) == type(False) - assert type(cast('false', bool)) == type(False) + assert type(cast('f', bool)) == type(False) # noqa: E721 + assert type(cast('false', bool)) == type(False) # noqa: E721 assert not cast('f', bool) assert not cast('false', bool) assert not cast('0', bool) @@ -267,7 +259,7 @@ def test_break1_orient_ns(self, 
tmp_path): assert len(bpps) == 1 assert bpps[0].break1.orient == ORIENT.LEFT - @pytest.mark.skip(reason='TODO') + @todo def test_break2_orient_ns(self, tmp_path): input_file = tmp_path / "inputs.tsv" input_file.write_text( @@ -292,10 +284,6 @@ def test_break2_orient_ns(self, tmp_path): assert len(bpps) == 1 assert bpps[0].break1.orient == ORIENT.LEFT - @pytest.mark.skip(reason='TODO') - def test_both_break_orient_ns(self, tmp_path): - input_file = tmp_path / "inputs.tsv" - def test_base_case(self, tmp_path): input_file = tmp_path / "inputs.tsv" input_file.write_text( @@ -319,7 +307,7 @@ def test_base_case(self, tmp_path): bpps = read_bpp_from_input_file(input_file, expand_strand=False, expand_orient=False) assert len(bpps) == 1 assert bpps[0].break1.orient == ORIENT.RIGHT - assert bpps[0].opposing_strands == True + assert bpps[0].opposing_strands is True def test_unstranded_with_strand_calls(self, tmp_path): input_file = tmp_path / "inputs.tsv" diff --git a/tests/unit/test_validate.py b/tests/unit/test_validate.py index dc2de156..560ca909 100644 --- a/tests/unit/test_validate.py +++ b/tests/unit/test_validate.py @@ -1,15 +1,12 @@ -import unittest - from mavis.constants import ORIENT -from mavis.validate.call import _call_interval_by_flanking_coverage -from mavis.validate.evidence import GenomeEvidence -from mavis.validate.base import Evidence from mavis.interval import Interval +from mavis.validate.base import Evidence +from mavis.validate.call import _call_interval_by_flanking_coverage from .mock import Mock -class CallIntervalByFlankingCoverage(unittest.TestCase): +class CallIntervalByFlankingCoverage: def test_invalid_input_attr(self): pass @@ -22,8 +19,8 @@ def test_left(self): distance=Evidence.distance, traverse=Evidence.traverse, ) - self.assertEqual(110, i.start) - self.assertEqual(180, i.end) + assert i.start == 110 + assert i.end == 180 i = _call_interval_by_flanking_coverage( Mock(start=20, end=80), @@ -33,8 +30,8 @@ def test_left(self): 
distance=Evidence.distance, traverse=Evidence.traverse, ) - self.assertEqual(80, i.start) - self.assertEqual(209, i.end) + assert i.start == 80 + assert i.end == 209 def test_right(self): i = _call_interval_by_flanking_coverage( @@ -45,8 +42,8 @@ def test_right(self): distance=Evidence.distance, traverse=Evidence.traverse, ) - self.assertEqual(101, i.end) - self.assertEqual(31, i.start) + assert i.end == 101 + assert i.start == 31 i = _call_interval_by_flanking_coverage( Mock(start=150, end=200), @@ -56,16 +53,16 @@ def test_right(self): distance=Evidence.distance, traverse=Evidence.traverse, ) - self.assertEqual(11, i.start) - self.assertEqual(150, i.end) + assert i.start == 11 + assert i.end == 150 -class TestDistanceAndTraverse(unittest.TestCase): +class TestDistanceAndTraverse: def test_distance(self): - self.assertEqual(Interval(10), Evidence.distance(1, 11)) + assert Evidence.distance(1, 11) == Interval(10) def test_traverse_right(self): - self.assertEqual(Interval(11), Evidence.traverse(1, 10, ORIENT.RIGHT)) + assert Evidence.traverse(1, 10, ORIENT.RIGHT) == Interval(11) def test_traverse_left(self): - self.assertEqual(Interval(10), Evidence.traverse(20, 10, ORIENT.LEFT)) + assert Evidence.traverse(20, 10, ORIENT.LEFT) == Interval(10) diff --git a/tests/util.py b/tests/util.py index 55db2d11..7cc9eaf9 100644 --- a/tests/util.py +++ b/tests/util.py @@ -1,9 +1,22 @@ import glob import os +import shutil + +import pytest DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') +long_running_test = pytest.mark.skipif( + os.environ.get('RUN_FULL') != '1', + reason='Only running FAST tests subset', +) + +bwa_only = pytest.mark.skipif(not shutil.which('bwa'), reason='missing the command') +blat_only = pytest.mark.skipif(not shutil.which('blat'), reason='missing the command') +todo = pytest.mark.skip(reason='TODO') + + def package_relative_file(*paths): return os.path.abspath(os.path.join(os.path.dirname(__file__), '..', *paths)) From 
e7d9279220d64f826d837e5e7eafd82d3eb0f44d Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 26 Apr 2021 15:33:43 -0700 Subject: [PATCH 026/137] Fix linting errors --- src/tools/find_repeats.py | 44 +++++++++++++++++++++++------- src/tools/generate_ensembl_json.py | 8 +++--- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/src/tools/find_repeats.py b/src/tools/find_repeats.py index d840a037..e96337ce 100644 --- a/src/tools/find_repeats.py +++ b/src/tools/find_repeats.py @@ -15,17 +15,29 @@ def parse_arguments(): """ parser = argparse.ArgumentParser() parser.add_argument( - '-o', '--output', - help='path to the output file', required=True, metavar='FILEPATH' + '-o', '--output', help='path to the output file', required=True, metavar='FILEPATH' ) parser.add_argument( - '-n', '--input', required=True, metavar='FILEPATH', - help='Path to the Input reference genome fasta file' + '-n', + '--input', + required=True, + metavar='FILEPATH', + help='Path to the Input reference genome fasta file', ) parser.add_argument( - '--min_length', default=20, type=int, help='Minimum total length of the repeat region to find', metavar='INT') + '--min_length', + default=20, + type=int, + help='Minimum total length of the repeat region to find', + metavar='INT', + ) parser.add_argument( - '--repeat_seq', default='N', type=str, help='Repeat sequence to look for. Case insensitive', nargs='+') + '--repeat_seq', + default='N', + type=str, + help='Repeat sequence to look for. 
Case insensitive', + nargs='+', + ) args = parser.parse_args() if args.min_length < 2: parser.error('argument --min_length: cannot specify a shorter repeat than 2 bases') @@ -43,7 +55,7 @@ def main(): os.path.basename(__file__), 'input: {}'.format(args.input), 'min_length: {}'.format(args.min_length), - 'repeat_seq: {}'.format(', '.join(args.repeat_seq)) + 'repeat_seq: {}'.format(', '.join(args.repeat_seq)), ] log('writing:', args.output) with open(args.output, 'w') as fh: @@ -61,21 +73,33 @@ def main(): visited.add(seq) spans = [] for repseq in repeat_sequences: - log('finding {}_repeat (min_length: {}), for chr{} (length: {})'.format(repseq, args.min_length, chrom, len(seq))) + log( + 'finding {}_repeat (min_length: {}), for chr{} (length: {})'.format( + repseq, args.min_length, chrom, len(seq) + ) + ) index = 0 while index < len(seq): next_n = seq.find(repseq, index) if next_n < 0: break index = next_n - while index + len(repseq) <= len(seq) and seq[index:index + len(repseq)] == repseq: + while ( + index + len(repseq) <= len(seq) + and seq[index : index + len(repseq)] == repseq + ): index += len(repseq) span = BioInterval(chrom, next_n + 1, index, name='repeat_{}'.format(repseq)) if len(span) >= args.min_length and len(span) >= 2 * len(repseq): spans.append(span) log('found', len(spans), 'spans', time_stamp=False) for span in spans: - fh.write('{}\t{}\t{}\t{}\n'.format(span.reference_object, span.start, span.end, span.name)) + fh.write( + '{}\t{}\t{}\t{}\n'.format( + span.reference_object, span.start, span.end, span.name + ) + ) + if __name__ == '__main__': main() diff --git a/src/tools/generate_ensembl_json.py b/src/tools/generate_ensembl_json.py index 2e554f07..9c44d2f1 100755 --- a/src/tools/generate_ensembl_json.py +++ b/src/tools/generate_ensembl_json.py @@ -438,10 +438,10 @@ def choose_best_transcripts(self): """ Select a canonical transcript for each human gene using Ensembl rules. 
- For human, the canonical transcript for a gene is set according to the following hierarchy: - - 1. Longest CCDS translation with no stop codons. - - 2. If no (1), choose the longest Ensembl/Havana merged translation with no stop codons. - - 3. If no (2), choose the longest translation with no stop codons. + For human, the canonical transcript for a gene is set according to the following hierarchy: + - 1. Longest CCDS translation with no stop codons. + - 2. If no (1), choose the longest Ensembl/Havana merged translation with no stop codons. + - 3. If no (2), choose the longest translation with no stop codons. - 4. If no translation, choose the longest non-protein-coding transcript. See: http://uswest.ensembl.org/Help/Glossary?id=346 From d5b4b140695d937a4f8e564e9605bbb956e60bf1 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 26 Apr 2021 15:36:16 -0700 Subject: [PATCH 027/137] use aligner decorators --- tests/integration/test_align.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_align.py b/tests/integration/test_align.py index 4effd774..0075a32d 100644 --- a/tests/integration/test_align.py +++ b/tests/integration/test_align.py @@ -14,7 +14,7 @@ from mavis.schemas import DEFAULTS from mavis.validate.evidence import GenomeEvidence -from ..util import get_data +from ..util import blat_only, bwa_only, get_data from . 
import MockLongString, MockObject, MockRead REFERENCE_GENOME = None @@ -52,7 +52,7 @@ def test_hardclipping(self): class TestAlign: - @pytest.mark.skipif(not shutil.which('blat'), reason='missing the blat command') + @blat_only def test_blat_contigs(self): ev = GenomeEvidence( Breakpoint('reference3', 1114, orient=ORIENT.RIGHT), @@ -98,7 +98,7 @@ def test_blat_contigs(self): assert alignment.read1.cigar == [(CIGAR.S, 125), (CIGAR.EQ, 120)] assert alignment.read2.cigar == [(CIGAR.S, 117), (CIGAR.EQ, 128)] - @pytest.mark.skipif(not shutil.which('bwa'), reason='missing the command') + @bwa_only def test_bwa_contigs(self): ev = GenomeEvidence( Breakpoint('reference3', 1114, orient=ORIENT.RIGHT), @@ -148,7 +148,7 @@ def test_bwa_contigs(self): assert alignment.read1.cigar == [(CIGAR.S, 125), (CIGAR.EQ, 120)] assert alignment.read2.cigar == [(CIGAR.S, 117), (CIGAR.EQ, 128)] - @pytest.mark.skipif(not shutil.which('blat'), reason='missing the blat command') + @blat_only def test_blat_contigs_deletion(self): ev = GenomeEvidence( Breakpoint('fake', 1714, orient=ORIENT.LEFT), @@ -192,7 +192,7 @@ def test_blat_contigs_deletion(self): assert alignment.read1.reference_start == 1612 assert alignment.read1.cigar == [(CIGAR.EQ, 102), (CIGAR.D, 1253), (CIGAR.EQ, 74)] - @pytest.mark.skipif(not shutil.which('blat'), reason='missing the blat command') + @blat_only def test_blat_contigs_deletion_revcomp(self): ev = GenomeEvidence( Breakpoint('fake', 1714, orient=ORIENT.LEFT), From cf1bf9fe665523908610c45daffa112f0280fafa Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 26 Apr 2021 16:23:15 -0700 Subject: [PATCH 028/137] Update black --- setup.py | 2 +- src/mavis/annotate/genomic.py | 6 +++--- src/mavis/assemble.py | 2 +- src/mavis/blat.py | 2 +- src/mavis/illustrate/elements.py | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 0245e812..ecbc4691 100644 --- a/setup.py +++ b/setup.py @@ -109,7 +109,7 @@ def 
check_nonpython_dependencies(): extras_require={ 'docs': DOC_REQS, 'test': TEST_REQS, - 'dev': ['black', 'flake8'] + DOC_REQS + TEST_REQS + DEPLOY_REQS, + 'dev': ['black==20.8b1', 'flake8'] + DOC_REQS + TEST_REQS + DEPLOY_REQS, 'deploy': DEPLOY_REQS, 'tools': ['pyensembl', 'simplejson'], }, diff --git a/src/mavis/annotate/genomic.py b/src/mavis/annotate/genomic.py index 43246960..9891cebe 100644 --- a/src/mavis/annotate/genomic.py +++ b/src/mavis/annotate/genomic.py @@ -64,7 +64,7 @@ def to_dict(self): class Gene(BioInterval): - """""" + """ """ def __init__(self, chr, start, end, name=None, strand=STRAND.NS, aliases=None, seq=None): """ @@ -161,7 +161,7 @@ def to_dict(self): class Exon(BioInterval): - """""" + """ """ def __init__( self, @@ -275,7 +275,7 @@ def __repr__(self): class PreTranscript(BioInterval): - """""" + """ """ def __init__( self, diff --git a/src/mavis/assemble.py b/src/mavis/assemble.py index 3e16e429..f7a9d301 100644 --- a/src/mavis/assemble.py +++ b/src/mavis/assemble.py @@ -12,7 +12,7 @@ class Contig: - """""" + """ """ def __init__(self, sequence, score): self.seq = sequence diff --git a/src/mavis/blat.py b/src/mavis/blat.py index c8cd325f..6efd0fb2 100644 --- a/src/mavis/blat.py +++ b/src/mavis/blat.py @@ -32,7 +32,7 @@ class Blat: - """""" + """ """ @staticmethod def millibad(row, is_protein=False, is_mrna=True): diff --git a/src/mavis/illustrate/elements.py b/src/mavis/illustrate/elements.py index aa9e90e9..cb79549b 100644 --- a/src/mavis/illustrate/elements.py +++ b/src/mavis/illustrate/elements.py @@ -93,7 +93,7 @@ def draw_exon_track( genomic_max=None, translation=None, ): - """""" + """ """ colors = {} if colors is None else colors main_group = canvas.g(class_='exon_track') From 0244f170c58e70975037a72eada38e4160b82cb6 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 26 Apr 2021 17:48:36 -0700 Subject: [PATCH 029/137] Add singularity comment to docs --- docs/configuration/performance.md | 5 ++--- 
docs/configuration/pipeline.md | 8 ++++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/configuration/performance.md b/docs/configuration/performance.md index e666c10a..a5cb9e7c 100644 --- a/docs/configuration/performance.md +++ b/docs/configuration/performance.md @@ -12,7 +12,7 @@ cpu requirements depending on what the user is trying to analyze. See ## Validation Resources -![](../images/colo829_tumour_validation_resource_req.png) +![validation resources](../images/colo829_tumour_validation_resource_req.png) Resource Requirements (MAVIS 1.8.0) for each validation job of the COLO829 tumour genome. The BAM file for the tumour genome is 127GB. @@ -21,7 +21,6 @@ structural variant validations per job. The effect of number of events validated on both memory and time is plotted above. - ## Annotation Resources Similar trends were observed for the annotation step (see below) with @@ -29,7 +28,7 @@ regards to time elapsed. However the memory requirements remained more constant which is expected since, unlike validation, anntotation does not read more data in for more events. -![](../images/colo829_tumour_annotation_resource_req.png) +![annotation resources](../images/colo829_tumour_annotation_resource_req.png) Resource Requirements (MAVIS 1.8.0) for each annotation job of the COLO829 tumour genome. The events which passed validation (see above) diff --git a/docs/configuration/pipeline.md b/docs/configuration/pipeline.md index d8073a1c..2aab3368 100644 --- a/docs/configuration/pipeline.md +++ b/docs/configuration/pipeline.md @@ -23,3 +23,11 @@ snakemake -j --configfile --profile ``` This will submit a series of jobs with dependencies. + +To use the mavis docker container through singularity, instead of installing mavis via pip, add the +[`--use-singularity`](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#running-jobs-in-containers) +flag. 
+ +```bash +snakemake -j --configfile --profile --use-singularity` +``` From 7f1b806c89b0683abde0eb10918692a473004cef Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 27 Apr 2021 14:26:07 -0700 Subject: [PATCH 030/137] Use config package --- README.md | 91 +++-------------- Snakefile | 18 +--- docs/configuration/pipeline.md | 6 +- docs/install.md | 142 ++++++++++++++++++++++++++ docs/tutorials/full.md | 11 +- docs/tutorials/mini.md | 9 +- setup.py | 2 +- src/mavis/config.py | 92 ++--------------- src/mavis/constants.py | 99 +----------------- src/mavis/main.py | 7 +- src/tools/get_hg19_reference_files.sh | 16 +-- src/tools/get_hg38_reference_files.sh | 21 ++++ 12 files changed, 218 insertions(+), 296 deletions(-) create mode 100644 docs/install.md create mode 100644 src/tools/get_hg38_reference_files.sh diff --git a/README.md b/README.md index 47956ae1..b6e6ef45 100644 --- a/README.md +++ b/README.md @@ -4,10 +4,8 @@
- ![PyPi](https://img.shields.io/pypi/v/mavis.svg) ![build](https://github.com/bcgsc/mavis/workflows/build/badge.svg?branch=master) [![codecov](https://codecov.io/gh/bcgsc/mavis/branch/master/graph/badge.svg)](https://codecov.io/gh/bcgsc/mavis) ![ReadTheDocs](https://readthedocs.org/projects/pip/badge/) - ## About [MAVIS](http://mavis.bcgsc.ca) is python command-line tool for the post-processing of structural variant calls. @@ -39,87 +37,28 @@ Common problems and questions are addressed on the [wiki](https://github.com/bcg If you have a question or issue that is not answered there (or already a github issue) please submit a github issue to our [github page](https://github.com/bcgsc/mavis/issues) or contact us by email at [mavis@bcgsc.ca](mailto:mavis@bcgsc.ca) -## Install Instructions - -There are 3 major steps to setting up and installing [MAVIS](http://mavis.bcgsc.ca). If you are a developer contributing to mavis, please see the [instructions for developers page](https://mavis.readthedocs.io/en/latest/development) instead - -### 1. Install Aligner - -In addition to the python package dependencies, [MAVIS](http://mavis.bcgsc.ca) also requires an aligner to be installed. -Currently the only aligners supported are [blat](https://mavis.readthedocs.io/en/latest/glossary/#blat) and [bwa mem](https://mavis.readthedocs.io/en/latest/glossary/#bwa). -For MAVIS to run successfully the aligner must be installed and accessible on the path. -If you have a non-standard install you may find it useful to edit the PATH environment variable. For example - -``` bash -export PATH=/path/to/directory/containing/blat/binary:$PATH -``` - -[blat](http://mavis.bcgsc.ca/docs/latest/glossary.html#term-blat) is the default aligner. To configure MAVIS to use [bwa mem](http://mavis.bcgsc.ca/docs/latest/glossary.html#term-bwa) as a default instead, use the -[MAVIS environment variables](https://mavis.readthedocs.io/en/latest/configuration/settings/). 
Make sure to specify BOTH of the variables below to change the default aligner. - -``` bash -export MAVIS_ALIGNER='bwa mem' -export MAVIS_ALIGNER_REFERENCE=/path/to/mem/fasta/ref/file -``` - -After this has been installed MAVIS itself can be installed through [pip](https://pypi.org/project/mavis/) - -### 2. Install MAVIS - -#### Install using pip - -The easiest way to install [MAVIS](http://mavis.bcgsc.ca) is through the python package manager, pip. If you do not have python3 installed it can be found [here](https://www.python.org/downloads) - -Ensuring you have a recent version of pip and setuptools will improve the install experience. Older versions of pip and setuptools may have issues with obtaining some of the mavis python dependencies - -``` bash -pip install --upgrade pip setuptools -``` - -or (for Anaconda users) - -``` bash -conda update pip setuptools -``` - -If this is not a clean/new python install it may be useful to set up mavis in a [virtual python environment](https://docs.python.org/3/tutorial/venv.html) - -Then install mavis itself - -``` bash -pip install mavis -``` - -This will install mavis and its python dependencies. +## Getting Started -#### Install using Buildout +The simplest way to use MAVIS is via Singularity. The MAVIS docker container used +by singularity will take care of installing the aligner as well. -Alternatively you can use the [bootstrap/buildout](http://www.buildout.org/en/latest/) to install mavis into bin/mavis - -``` bash -git clone https://github.com/bcgsc/mavis.git -cd mavis -pip install zc.buildout -python bootstrap.py -bin/buildout +```bash +pip install -U setuptools pip +pip install mavis_config # also installs snakemake ``` -This will install mavis and its python dependencies into eggs inside the cloned mavis directory which can be used by simply running bin/mavis +Now you will run mavis via Snakemake as follows -### 3. 
Build or Download Reference Files - -After [MAVIS](http://mavis.bcgsc.ca) is installed the [reference files](https://mavis.readthedocs.io/en/latest/inputs/reference) must be generated (or downloaded) before it can be run. A simple bash script to download the hg19 reference files and generate a MAVIS environment file is provided under mavis/tools for convenience. - -``` bash -cd /path/to/where/you/want/to/put/the/files -wget https://raw.githubusercontent.com/bcgsc/mavis/master/tools/get_hg19_reference_files.sh -bash get_hg19_reference_files.sh -source reference_inputs/hg19_env.sh +```bash +snakemake \ + -j \ + --configfile \ + --use-singularity \ + -s Snakefile ``` -Once the above 3 steps are complete [MAVIS](http://mavis.bcgsc.ca) is ready to be run. -See the MAVIS [tutorial](https://mavis.readthedocs.io/en/latest/tutorials/mini) to learn about running MAVIS. - +For other installation options which do not use docker/singularity see the comprehensive install +instructions in the [user manual](https://mavis.readthedocs.io/en/latest/install) ## Citation diff --git a/Snakefile b/Snakefile index 23c0772e..5501fe9f 100644 --- a/Snakefile +++ b/Snakefile @@ -1,10 +1,11 @@ -from snakemake.utils import validate from snakemake.exceptions import WorkflowError import os from typing import List, Dict import re import json import pandas as pd +from mavis_config import validate_config +from mavis_config.constants import SUBCOMMAND CONTAINER = 'bcgsc/mavis:latest' @@ -15,18 +16,7 @@ INITIALIZED_CONFIG = output_dir('config.json') try: - # TODO: replace with URL so that the user does not need a copy of the config schema - validate( - config, - os.path.join(os.getcwd(), 'src/mavis/schemas/config.json') - ) - for key in [ - "libraries", - "reference.annotations", - "output_dir" - ]: - if key not in config: - raise ValueError(f'missing required property: {key}') + validate_config(config, stage=SUBCOMMAND.SETUP) except Exception as err: short_msg = ' '.join(str(err).split('\n')[:2]) # 
these can get super long raise WorkflowError(short_msg) @@ -85,7 +75,7 @@ rule copy_config: with open(output_dir('config.raw.json'), 'w') as fh: fh.write(json.dumps(config, sort_keys=True, indent=' ')) - +# adds the bam stats and default settings rule init_config: input: rules.copy_config.output output: INITIALIZED_CONFIG diff --git a/docs/configuration/pipeline.md b/docs/configuration/pipeline.md index 2aab3368..a79e16f2 100644 --- a/docs/configuration/pipeline.md +++ b/docs/configuration/pipeline.md @@ -13,13 +13,13 @@ far-left. The most common use case is running the pipeline through snakemake ```bash -snakemake -j --configfile +snakemake -j --configfile -s Snakefile ``` If you are submitting to a cluster, use the [snakemake profiles](https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles) ```bash -snakemake -j --configfile --profile +snakemake -j --configfile --profile -s Snakefile ``` This will submit a series of jobs with dependencies. @@ -29,5 +29,5 @@ To use the mavis docker container through singularity, instead of installing mav flag. ```bash -snakemake -j --configfile --profile --use-singularity` +snakemake -j --configfile --profile --use-singularity -s Snakefile ``` diff --git a/docs/install.md b/docs/install.md new file mode 100644 index 00000000..b3468f7a --- /dev/null +++ b/docs/install.md @@ -0,0 +1,142 @@ +# Install Instructions + +Once the install steps are complete [MAVIS](http://mavis.bcgsc.ca) is ready to be run. +See the MAVIS [tutorial](https://mavis.readthedocs.io/en/latest/tutorials/mini) to learn about running MAVIS. + +For either install option you will want to install the main Snakefile. It is best to use a tag to +specify the version of interest but you can download the latest version from the master branch + +```bash +wget https://raw.githubusercontent.com/bcgsc/mavis/master/Snakefile -O Snakefile +``` + +## Install for Docker/Singularity + +The simplest way to use MAVIS is via Singularity.
The MAVIS docker container used +by singularity will take care of installing the aligner as well. + +```bash +pip install -U setuptools pip +pip install mavis_config # also installs snakemake +``` + +Now you will run mavis via Snakemake as follows + +```bash +snakemake \ + -j \ + --configfile \ + --use-singularity \ + -s Snakefile +``` + +## Install (Python Only) + +MAVIS can also be run with just python. However you will need to install the aligner(s) required +by MAVIS separately and ensure they are available on the default PATH variable when MAVIS is run + +### 1. Install Aligner + +In addition to the python package dependencies, [MAVIS](http://mavis.bcgsc.ca) also requires an aligner to be installed. +Currently the only aligners supported are [blat](https://mavis.readthedocs.io/en/latest/glossary/#blat) and [bwa mem](https://mavis.readthedocs.io/en/latest/glossary/#bwa). +For MAVIS to run successfully the aligner must be installed and accessible on the path. +If you have a non-standard install you may find it useful to edit the PATH environment variable. For example + +``` bash +export PATH=/path/to/directory/containing/blat/binary:$PATH +``` + +[blat](http://mavis.bcgsc.ca/docs/latest/glossary.html#term-blat) is the default aligner. To configure MAVIS to use [bwa mem](http://mavis.bcgsc.ca/docs/latest/glossary.html#term-bwa) it must be specified +in the [config](https://mavis.readthedocs.io/en/latest/configuration/settings/) JSON file. + +After this has been installed MAVIS itself can be installed through [pip](https://pypi.org/project/mavis/) + +### 2. Install MAVIS + +#### Install using pip + +The easiest way to install [MAVIS](http://mavis.bcgsc.ca) is through the python package manager, pip. If you do not have python3 installed it can be found [here](https://www.python.org/downloads) + +Ensuring you have a recent version of pip and setuptools will improve the install experience.
Older versions of pip and setuptools may have issues with obtaining some of the mavis python dependencies + +``` bash +pip install --upgrade pip setuptools +``` + +or (for Anaconda users) + +``` bash +conda update pip setuptools +``` + +If this is not a clean/new python install it may be useful to set up mavis in a [virtual python environment](https://docs.python.org/3/tutorial/venv.html) + +Then install mavis itself + +``` bash +pip install mavis +``` + +This will install mavis and its python dependencies. + +#### Install using Buildout + +Alternatively you can use the [bootstrap/buildout](http://www.buildout.org/en/latest/) to install mavis into bin/mavis + +``` bash +git clone https://github.com/bcgsc/mavis.git +cd mavis +pip install zc.buildout +python bootstrap.py +bin/buildout +``` + +This will install mavis and its python dependencies into eggs inside the cloned mavis directory which can be used by simply running bin/mavis + +Finally you will need to Build/Download the necessary reference files + +## Build or Download Reference Files + +After [MAVIS](http://mavis.bcgsc.ca) is installed the [reference files](https://mavis.readthedocs.io/en/latest/inputs/reference) must be generated (or downloaded) before it can be run. A simple bash script to download the hg19 reference files is provided under mavis/tools for convenience. + +### Download Hg19 Files + +``` bash +cd /path/to/where/you/want/to/put/the/files +wget https://raw.githubusercontent.com/bcgsc/mavis/master/src/tools/get_hg19_reference_files.sh +bash get_hg19_reference_files.sh +``` + +You should now see the reference files in the current directory + +```text +. 
+|-- cytoBand.txt +|-- dgv_hg19_variants.tab +|-- ensembl69_hg19_annotations.json +|-- get_hg19_reference_files.sh +|-- hg19.2bit +|-- hg19.fa +`-- hg19_masking.tab +``` + +### Download Hg38 Files + +``` bash +cd /path/to/where/you/want/to/put/the/files +wget https://raw.githubusercontent.com/bcgsc/mavis/master/src/tools/get_hg38_reference_files.sh +bash get_hg38_reference_files.sh +``` + +You should now see the reference files in the current directory + +```text +. +|-- cytoBand.txt +|-- dgv_hg38_variants.tab +|-- ensembl79_hg38_annotations.json +|-- get_hg38_reference_files.sh +|-- GCA_000001405.15_GRCh38_no_alt_analysis_set.fna +|-- GRCh38_masking.tab +`-- hg38.2bit +``` diff --git a/docs/tutorials/full.md b/docs/tutorials/full.md index 44b187bc..ebcfd290 100644 --- a/docs/tutorials/full.md +++ b/docs/tutorials/full.md @@ -36,13 +36,11 @@ The expected contents are ## Downloading the Reference Inputs -Run the following to download the hg19 reference files and set up the -environment variables for configuring MAVIS +Run the following to download the hg19 reference files ```bash wget https://raw.githubusercontent.com/bcgsc/mavis/master/tools/get_hg19_reference_files.sh bash get_hg19_reference_files.sh -source reference_inputs/hg19_env.sh ``` ## Creating the Config File @@ -173,6 +171,13 @@ Finally you will need to set output directory and the reference files ## Running the Workflow +In order to run the snakemake file you will need to have the config validation module +`mavis_config` installed which has minimal dependencies. + +```bash +pip install mavis_config +``` + You are now ready to run the workflow ```bash diff --git a/docs/tutorials/mini.md b/docs/tutorials/mini.md index 657dac26..27b5f51f 100644 --- a/docs/tutorials/mini.md +++ b/docs/tutorials/mini.md @@ -16,21 +16,22 @@ installed git clone https://github.com/bcgsc/mavis.git git checkout mv mavis/tests . +mv mavis/Snakefile .
rm -r mavis ``` Now you should have a folder called `tests` in your current directory. Since this is a trivial example, it can easily be run locally. However in order to run the snakemake file you will need -to have a copy of the config schema definition file which is included in MAVIS by default. +to have the config validation module `mavis_config` installed which has minimal dependencies. -```text -mavis/schemas/config.json +```bash +pip install mavis_config ``` Now you are ready to run MAVIS. This can be done in a single command using snakemake. ```bash -snakemake -j 1 --configfile=tests/mini-tutorial.config.json +snakemake -j 1 --configfile=tests/mini-tutorial.config.json -s Snakefile ``` Which will run the mini tutorial version and output files into a folder called `output_dir` in the diff --git a/setup.py b/setup.py index ecbc4691..cf458d52 100644 --- a/setup.py +++ b/setup.py @@ -90,7 +90,7 @@ def check_nonpython_dependencies(): 'pyvcf==0.6.8', 'shortuuid>=0.5.0', 'svgwrite', - 'snakemake>=6.1.1, <7', + 'mavis_config==1.0.0', ] DEPLOY_REQS = ['twine', 'm2r', 'wheel'] diff --git a/src/mavis/config.py b/src/mavis/config.py index 480d3f72..38fcb500 100644 --- a/src/mavis/config.py +++ b/src/mavis/config.py @@ -110,91 +110,21 @@ def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, items) -def validate_config(config: Dict, bam_stats: Optional[bool] = False, stage: str = '') -> None: +def add_bamstats_to_config(config: Dict): """ Check that the input JSON config conforms to the expected schema as well as the other relevant checks such as file exsts """ - schema = 'config' if stage != SUBCOMMAND.OVERLAY else 'overlay' - - try: - snakemake_validate( - config, - os.path.join(os.path.dirname(__file__), f'schemas/{schema}.json'), - set_default=True, - ) - except Exception as err: - short_msg = '. 
'.join( - [line for line in str(err).split('\n') if line.strip()][:3] - ) # these can get super long - raise WorkflowError(short_msg) - - required = [] - if ( - stage not in {SUBCOMMAND.CONVERT} - or stage == SUBCOMMAND.CLUSTER - and not config['cluster.uninformative_filter'] - ): - required.append('reference.annotations') - - if stage == SUBCOMMAND.VALIDATE: - required.extend(['reference.aligner_reference', 'reference.reference_genome']) - - for req in required: - if req not in config: - raise WorkflowError(f'missing required property: {req}') - - if schema == 'config': - conversion_dir = os.path.join(config['output_dir'], 'converted_outputs') - # check all assignments are conversions aliases or existing files - for libname, library in config['libraries'].items(): - assignments = [] - for i, assignment in enumerate(library['assign']): - if assignment in config.get('convert', {}): - # replace the alias with the expected output path - converted_output = os.path.join(conversion_dir, f'{assignment}.tab') - assignments.append(converted_output) - elif ( - not os.path.exists(assignment) and os.path.dirname(assignment) != conversion_dir - ): - raise FileNotFoundError(f'cannot find the expected input file {assignment}') - else: - assignments.append(assignment) - library['assign'] = assignments - - if not config['skip_stage.validate'] and stage in { - SUBCOMMAND.VALIDATE, - SUBCOMMAND.SETUP, - }: - if not library.get('bam_file', None) or not os.path.exists(library['bam_file']): - raise FileNotFoundError( - f'missing bam file for library ({libname}), it is a required input when the validate stage is not skipped' - ) - # calculate the bam_stats if the have not been given - missing_stats = any( - [ - col not in library - for col in ['median_fragment_size', 'read_length', 'stdev_fragment_size'] - ] - ) - if missing_stats and bam_stats: - library.update(calculate_bam_stats(config, libname)) - - # expand and check the input files exist for any conversions - for conversion in 
config.get('convert', {}).values(): - expanded = [] - for input_file in conversion['inputs']: - expanded.extend(bash_expands(input_file)) - conversion['inputs'] = expanded - - # make sure all the reference files specified exist and overload with environment variables where applicable - for ref_type in list(config.keys()): - if not ref_type.startswith('reference.'): - continue - expanded = [] - for input_file in config[ref_type]: - expanded.extend(bash_expands(input_file)) - config[ref_type] = expanded + # check all assignments are conversions aliases or existing files + for libname, library in config['libraries'].items(): + # calculate the bam_stats if the have not been given + if any( + [ + col not in library + for col in ['median_fragment_size', 'read_length', 'stdev_fragment_size'] + ] + ): + library.update(calculate_bam_stats(config, libname)) def get_metavar(arg_type): diff --git a/src/mavis/constants.py b/src/mavis/constants.py index 89e75e64..fc27c03d 100644 --- a/src/mavis/constants.py +++ b/src/mavis/constants.py @@ -10,6 +10,7 @@ from Bio.Alphabet.IUPAC import ambiguous_dna from Bio.Data.IUPACData import ambiguous_dna_values from Bio.Seq import Seq +from mavis_config.constants import SUBCOMMAND, MavisNamespace PROGNAME: str = 'mavis' EXIT_OK: int = 0 @@ -17,89 +18,6 @@ EXIT_INCOMPLETE: int = 2 -class EnumType(type): - def __contains__(cls, member): - return member in cls.values() - - def __getitem__(cls, item): - for k, v in cls.items(): - if k == item: - return v - raise KeyError(item) - - def __iter__(cls): - """ - Returns members in definition order. 
- """ - return cls.values() - - -class MavisNamespace(metaclass=EnumType): - @classmethod - def items(cls): - return [(k, v) for k, v in cls.__dict__.items() if not k.startswith('_')] - - @classmethod - def to_dict(cls): - return dict(cls.items()) - - @classmethod - def keys(cls): - return [k for k, v in cls.items()] - - @classmethod - def values(cls): - return [v for k, v in cls.items()] - - @classmethod - def enforce(cls, value): - """ - checks that the current namespace has a given value - - Returns: - the input value - - Raises: - KeyError: the value did not exist - - Example: - >>> nspace.enforce(1) - 1 - >>> nspace.enforce(3) - Traceback (most recent call last): - .... - """ - if value not in cls.values(): - raise KeyError('value {0} is not a valid member of '.format(repr(value)), cls.values()) - return value - - @classmethod - def reverse(cls, value): - """ - for a given value, return the associated key - - Args: - value: the value to get the key/attribute name for - - Raises: - KeyError: the value is not unique - KeyError: the value is not assigned - - Example: - >>> nspace.reverse(1) - 'thing' - """ - result = [] - for key in cls.keys(): - if cls[key] == value: - result.append(key) - if len(result) > 1: - raise KeyError('could not reverse, the mapping is not unique', value, result) - elif not result: - raise KeyError('input value is not assigned to a key', value) - return result[0] - - def float_fraction(num): """ cast input to a float @@ -147,21 +65,6 @@ class SPLICE_TYPE(MavisNamespace): """Filename for all complete stamp files""" -class SUBCOMMAND(MavisNamespace): - """ - holds controlled vocabulary for allowed pipeline stage values - """ - - ANNOTATE: str = 'annotate' - VALIDATE: str = 'validate' - CLUSTER: str = 'cluster' - PAIR: str = 'pairing' - SUMMARY: str = 'summary' - CONVERT: str = 'convert' - OVERLAY: str = 'overlay' - SETUP: str = 'setup' - - CODON_SIZE: int = 3 """the number of bases making up a codon""" diff --git a/src/mavis/main.py 
b/src/mavis/main.py index 5b69fff3..abedaf82 100644 --- a/src/mavis/main.py +++ b/src/mavis/main.py @@ -8,6 +8,8 @@ import time from typing import Dict +from mavis_config import validate_config + from . import __version__ from . import config as _config from . import util as _util @@ -194,9 +196,8 @@ def main(argv=None): if args.command != SUBCOMMAND.CONVERT: with open(args.config, 'r') as fh: config = json.load(fh) - _config.validate_config( + validate_config( config, - args.command in {SUBCOMMAND.SETUP, SUBCOMMAND.VALIDATE}, args.command, ) except AttributeError as err: @@ -263,6 +264,8 @@ def main(argv=None): args.assume_no_untemplated, ) elif command == SUBCOMMAND.SETUP: + # add bam stats to the config if missing + _config.add_bamstats_to_config(config) _util.LOG(f'writing: {args.outputfile}') with open(args.outputfile, 'w') as fh: fh.write(json.dumps(config, sort_keys=True, indent=' ')) diff --git a/src/tools/get_hg19_reference_files.sh b/src/tools/get_hg19_reference_files.sh index 0f4f91c4..3fb40f46 100644 --- a/src/tools/get_hg19_reference_files.sh +++ b/src/tools/get_hg19_reference_files.sh @@ -1,14 +1,9 @@ -mkdir reference_inputs -cd reference_inputs - -ENV_FILE=hg19_env.sh +set -euo pipefail echo "downloading the reference genome file" wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz tar -xvzf chromFa.tar.gz -CWD=$( pwd ) - # concatenate the chromosome fa files into a single file for fname in chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y}.fa do @@ -17,27 +12,20 @@ done # Clean up the non concatenated and alt chromosome files rm -f chr*.fa -echo export MAVIS_REFERENCE_GENOME="${CWD}/hg19.fa" >> $ENV_FILE +rm -f chromeFa.tar.gz echo "downloading the gene annotations file" wget http://www.bcgsc.ca/downloads/mavis/ensembl69_hg19_annotations.json -echo export MAVIS_ANNOTATIONS="${CWD}/ensembl69_hg19_annotations.json" >> $ENV_FILE echo "downloading the masking file" wget 
http://www.bcgsc.ca/downloads/mavis/hg19_masking.tab -echo export MAVIS_MASKING="${CWD}/hg19_masking.tab" >> $ENV_FILE echo "downloading the dgv annotation file" wget http://www.bcgsc.ca/downloads/mavis/dgv_hg19_variants.tab -echo export MAVIS_DGV_ANNOTATION="${CWD}/dgv_hg19_variants.tab" >> $ENV_FILE echo "downloading the aligner reference file" wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit -echo export MAVIS_ALIGNER_REFERENCE="${CWD}/hg19.2bit" >> $ENV_FILE echo "downloading the template metadata file" wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz gunzip cytoBand.txt.gz -echo export MAVIS_TEMPLATE_METADATA="${CWD}/cytoBand.txt" >> $ENV_FILE - -echo "Source $CWD/$ENV_FILE prior to running MAVIS to set MAVIS default arguments" diff --git a/src/tools/get_hg38_reference_files.sh b/src/tools/get_hg38_reference_files.sh new file mode 100644 index 00000000..97c1face --- /dev/null +++ b/src/tools/get_hg38_reference_files.sh @@ -0,0 +1,21 @@ +set -euo pipefail + +echo "downloading the reference genome (no alt) file" +wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz +gunzip GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz + +echo "downloading the gene annotations file" +wget http://www.bcgsc.ca/downloads/mavis/ensembl79_hg38_annotations.json + +echo "downloading the masking file" +wget http://www.bcgsc.ca/downloads/mavis/GRCh38_masking.tab + +echo "downloading the dgv annotation file" +wget http://www.bcgsc.ca/downloads/mavis/dgv_hg38_variants.tab + +echo "downloading the aligner reference file" +wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit + +echo "downloading the template metadata file" +wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz +gunzip cytoBand.txt.gz From 078846fa31502da476b65075780572bb817b7667 Mon Sep 17 00:00:00 2001 From: 
Caralyn Reisle Date: Tue, 27 Apr 2021 14:43:26 -0700 Subject: [PATCH 031/137] Add docker publish job to tag workflow --- .github/workflows/publish.yml | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 35bf9a9d..3b19260f 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -8,10 +8,8 @@ on: types: [created] jobs: - deploy: - + pypi: runs-on: ubuntu-latest - steps: - uses: actions/checkout@v2 - name: Set up Python @@ -30,3 +28,15 @@ jobs: python setup.py sdist bdist_wheel install twine check dist/* twine upload dist/* + docker: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - run: docker login -u $DOCKER_USER -p $DOCKER_PASSWORD + env: + DOCKER_USER: ${{ secrets.DOCKER_USER }} + DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + - run: | + docker build --file Dockerfile --tag bcgsc/mavis:latest --tag bcgsc/mavis:${{ github.event.release.tag_name }} . 
+ - run: docker push bcgsc/mavis:latest + - run: docker push bcgsc/mavis:${{ github.event.release.tag_name }} From d5d131ba1aaf7fefce648479b00e831326bb3252 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 27 Apr 2021 15:30:26 -0700 Subject: [PATCH 032/137] Add tests for running docker image with singularity --- .github/workflows/build.yml | 31 +++++++++++ .github/workflows/quick-tests.yml | 13 +---- Dockerfile | 4 ++ README.md | 2 +- Snakefile | 61 ++++++++++++++++++--- docs/install.md | 2 +- docs/tutorials/mini.md | 2 +- tests/integration/test_annotate_examples.py | 3 + tests/snakemake/test_mini_workflow.py | 3 +- 9 files changed, 97 insertions(+), 24 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e8107623..a328130c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -72,3 +72,34 @@ jobs: name: codecov-umbrella fail_ci_if_error: true if: matrix.python-version == 3.8 + docker: + runs-on: ubuntu-latest + name: docker build + steps: + - uses: actions/checkout@v2 + - name: build the docker container + run: | + docker build --file Dockerfile --tag bcgsc/mavis:latest . 
+ - name: test the help menu + run: | + docker run bcgsc/mavis -h + - name: Set up Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7 + - name: Install workflow dependencies + run: | + python -m pip install --upgrade pip setuptools wheel + pip install mavis_config pandas + - uses: eWaterCycle/setup-singularity@v6 + with: + singularity-version: 3.6.4 + - name: docker2singularity + run: + docker run --mount type=bind,source=/var/run/docker.sock,target=/var/run/docker.sock --mount type=bind,source="$(pwd)",target=/output --privileged -t --rm singularityware/docker2singularity bcgsc/mavis:latest + - name: Run analysis with snakemake & singularity + run: | + # get the SIMG filename + export SNAKEMAKE_CONTAINER=$(ls *mavis*.simg) + snakemake -j 2 --configfile tests/mini-tutorial.config.json --use-singularity + if: always() diff --git a/.github/workflows/quick-tests.yml b/.github/workflows/quick-tests.yml index 689ea8fb..c74ae1be 100644 --- a/.github/workflows/quick-tests.yml +++ b/.github/workflows/quick-tests.yml @@ -20,7 +20,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip setuptools + python -m pip install --upgrade pip setuptools wheel pip install .[test] - name: Lint with flake8 run: | @@ -51,14 +51,3 @@ jobs: --durations=10 env: RUN_FULL: 0 - docker: - runs-on: ubuntu-latest - name: docker build - steps: - - uses: actions/checkout@v2 - - name: build the docker container - run: | - docker build --file Dockerfile --tag bcgsc/mavis . - - name: test the help menu - run: | - docker run bcgsc/mavis -h diff --git a/Dockerfile b/Dockerfile index 3f2ef284..8b93e0c0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,6 +17,10 @@ RUN git clone https://github.com/lh3/bwa.git && \ cd .. 
&& \ mv bwa/bwa /usr/local/bin + +# install blat dependencies +RUN apt-get install -y libcurl4 + # install blat RUN wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat && \ chmod a+x blat && \ diff --git a/README.md b/README.md index b6e6ef45..c3f18975 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ by singularity will take care of installing the aligner as well. ```bash pip install -U setuptools pip -pip install mavis_config # also installs snakemake +pip install mavis_config pandas # also installs snakemake ``` Now you will run mavis via Snakemake as follows diff --git a/Snakefile b/Snakefile index 5501fe9f..b9104212 100644 --- a/Snakefile +++ b/Snakefile @@ -3,16 +3,19 @@ import os from typing import List, Dict import re import json -import pandas as pd from mavis_config import validate_config from mavis_config.constants import SUBCOMMAND -CONTAINER = 'bcgsc/mavis:latest' +# env variable mainly for CI/CD +CONTAINER = os.environ.get('SNAKEMAKE_CONTAINER', 'docker://bcgsc/mavis:latest') +MAX_TIME = 57600 +DEFAULT_MEMORY_MB = 16000 def output_dir(*paths): return os.path.join(config['output_dir'], *paths) INITIALIZED_CONFIG = output_dir('config.json') +LOG_DIR = output_dir('logs') try: @@ -27,6 +30,8 @@ CLUSTER_OUTPUT = output_dir('{library}/cluster/batch-{job_id}.tab') # create the cluster inputs and guess the cluster sizes def count_total_rows(filenames): + import pandas as pd + row_count = 0 for filename in filenames: df = pd.read_csv(filename, sep='\t').drop_duplicates() @@ -71,6 +76,11 @@ rule all: rule copy_config: output: output_dir('config.raw.json') + resources: + time_limit=MAX_TIME, + mem_mb=4000, + cpus=1, + log_dir=LOG_DIR run: with open(output_dir('config.raw.json'), 'w') as fh: fh.write(json.dumps(config, sort_keys=True, indent=' ')) @@ -80,19 +90,29 @@ rule init_config: input: rules.copy_config.output output: INITIALIZED_CONFIG container: CONTAINER + resources: + time_limit=MAX_TIME, + mem_mb=DEFAULT_MEMORY_MB, + cpus=1, + 
log_dir=LOG_DIR shell: 'mavis setup --config {input} --outputfile {output}' rule convert: output: output_dir('converted_outputs/{alias}.tab') input: rules.init_config.output - log: output_dir('converted_outputs/snakemake.{alias}.log.txt') + log: os.path.join(LOG_DIR, 'convert.snakemake.{alias}.log.txt') params: file_type=lambda w: config['convert'][w.alias]['file_type'], strand_specific=lambda w: config['convert'][w.alias]['strand_specific'], assume_no_untemplated=lambda w: config['convert'][w.alias]['assume_no_untemplated'], input_files=lambda w: config['convert'][w.alias]['inputs'] container: CONTAINER + resources: + time_limit=MAX_TIME, + mem_mb=DEFAULT_MEMORY_MB, + cpus=1, + log_dir=LOG_DIR shell: 'mavis convert --file_type {params.file_type}' + ' --strand_specific {params.strand_specific}' @@ -118,8 +138,13 @@ rule cluster: input: files=get_cluster_inputs, config=rules.init_config.output output: directory(output_dir('{library}/cluster')) - log: output_dir('snakemake.cluster.{library}.log.txt') + log: os.path.join(LOG_DIR, 'snakemake.cluster.{library}.log.txt') container: CONTAINER + resources: + time_limit=MAX_TIME, + mem_mb=DEFAULT_MEMORY_MB, + cpus=1, + log_dir=LOG_DIR shell: 'mavis cluster --config {input.config}' + ' --library {wildcards.library}' @@ -135,8 +160,13 @@ if not config['skip_stage.validate']: dirname=lambda w: output_dir(f'{w.library}/validate/batch-{w.job_id}'), inputfile=lambda w: expand(CLUSTER_OUTPUT, library=[w.library], job_id=[w.job_id]) output: VALIDATE_OUTPUT - log: output_dir('{library}/validate/snakemake.batch-{job_id}.log.txt') + log: os.path.join(LOG_DIR, '{library}.validate.snakemake.batch-{job_id}.log.txt') container: CONTAINER + resources: + time_limit=MAX_TIME, + mem_mb=18000, + cpus=2, + log_dir=LOG_DIR shell: 'mavis validate --config {rules.init_config.output}' + ' --library {wildcards.library}' @@ -149,8 +179,13 @@ rule annotate: input: rules.validate.output if not config['skip_stage.validate'] else rules.cluster.output 
output: stamp=output_dir('{library}/annotate/batch-{job_id}/MAVIS.COMPLETE'), result=output_dir('{library}/annotate/batch-{job_id}/annotations.tab') - log: output_dir('{library}/annotate/snakemake.batch-{job_id}.log.txt') + log: os.path.join(LOG_DIR, '{library}.annotate.snakemake.batch-{job_id}.log.txt') container: CONTAINER + resources: + time_limit=MAX_TIME, + mem_mb=DEFAULT_MEMORY_MB, + cpus=2, + log_dir=LOG_DIR shell: 'mavis annotate --config {rules.init_config.output}' + ' --library {wildcards.library}' @@ -165,8 +200,13 @@ rule pairing: result=output_dir('pairing/mavis_paired.tab') params: dirname=output_dir('pairing') - log: output_dir('snakemake.pairing.log.txt') + log: os.path.join(LOG_DIR, output_dir('snakemake.pairing.log.txt')) container: CONTAINER + resources: + time_limit=MAX_TIME, + mem_mb=DEFAULT_MEMORY_MB, + cpus=1, + log_dir=LOG_DIR shell: 'mavis pairing --config {rules.init_config.output}' + ' --inputs {input}' @@ -179,8 +219,13 @@ rule summary: output: output_dir('summary/MAVIS.COMPLETE') params: dirname=output_dir('summary') - log: output_dir('snakemake.summary.log.txt') + log: os.path.join(LOG_DIR, 'snakemake.summary.log.txt') container: CONTAINER + resources: + time_limit=MAX_TIME, + mem_mb=DEFAULT_MEMORY_MB, + cpus=1, + log_dir=LOG_DIR shell: 'mavis summary --config {rules.init_config.output}' + ' --inputs {input}' diff --git a/docs/install.md b/docs/install.md index b3468f7a..badd817b 100644 --- a/docs/install.md +++ b/docs/install.md @@ -17,7 +17,7 @@ by singularity will take care of installing the aligner as well. ```bash pip install -U setuptools pip -pip install mavis_config # also installs snakemake +pip install mavis_config pandas # also installs snakemake ``` Now you will run mavis via Snakemake as follows diff --git a/docs/tutorials/mini.md b/docs/tutorials/mini.md index 27b5f51f..37a8a6ec 100644 --- a/docs/tutorials/mini.md +++ b/docs/tutorials/mini.md @@ -25,7 +25,7 @@ example, it can easily be run locally. 
However in order to run the snakemake fil to have the config validation module `mavis_config` installed which has minimal dependencies. ```bash -pip install mavis_config +pip install mavis_config pandas ``` Now you are ready to run MAVIS. This can be done in a single command using snakemake. diff --git a/tests/integration/test_annotate_examples.py b/tests/integration/test_annotate_examples.py index f6ed15ee..faf0297b 100644 --- a/tests/integration/test_annotate_examples.py +++ b/tests/integration/test_annotate_examples.py @@ -9,6 +9,7 @@ from mavis.breakpoint import Breakpoint, BreakpointPair from mavis.constants import ORIENT, PROTOCOL, SPLICE_TYPE, STRAND, SVTYPE +from ..util import long_running_test from . import MockLongString, MockObject, get_example_genes @@ -99,6 +100,7 @@ def test_small_duplication(self): class TestSVEP1: + @long_running_test def test_annotate_small_intronic_inversion(self): gene = get_example_genes()['SVEP1'] reference_annotations = {gene.chr: [gene]} @@ -129,6 +131,7 @@ def test_annotate_small_intronic_inversion(self): assert len(ann.fusion.transcripts) == 1 assert ann.fusion.transcripts[0].get_seq() == refseq + @long_running_test def test_build_single_transcript_inversion(self): gene = get_example_genes()['SVEP1'] reference_genome = { diff --git a/tests/snakemake/test_mini_workflow.py b/tests/snakemake/test_mini_workflow.py index 1125cf3e..2751caac 100644 --- a/tests/snakemake/test_mini_workflow.py +++ b/tests/snakemake/test_mini_workflow.py @@ -9,7 +9,7 @@ from snakemake import main as snakemake_main -from ..util import glob_exists, package_relative_file +from ..util import glob_exists, long_running_test, package_relative_file @pytest.fixture @@ -27,6 +27,7 @@ def output_dir(): shutil.rmtree(temp_output) +@long_running_test def test_workflow(output_dir): argv = [ 'snakemake', From 43153b5a23ccad395c287a8f8d2e103fcc2e7f9b Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Fri, 30 Apr 2021 09:38:07 -0700 Subject: [PATCH 033/137] Use 
config pkg helpers --- Snakefile | 56 ++++++++++++++++++++++------------------------- src/mavis/util.py | 6 ++--- 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/Snakefile b/Snakefile index b9104212..51504c5a 100644 --- a/Snakefile +++ b/Snakefile @@ -1,9 +1,13 @@ from snakemake.exceptions import WorkflowError import os -from typing import List, Dict -import re import json -from mavis_config import validate_config +from mavis_config import ( + count_total_rows, + get_library_inputs, + get_singularity_bindings, + guess_total_batches, + validate_config, +) from mavis_config.constants import SUBCOMMAND # env variable mainly for CI/CD @@ -11,12 +15,21 @@ CONTAINER = os.environ.get('SNAKEMAKE_CONTAINER', 'docker://bcgsc/mavis:latest') MAX_TIME = 57600 DEFAULT_MEMORY_MB = 16000 + +if 'output_dir' not in config: + raise WorkflowError('output_dir is a required property of the configfile') + + def output_dir(*paths): return os.path.join(config['output_dir'], *paths) + INITIALIZED_CONFIG = output_dir('config.json') LOG_DIR = output_dir('logs') +# external schedulers will not create the log dir if it does not already exist +if not os.path.exists(LOG_DIR): + os.makedirs(LOG_DIR, exist_ok=True) try: validate_config(config, stage=SUBCOMMAND.SETUP) @@ -24,42 +37,23 @@ except Exception as err: short_msg = ' '.join(str(err).split('\n')[:2]) # these can get super long raise WorkflowError(short_msg) +# ADD bindings for singularity +print(workflow.singularity_args) +workflow.singularity_args = f'-B {",".join(get_singularity_bindings(config))}' + libraries = sorted(list(config['libraries'])) VALIDATE_OUTPUT = output_dir('{library}/validate/batch-{job_id}/validation-passed.tab') CLUSTER_OUTPUT = output_dir('{library}/cluster/batch-{job_id}.tab') -# create the cluster inputs and guess the cluster sizes -def count_total_rows(filenames): - import pandas as pd - - row_count = 0 - for filename in filenames: - df = pd.read_csv(filename, sep='\t').drop_duplicates() - row_count 
+= df.shape[0] - return row_count - for library in libraries: - lib_config = config['libraries'][library] - if 'total_batches' in lib_config: + if 'total_batches' in config['libraries'][library]: continue - inputs = [] - for assignment in lib_config['assign']: - if assignment in config['convert']: - inputs.extend(config['convert'][assignment]['inputs']) - else: - inputs.append(assignment) # if not input by user, estimate the clusters based on the input files - max_files = config['cluster.max_files'] - min_rows = config['cluster.min_clusters_per_file'] - total_rows = count_total_rows(inputs) - - if round(total_rows / max_files) >= min_rows: - # use max number of jobs - lib_config['total_batches'] = max_files - else: - lib_config['total_batches'] = total_rows // min_rows + config['libraries'][library]['total_batches'] = guess_total_batches( + config, get_library_inputs(config, library) + ) libs_args = [] @@ -81,6 +75,7 @@ rule copy_config: mem_mb=4000, cpus=1, log_dir=LOG_DIR + log: os.path.join(LOG_DIR, 'copy_config.snakemake.log.txt') run: with open(output_dir('config.raw.json'), 'w') as fh: fh.write(json.dumps(config, sort_keys=True, indent=' ')) @@ -90,6 +85,7 @@ rule init_config: input: rules.copy_config.output output: INITIALIZED_CONFIG container: CONTAINER + log: os.path.join(LOG_DIR, 'init_config.snakemake.log.txt') resources: time_limit=MAX_TIME, mem_mb=DEFAULT_MEMORY_MB, diff --git a/src/mavis/util.py b/src/mavis/util.py index 4a22984a..8a3f2d41 100644 --- a/src/mavis/util.py +++ b/src/mavis/util.py @@ -282,7 +282,7 @@ def output_tabbed_file(bpps: List[BreakpointPair], filename: str, header=None): row = row.flatten() rows.append(row) if not custom_header: - header.update(row.keys()) + header.update(row.keys()) # type: ignore header = sort_columns(header) LOG('writing:', filename) df = pd.DataFrame.from_records(rows, columns=header) @@ -449,7 +449,7 @@ def soft_null_cast(value): return [] for col in required_columns: - if col not in df: + if col not in df 
and col not in add_default: raise KeyError(f'missing required column: {col}') # run the custom functions @@ -500,7 +500,7 @@ def soft_null_cast(value): ]: for col in cols: if col in df: - df[col].apply(lambda c: vocab.enforce(c)) + df[col].apply(lambda c: vocab.enforce(c)) # type: ignore elif hasattr(vocab, 'NS'): df[col] = vocab.NS # type: ignore From 135c977185790f2b92050e83c2269519981bd210 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Fri, 30 Apr 2021 09:38:34 -0700 Subject: [PATCH 034/137] Make dir for reference inputs --- docs/tutorials/full.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/tutorials/full.md b/docs/tutorials/full.md index ebcfd290..dc1828a9 100644 --- a/docs/tutorials/full.md +++ b/docs/tutorials/full.md @@ -39,8 +39,11 @@ The expected contents are Run the following to download the hg19 reference files ```bash -wget https://raw.githubusercontent.com/bcgsc/mavis/master/tools/get_hg19_reference_files.sh +wget https://raw.githubusercontent.com/bcgsc/mavis/master/src/tools/get_hg19_reference_files.sh +mkdir reference_inputs +cd reference_inputs bash get_hg19_reference_files.sh +cd .. ``` ## Creating the Config File From f1383ecd3612b91e5534b326eb3f2a5606d8b1b4 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Fri, 30 Apr 2021 10:53:42 -0700 Subject: [PATCH 035/137] Lint md file --- docs/background/theory.md | 98 +++++++++-------------------------- docs/inputs/standard.md | 72 ++++++++++++------------- docs/outputs/illustrations.md | 12 ++--- 3 files changed, 61 insertions(+), 121 deletions(-) diff --git a/docs/background/theory.md b/docs/background/theory.md index ac4ab2a7..d63210a4 100644 --- a/docs/background/theory.md +++ b/docs/background/theory.md @@ -6,26 +6,20 @@ In MAVIS structural variants (SVs) are defined by a pair of breakpoints And a breakpoint is defined by -1. chromosome -2. base-pair range (start, end). This has a length of 1 for exact calls - and more for uncertain/non-specific calls -3. 
[orientation](../../glossary/#orientation). This is Left or Right - with respect to the positive/forward strand. This defines which - portion of the genome is 'retained' -4. [strand](../../glossary/#strand). (only applicable to - stranded transcriptome libraries) +1. chromosome +2. base-pair range (start, end). This has a length of 1 for exact calls and more for uncertain/non-specific calls +3. [orientation](../../glossary/#orientation). This is Left or Right with respect to the positive/forward strand. This defines which portion of the genome is 'retained' +4. [strand](../../glossary/#strand). (only applicable to stranded transcriptome libraries) So then a breakpoint pair is any two intervals on the reference genome which are adjacent in the mutant genome - - ## Evidence There are many ways that single reads or paired-end reads can act as support for an SV call. -![](../images/read_evidence.svg) +![read evidence](../images/read_evidence.svg) In the figure above the red rectangle represents a deletion structural variant. The arrows are types of single or paired-end reads supporting @@ -58,7 +52,7 @@ For a deletion, we expect the flanking reads to be in the normal orientation but that the fragment size should be abnormal (for large deletions). -![](../images/read_pairs_deletion.svg) +![deletion supporting read pairs](../images/read_pairs_deletion.svg) Flanking read pair evidence for a deletion event. the read pairs will have a larger than expected fragment size when mapped to the reference @@ -71,10 +65,9 @@ on the positive strand and the second read in the pair would be on the negative/reverse strand. - #### Insertion -![](../images/read_pairs_insertion.svg) +![insertion supporting read pairs](../images/read_pairs_insertion.svg) Flanking read pair evidence for an insertion event. 
The read pairs will have a smaller than expected fragment size when mapped to the @@ -87,10 +80,9 @@ on the positive strand and the second read in the pair would be on the negative/reverse strand. - #### Duplication -![](../images/read_pairs_duplication.svg) +![duplication support read pairs](../images/read_pairs_duplication.svg) Flanking read pair evidence for a tandem duplication event. The read pairs will have an abnormal orientation but still the same strands as @@ -99,57 +91,48 @@ strand and have a right orientation. (B2) The second breakpoint will be on the positive strand and have a left orientation. - #### Inversion -![](../images/read_pairs_inversion_LL.svg) +![inversion supporting read pairs](../images/read_pairs_inversion_LL.svg) Flanking read pair evidence for an inversion. Both breakpoints have left orientation. - -![](../images/read_pairs_inversion_RR.svg) +![inversion supporting read pairs](../images/read_pairs_inversion_RR.svg) Flanking read pair evidence for an inversion. Both breakpoints have right orientation. - #### Translocation -![](../images/read_pairs_translocation_LR.svg) +![translocation supporting read pairs](../images/read_pairs_translocation_LR.svg) Flanking read pair evidence for a translocation. (B1) the first breakpoint with a left orientation. (B2) the second breakpoint with a right orientation. - -![](../images/read_pairs_translocation_RL.svg) +![translocation supporting read pairs](../images/read_pairs_translocation_RL.svg) Flanking read pair evidence for a translocation. (B1) the first breakpoint with a right orientation. (B2) the second breakpoint with a left orientation. - #### Inverted Translocation -![](../images/read_pairs_translocated_inversion_LL.svg) +![translocation supporting read pairs](../images/read_pairs_translocated_inversion_LL.svg) Flanking read pair evidence for an inverted translocation. Both breakpoints have left orientation. 
- -![](../images/read_pairs_translocated_inversion_RR.svg) +![translocation supporting read pairs](../images/read_pairs_translocated_inversion_RR.svg) Flanking read pair evidence for an inverted translocation. Both breakpoints have right orientation. - - - ### Compatible Flanking Pairs For insertion and duplication events compatible flanking pairs are @@ -158,7 +141,7 @@ be used as compatible flanking evidence for an insertion (in the same region) and similarly flanking pairs which support an insertion may be compatible flanking evidence for a duplication -![](../images/compatible_flanking_pairs.svg) +![compatible flanking pairs](../images/compatible_flanking_pairs.svg) The event depicted above may be called as either a duplication or an insertion (depending on the input call). If the even were called as a @@ -167,30 +150,24 @@ reads in blue would be given as compatible flanking support. If the event were called as an insertion the reverse would apply. - - - ### Calculating the Evidence Window -![](../images/read_pair_definitions.svg) +![read pair defn terms](../images/read_pair_definitions.svg) Basic Terms used in describing read pairs are shown above: fragment size: the distance between the pair; read length: the length of the read; fragment size: the combined length of both reads and the fragment size - We make some base assumptions with regards to paired-end read data: !!! note the distribution of fragment sizes approximately follows a normal distribution - !!! note the most common fragment size is the unmutated 'normal' fragment - With the above assumptions we take the median fragment size to be the expected normal. @@ -217,7 +194,7 @@ stdev = math.sqrt(sum(X) / len(X)) This gives us an idea of when to judge an fragment size as abnormal and where we expect our normal read pairs fragment sizes to fall. 
-![](../images/fragment_sizes_histogram.svg) +![read pair fragment size histogram](../images/fragment_sizes_histogram.svg) Distribution of fragment sizes (absolute values) of proper read pairs. The black curve representings the fit for a normal distribution using @@ -227,14 +204,13 @@ thick vertical black line is the median and the thin black lines are standard deviations away from the median. - As we can see from the diagram above, removing the outliers reproduces the observed distribution better than using all data points We use this in two ways -1. to find flanking evidence supporting deletions and insertions -2. to estimate the window size for where we will need to read from the +1. to find flanking evidence supporting deletions and insertions +2. to estimate the window size for where we will need to read from the bam when looking for evidence for a given event The @@ -250,8 +226,6 @@ complicated and we must take into account the possible annotations when calculating the evidence window. see `mavis.validate.evidence.TranscriptomeEvidence._generate_window` for more - - ### Calling Breakpoints by Flanking Evidence Breakpoints are called by contig, split-read, or flanking pairs @@ -273,8 +247,6 @@ outline, no fill) demonstrates the read length used to narrow the right side bound of the [estimated breakpoint interval. - - ### Determining Flanking support ![flanking support](../images/flanking_pairs_fragment_sizes_deletion.svg) @@ -290,7 +262,6 @@ The shaded portion of the graph represents the range in fragment sizes we expect for flanking pairs supporting the deletion event. - ## Classifying Events The following decision tree is used in classifying events based on their @@ -317,8 +288,6 @@ reverse complement are assembled into contigs using a [DeBruijn graph](../../glossary/#debruijn-graph). For strand specific events, we then attempt to resolve the sequence strand of the contig. 
- - ## Annotating Events We make the following assumptions when determining the annotations for @@ -328,15 +297,12 @@ each event If both breakpoints are in the same gene, they must also be in the same transcript - !!! note If the breakpoint intervals overlap we do not annotate encompassed genes - !!! note Encompassed and 'nearest' genes are reported without respect to strand - There are specific questions we want annotation to answer. We collect gene level annotations which describes things like what gene is near the breakpoint (useful in the case of a potential promoter swap); what genes @@ -357,8 +323,6 @@ computed. This is translated to a putative amino acid sequence from which protein metrics such as the possible ORFs and domain sequences can be computed. - - ## Predicting Splicing Patterns After the events have been called and an annotation has been attached, @@ -392,28 +356,17 @@ is paired with the 2nd donor site More complex examples are drawn below. There are five classifications -(`mavis.constants.SPLICE_TYPE`) for the +([`mavis.constants.SPLICE_TYPE`](../../package/mavis/constants/#class-mavisconstantssplice_type)) for the different splicing patterns: -1. Retained intron - (`mavis.constants.SPLICE_TYPE.RETAIN`{.interpreted-text - role="class"}) -2. Skipped exon (`mavis.constants.SPLICE_TYPE.SKIP`{.interpreted-text - role="attr"}) -3. Multiple retained introns - (`mavis.constants.SPLICE_TYPE.MULTI_RETAIN`{.interpreted-text - role="attr"}) -4. Multiple skipped exons - (`mavis.constants.SPLICE_TYPE.MULTI_SKIP`{.interpreted-text - role="attr"}) -5. Some combination of retained introns and skipped exons - (`mavis.constants.SPLICE_TYPE.COMPLEX`{.interpreted-text - role="attr"}) +1. Retained intron +2. Skipped exon +3. Multiple retained introns +4. Multiple skipped exons +5. 
Some combination of retained introns and skipped exons ![Splicing scenarios](../images/splicing_model.svg) - - ## Pairing Similar Events After breakpoints have been called and annotated we often need to see if @@ -430,7 +383,6 @@ rise to the following basic cases. breakpoint, or it is the same as the nearest retained donor/acceptor to the breakpoint. - ![exonic splicing](../images/breakpoint_prediction_exonic.svg) (A-D) The breakpoint lands in an exon and the five prime portion of diff --git a/docs/inputs/standard.md b/docs/inputs/standard.md index cd3d7e92..373a7cb1 100644 --- a/docs/inputs/standard.md +++ b/docs/inputs/standard.md @@ -1,40 +1,41 @@ # MAVIS standard input file format - These requirements pertain to the columns of input files from the various tools you want to merge. The input files should be tab-delimited text files. Comments at the top of may be included. Comments should -begin with two hash marks. They will be ignored when the file is read - +begin with hash marks. They will be ignored when the file is read - ## This is a comment +```text +## This is a comment +``` The header row contains the column names and is the first row following -the comments (or the first row if no comments are included). Optionally -the header row may (or may not) begin with a hash which will be stripped -out on read +the comments (or the first row if no comments are included). 
- ## This is a comment - ## this is another comment - # this is the header row +```text +## This is a comment +## this is another comment +# this is also a comment +This Is The Header +``` A simple input file might look as follows - ## File created at: 2018-01-02 - ## Generated by: MAVIS v1.0.0 - #break1_chromosome break1_position_start break1_position_end break2_chromosome break2_position_start break2_position_end - X 1234 1234 X 77965 77965 +```text +## File created at: 2018-01-02 +## Generated by: MAVIS v1.0.0 +break1_chromosome break1_position_start break1_position_end break2_chromosome break2_position_start break2_position_end +X 1234 1234 X 77965 77965 +``` ## Required Columns -- [break1_chromosome](../../outputs/columns/#break1_chromosome) -- [break1_position_start](../../outputs/columns/#break1_position_start) -- [break1_position_end](../../outputs/columns/#break1_position_end) (can be the - same as break1\_position\_start) -- [break2_chromosome](../../outputs/columns/#break2_chromosome) -- [break2_position_start](../../outputs/columns/#break2_position_start) -- [break2_position_end](../../outputs/columns/#break2_position_end) (can be the - same as break2\_position\_start) +- [break1_chromosome](../../outputs/columns/#break1_chromosome) +- [break1_position_start](../../outputs/columns/#break1_position_start) +- [break1_position_end](../../outputs/columns/#break1_position_end) (can be the same as break1\_position\_start) +- [break2_chromosome](../../outputs/columns/#break2_chromosome) +- [break2_position_start](../../outputs/columns/#break2_position_start) +- [break2_position_end](../../outputs/columns/#break2_position_end) (can be the same as break2\_position\_start) ## Optional Columns @@ -42,24 +43,15 @@ Optional Columns that are not given as input will be added with default (or command line parameter options) during the clustering stage of MAVIS as some are required for subsequent pipeline steps -- [break1_strand](../../outputs/columns/#break1_strand) 
(defaults to - not-specified during clustering) -- [break1_orientation](../../outputs/columns/#break1_orientation) (expanded to all - possible values during clustering) -- [break2_strand](../../outputs/columns/#break2_strand) (defaults to - not-specified during clustering) -- [break2_orientation](../../outputs/columns/#break2_orientation) (expanded to all - possible values during clustering) -- [opposing_strands](../../outputs/columns/#opposing_strands) (expanded to all - possible values during clustering) -- [stranded](../../outputs/columns/#stranded) (defaults to False during - clustering) -- [library](../../outputs/columns/#library) (defaults to command line - library parameter during clustering) -- [protocol](../../outputs/columns/#protocol) (defaults to command line - protocol parameter during clustering) -- [tools](../../outputs/columns/#tools) (defaults to an empty string - during clustering) +- [break1_strand](../../outputs/columns/#break1_strand) (defaults to not-specified during clustering) +- [break1_orientation](../../outputs/columns/#break1_orientation) (expanded to all possible values during clustering) +- [break2_strand](../../outputs/columns/#break2_strand) (defaults to not-specified during clustering) +- [break2_orientation](../../outputs/columns/#break2_orientation) (expanded to all possible values during clustering) +- [opposing_strands](../../outputs/columns/#opposing_strands) (expanded to all possible values during clustering) +- [stranded](../../outputs/columns/#stranded) (defaults to False during clustering) +- [library](../../outputs/columns/#library) (defaults to command line library parameter during clustering) +- [protocol](../../outputs/columns/#protocol) (defaults to command line protocol parameter during clustering) +- [tools](../../outputs/columns/#tools) (defaults to an empty string during clustering) ## Summary by Pipeline Step diff --git a/docs/outputs/illustrations.md b/docs/outputs/illustrations.md index f419db26..da7fb6a3 100644 
--- a/docs/outputs/illustrations.md +++ b/docs/outputs/illustrations.md @@ -5,33 +5,29 @@ These are diagrams produced during the annotate step. These represent the putative fusion events of a single breakpoint pair. -![](../images/GIMAP4_IL7_fusion.svg) +![fusion diagram](../images/GIMAP4_IL7_fusion.svg) Fusion from transcriptome data. Intronic breakpoints here indicate retained intron sequence and a novel exon is predicted. - If the [draw_fusions_only](../../configuration/settings/#draw_fusions_only flag is set to False then all events will produce a diagram, even anti-sense fusions -![](../images/UBE2V2_GIMAP4_disruptive_fusion.svg) +![disruptive fusion diagram](../images/UBE2V2_GIMAP4_disruptive_fusion.svg) Disruptive Anti-sense Fusion - ## Transcript Overlays MAVIS supports generating diagrams of all transcripts for a given gene. These can be overlaid with markers and bam\_file pileup data. This is particularly useful for visualizing splice site mutations. -![](../images/ENSG00000139687_RB1_overlay.png) - -RB1 splice site mutation results in skipping of exon -9 +![overlay diagram](../images/ENSG00000139687_RB1_overlay.png) +RB1 splice site mutation results in skipping of exon 9 The above diagram was generated using the overlay command From d5130336e89d7bc5572a1371d098588ac7b03018 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Fri, 30 Apr 2021 11:18:36 -0700 Subject: [PATCH 036/137] Use relative links --- docs/outputs/columns.md | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/docs/outputs/columns.md b/docs/outputs/columns.md index 3dfd797f..14cdcdd2 100644 --- a/docs/outputs/columns.md +++ b/docs/outputs/columns.md @@ -3,7 +3,6 @@ List of column names and their definitions. The types indicated here are the expected types in a row for a given column name. 
- ## library Identifier for the library/source @@ -34,7 +33,7 @@ decision from the annotation step ## event\_type -**type**: [`mavis.constants.SVTYPE`](/package/mavis/constants/#class-mavisconstantssvtype) +**type**: [`mavis.constants.SVTYPE`](../package/mavis/constants/#class-mavisconstantssvtype) The classification of the event @@ -57,7 +56,7 @@ Gene for the current annotation at the first breakpoint ## gene1\_direction -**type**: [`mavis.constants.PRIME`](/package/mavis/constants/#class-mavisconstantsprime) +**type**: [`mavis.constants.PRIME`](../package/mavis/constants/#class-mavisconstantsprime) The direction/prime of the gene @@ -68,7 +67,7 @@ Gene for the current annotation at the second breakpoint ## gene2\_direction -**type**: [`mavis.constants.PRIME`](/package/mavis/constants/#class-mavisconstantsprime) +**type**: [`mavis.constants.PRIME`](../package/mavis/constants/#class-mavisconstantsprime) The direction/prime of the gene. Has the following possible values @@ -85,16 +84,11 @@ second breakpoint ## gene\_product\_type -**type**: [`mavis.constants.GENE_PRODUCT_TYPE`](/package/mavis/constants/#class-mavisconstantsgene_product_type) +**type**: [`mavis.constants.GENE_PRODUCT_TYPE`](../package/mavis/constants/#class-mavisconstantsgene_product_type) Describes if the putative fusion product will be sense or anti-sense -## fusion\_cdna\_coding\_end - -Position wrt the 5' end of the fusion transcript where coding ends -last base of the stop codon - ## transcript1 Transcript for the current annotation at the first breakpoint @@ -105,7 +99,7 @@ Transcript for the current annotation at the second breakpoint ## fusion\_splicing\_pattern -**type**: [`mavis.constants.SPLICE_TYPE`](/package/mavis/constants/#class-mavisconstantsslice_type) +**type**: [`mavis.constants.SPLICE_TYPE`](../package/mavis/constants/#class-mavisconstantsslice_type) Type of splicing pattern used to create the fusion cDNA. 
@@ -206,14 +200,14 @@ End integer inclusive ## break1\_orientation -**type**: [`mavis.constants.ORIENT`](/package/mavis/constants/#class-mavisconstantsorient) +**type**: [`mavis.constants.ORIENT`](../package/mavis/constants/#class-mavisconstantsorient) The side of the breakpoint wrt the positive/forward strand that is retained. ## break1\_strand -**type**: [`mavis.constants.STRAND`](/package/mavis/constants/#class-mavisconstantsstrand) +**type**: [`mavis.constants.STRAND`](../package/mavis/constants/#class-mavisconstantsstrand) The strand wrt to the reference positive/forward strand at this @@ -247,14 +241,14 @@ End integer inclusive ## break2\_orientation -**type**: [`mavis.constants.ORIENT`](/package/mavis/constants/#class-mavisconstantsorient) +**type**: [`mavis.constants.ORIENT`](../package/mavis/constants/#class-mavisconstantsorient) The side of the breakpoint wrt the positive/forward strand that is retained. ## break2\_strand -**type**: [`mavis.constants.STRAND`](/package/mavis/constants/#class-mavisconstantsstrand) +**type**: [`mavis.constants.STRAND`](../package/mavis/constants/#class-mavisconstantsstrand) The strand wrt to the reference positive/forward strand at this @@ -284,7 +278,7 @@ protocol was strand specific or not. 
Expects a boolean ## protocol -**type**: [`mavis.constants.PROTOCOL`](/package/mavis/constants/#class-mavisconstantsprotocol) +**type**: [`mavis.constants.PROTOCOL`](../package/mavis/constants/#class-mavisconstantsprotocol) Specifies the type of library @@ -406,7 +400,7 @@ event ## call\_method -**type**: [`mavis.constants.CALL_METHOD`](/package/mavis/constants/#class-mavisconstantscall_method) +**type**: [`mavis.constants.CALL_METHOD`](../package/mavis/constants/#class-mavisconstantscall_method) The method used to call the breakpoints From 06230daf7bf0249cf30fba8ff8dbc2f645586fba Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Fri, 30 Apr 2021 12:54:45 -0700 Subject: [PATCH 037/137] Use new version of mavis_config --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cf458d52..7b2c3a03 100644 --- a/setup.py +++ b/setup.py @@ -90,7 +90,7 @@ def check_nonpython_dependencies(): 'pyvcf==0.6.8', 'shortuuid>=0.5.0', 'svgwrite', - 'mavis_config==1.0.0', + 'mavis_config>=1.1.0, <2.0.0', ] DEPLOY_REQS = ['twine', 'm2r', 'wheel'] From b3d5256bae389d0f221c35fe980134cf357afb22 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Sat, 1 May 2021 13:46:45 -0700 Subject: [PATCH 038/137] Remove unused functions --- src/mavis/config.py | 3 +- src/mavis/util.py | 57 +------------------------------- tests/end_to_end/test_convert.py | 6 ++-- 3 files changed, 6 insertions(+), 60 deletions(-) diff --git a/src/mavis/config.py b/src/mavis/config.py index 38fcb500..a0b4341a 100644 --- a/src/mavis/config.py +++ b/src/mavis/config.py @@ -4,6 +4,7 @@ from typing import Dict, Optional import snakemake +from mavis_config import bash_expands from snakemake.exceptions import WorkflowError from snakemake.utils import validate as snakemake_validate @@ -11,7 +12,7 @@ from .bam import stats from .bam.cache import BamCache from .constants import INTEGER_COLUMNS, PROTOCOL, SUBCOMMAND, float_fraction -from .util import bash_expands, cast_boolean, 
filepath +from .util import cast_boolean, filepath def calculate_bam_stats(config: Dict, library_name: str) -> Dict: diff --git a/src/mavis/util.py b/src/mavis/util.py index 8a3f2d41..7f43be00 100644 --- a/src/mavis/util.py +++ b/src/mavis/util.py @@ -12,7 +12,7 @@ from typing import Any, Callable, Dict, List, Optional, Set import pandas as pd -from braceexpand import braceexpand +from mavis_config import bash_expands from shortuuid import uuid from .breakpoint import Breakpoint, BreakpointPair @@ -144,45 +144,6 @@ def soft_cast(value, cast_type): return cast_null(value) -def get_env_variable(arg, default, cast_type=None): - """ - Args: - arg (str): the argument/variable name - Returns: - the setting from the environment variable if given, otherwise the default value - """ - if cast_type is None: - cast_type = type(default) - name = ENV_VAR_PREFIX + str(arg).upper() - result = os.environ.get(name, None) - if result is not None: - return cast(result, cast_type) - return default - - -def bash_expands(*expressions): - """ - expand a file glob expression, allowing bash-style brackets. - - Returns: - list: a list of files - - Example: - >>> bash_expands('./{test,doc}/*py') - [...] 
- """ - result = [] - for expression in expressions: - eresult = [] - for name in braceexpand(expression): - for fname in glob(name): - eresult.append(fname) - if not eresult: - raise FileNotFoundError('The expression does not match any files', expression) - result.extend(eresult) - return [os.path.abspath(f) for f in result] - - def log_arguments(args): """ output the arguments to the console @@ -375,22 +336,6 @@ def filter_uninformative(annotations_by_chr, breakpoint_pairs, max_proximity=500 return result, filtered -def unique_exists( - pattern: str, allow_none: bool = False, get_newest: bool = False -) -> Optional[str]: - result = bash_expands(pattern) - if len(result) == 1: - return result[0] - elif result: - if get_newest: - return max(result, key=lambda x: os.stat(x).st_mtime) - raise OSError('duplicate results:', result) - elif allow_none: - return None - else: - raise OSError('no result found', pattern) - - def read_bpp_from_input_file( filename: str, expand_orient: bool = False, diff --git a/tests/end_to_end/test_convert.py b/tests/end_to_end/test_convert.py index 514fae52..95b45309 100644 --- a/tests/end_to_end/test_convert.py +++ b/tests/end_to_end/test_convert.py @@ -8,9 +8,9 @@ from mavis.constants import ORIENT, SUBCOMMAND, SVTYPE from mavis.main import main from mavis.tools import SUPPORTED_TOOL -from mavis.util import read_bpp_from_input_file, unique_exists +from mavis.util import read_bpp_from_input_file -from ..util import get_data +from ..util import get_data, glob_exists TEMP_OUTPUT = None @@ -40,7 +40,7 @@ def run_main(self, inputfile, file_type, strand_specific=False): with patch.object(sys, 'argv', args): main() print('output', outputfile) - assert unique_exists(outputfile) + assert glob_exists(outputfile, n=1) result = {} for pair in read_bpp_from_input_file(outputfile): result.setdefault(pair.data['tracking_id'], []).append(pair) From 899d8cd1469798cdbf9ab675f045a17c6ca47771 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Sat, 1 May 
2021 13:49:22 -0700 Subject: [PATCH 039/137] BugFix update data column access --- src/mavis/summary/summary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mavis/summary/summary.py b/src/mavis/summary/summary.py index 5287d7d8..f30089b4 100644 --- a/src/mavis/summary/summary.py +++ b/src/mavis/summary/summary.py @@ -290,7 +290,7 @@ def filter_by_evidence( removed.append(bpp) continue elif bpp.column('call_method') == CALL_METHOD.SPAN: - if bpp.spanning_reads < filter_min_spanning_reads: + if bpp.column('spanning_reads') < filter_min_spanning_reads: removed.append(bpp) continue elif bpp.column('call_method') == CALL_METHOD.SPLIT: From 21507f0708bc72ca9a5feaa9550551e0d66d34e3 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Sat, 1 May 2021 13:49:47 -0700 Subject: [PATCH 040/137] Test bwa and blat in mini workflow --- tests/snakemake/test_mini_workflow.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tests/snakemake/test_mini_workflow.py b/tests/snakemake/test_mini_workflow.py index 2751caac..963f339c 100644 --- a/tests/snakemake/test_mini_workflow.py +++ b/tests/snakemake/test_mini_workflow.py @@ -13,7 +13,7 @@ @pytest.fixture -def output_dir(): +def blat_output_dir(): temp_output = tempfile.mkdtemp() os.makedirs(os.path.join(temp_output, 'mavis/schemas')) @@ -21,13 +21,37 @@ def output_dir(): with open(package_relative_file('tests/mini-tutorial.config.json'), 'r') as fh: config = json.load(fh) config['output_dir'] = os.path.join(temp_output, 'output_dir') + config['validate.aligner'] = 'blat' with open(os.path.join(temp_output, 'mini-tutorial.config.json'), 'w') as fh: fh.write(json.dumps(config)) yield temp_output shutil.rmtree(temp_output) +@pytest.fixture +def bwa_output_dir(): + temp_output = tempfile.mkdtemp() + + os.makedirs(os.path.join(temp_output, 'mavis/schemas')) + + with open(package_relative_file('tests/mini-tutorial.config.json'), 'r') as fh: + config = json.load(fh) + 
config['output_dir'] = os.path.join(temp_output, 'output_dir') + config['validate.aligner'] = 'bwa mem' + config['reference.aligner_reference'] = config['reference.reference_genome'] + with open(os.path.join(temp_output, 'mini-tutorial.config.json'), 'w') as fh: + fh.write(json.dumps(config)) + yield temp_output + shutil.rmtree(temp_output) + + +@pytest.fixture +def output_dir(request): + return request.getfixturevalue(request.param) + + @long_running_test +@pytest.mark.parametrize('output_dir', ['blat_output_dir', 'bwa_output_dir'], indirect=True) def test_workflow(output_dir): argv = [ 'snakemake', From c054eafa1b75c80f2392bca4d0c853d86dae8475 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Sat, 1 May 2021 13:59:20 -0700 Subject: [PATCH 041/137] Remove pandas --- README.md | 2 +- docs/install.md | 2 +- docs/tutorials/mini.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c3f18975..b6e6ef45 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ by singularity will take care of installing the aligner as well. ```bash pip install -U setuptools pip -pip install mavis_config pandas # also installs snakemake +pip install mavis_config # also installs snakemake ``` Now you will run mavis via Snakemake as follows diff --git a/docs/install.md b/docs/install.md index badd817b..b3468f7a 100644 --- a/docs/install.md +++ b/docs/install.md @@ -17,7 +17,7 @@ by singularity will take care of installing the aligner as well. ```bash pip install -U setuptools pip -pip install mavis_config pandas # also installs snakemake +pip install mavis_config # also installs snakemake ``` Now you will run mavis via Snakemake as follows diff --git a/docs/tutorials/mini.md b/docs/tutorials/mini.md index 37a8a6ec..27b5f51f 100644 --- a/docs/tutorials/mini.md +++ b/docs/tutorials/mini.md @@ -25,7 +25,7 @@ example, it can easily be run locally. 
However in order to run the snakemake fil to have the config validation module `mavis_config` installed which has minimal dependencies. ```bash -pip install mavis_config pandas +pip install mavis_config ``` Now you are ready to run MAVIS. This can be done in a single command using snakemake. From 027c4039b21bf6d2c663c0f185d1a3ed62c8212c Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Sat, 1 May 2021 14:00:34 -0700 Subject: [PATCH 042/137] Remove leftover print --- Snakefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Snakefile b/Snakefile index 51504c5a..113dc2fb 100644 --- a/Snakefile +++ b/Snakefile @@ -38,7 +38,6 @@ except Exception as err: raise WorkflowError(short_msg) # ADD bindings for singularity -print(workflow.singularity_args) workflow.singularity_args = f'-B {",".join(get_singularity_bindings(config))}' libraries = sorted(list(config['libraries'])) From dd018541fab52f5c3a50d74420a52b422e6c75af Mon Sep 17 00:00:00 2001 From: Caleb Choo Date: Wed, 11 Aug 2021 11:47:18 -0700 Subject: [PATCH 043/137] #254 fix nan values confusing downstream None checks --- src/mavis/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mavis/util.py b/src/mavis/util.py index 7f43be00..abd55c34 100644 --- a/src/mavis/util.py +++ b/src/mavis/util.py @@ -390,6 +390,7 @@ def soft_null_cast(value): comment='#', na_values=['None', 'none', 'N/A', 'n/a', 'null', 'NULL', 'Null', 'nan', '', 'NaN'], ) + df = df.where(pd.notnull(df), None) except pd.errors.EmptyDataError: return [] From 94b1d1463867f8ad814cf09ac93b8aca4ace5589 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 20 Dec 2021 13:03:16 -0800 Subject: [PATCH 044/137] Create LICENSE --- LICENSE | 674 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 674 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..f288702d --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + 
Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. 
+ + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. 
+ + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. 
+ + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. 
Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. 
This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>. 
From 579de590c27426867001bdc4aa272d917b1dd72c Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 20 Dec 2021 13:03:51 -0800 Subject: [PATCH 045/137] Remove Old License --- LICENSE.txt | 230 ---------------------------------------------------- 1 file changed, 230 deletions(-) delete mode 100644 LICENSE.txt diff --git a/LICENSE.txt b/LICENSE.txt deleted file mode 100644 index 20586469..00000000 --- a/LICENSE.txt +++ /dev/null @@ -1,230 +0,0 @@ -MAVIS -Copyright 2017 Canada's Michael Smith Genome Sciences Centre - -BC CANCER AGENCY SOFTWARE LICENSE AGREEMENT (ACADEMIC USE) -CAREFULLY READ THE FOLLOWING TERMS AND CONDITIONS. This License -Agreement (the "Agreement") is a legal contract between you, your -employer, educational institution or organization (collectively, "You") -and the British Columbia Cancer Agency ("BCCA") with respect to the -license of the software, including all associated documentation -(collectively, the "Product"). - -BCCA is willing to license the Product to You only if You accept the -terms and conditions of this Agreement. By clicking on the "I ACCEPT" -button, or by copying, downloading, accessing or otherwise using the -Product, You automatically agree to be bound by the terms of this -Agreement. IF YOU DO NOT WISH TO BE BOUND BY THE TERMS OF THIS -AGREEMENT, DO NOT COPY, DOWNLOAD, ACCESS OR OTHERWISE USE THE -PRODUCT. - -1. AUTHORITY: In the event that You are an educational institution or -organization, Your representative who is clicking the "I ACCEPT" -button, or otherwise copying, downloading, accessing or using the -Product hereby, in their personal capacity, represents and warrants -that they possess the legal authority to enter into this Agreement -on Your behalf and to bind You to the terms of this Agreement. - -2. 
LICENSE TO USE: BCCA hereby grants to You a personal, non-exclusive, -non-transferable, limited license to use the Product solely for -internal, non-commercial use for non-profit research or educational -purposes only on the terms and conditions contained in this Agreement. -The Product may be installed at a single site at Your premises only. A -copy of the Product installed on a single common machine or cluster of -machines may be shared for internal use by Qualified Users only. In -order to be a "Qualified User", an individual must be a student, -researcher, professor, instructor or staff member of a non-profit -educational institution or organization who uses the Product solely for -non-profit research or educational purposes. - -3. RESTRICTIONS: You acknowledge and agree that You shall not, and -shall not authorize any third party to: -(a) make copies of the Product, except as provided in Section 2 and -except for a single backup copy, and any such copy together with the -original must be kept in Your possession or control; -(b) modify, adapt, decompile, disassemble, translate into another -computer language, create derivative works of, or otherwise reverse -engineer the Product, or disclose any trade secrets relating to the -Product, except as permitted in Section 5; -(c) license, sublicense, distribute, sell, lease, transfer, assign, -trade, rent or publish the Product or any part thereof and/or copies -thereof, to any third party; -(d) use the Product to process any data other than Your own; -(e) use the Product or any part thereof for any commercial or -for-profit purpose or any other purpose other than as permitted in -Section 2; or -(f) use, without its express permission, the name of BCCA. - -4. 
INTELLECTUAL PROPERTY RIGHTS: Subject to Section 5 below, all -patents, copyrights, trade secrets, service marks, trademarks and -other proprietary rights in or related to the Product and any -improvements, modifications and enhancements thereof are and will -remain the exclusive property of BCCA or its licensors. You agree -that You will not, either during or after the termination of this -Agreement, contest or challenge the title to or the intellectual -property rights of BCCA or its licensors in the Product or any -portion thereof. - -5. OWNERSHIP OF IMPROVEMENTS: In the event that the Product, in the -form provided to You, includes source code (the "Source Code"), -You are entitled to make improvements, modifications and -enhancements to the Source Code (collectively, "Improvements") -which Improvements are to be used by You for non-profit research -and educational purposes only and You shall be the owner of those -Improvements that You directly make and of all intellectual -property rights to such Improvements, subject to the foregoing -limits on Your use and distribution of such Improvements. You -hereby grant to BCCA a perpetual, non-exclusive, worldwide, -fully-paid, irrevocable license to use such Improvements for any -purposes whatsoever, and to sublicense such Improvements including -the right for third parties to sublicense the same, in perpetuity -to the extent such rights are not limited in duration under -applicable law, without identifying or seeking Your -consent. Notwithstanding the foregoing, You acknowledge that BCCA -and its licensors will retain or own all rights in and to any -pre-existing code or other technology, content and data that may be -incorporated in the Improvements. 
For greater certainty, this -Section applies solely to the Source Code and shall not give You -any rights with respect to the object code or any other portion or -format of the Product which use, for greater certainty, is limited -as set forth in this Agreement including as set out in Section 3(b) -above. You acknowledge and agree that you will provide copies of -Improvements to BCCA in such format as reasonably requested by BCCA -at any time upon the request of BCCA. - -6. CONFIDENTIALITY: You acknowledge that the Product is and -incorporates confidential and proprietary information developed, -acquired by or licensed to BCCA. You will take all reasonable -precautions necessary to safeguard the confidentiality of the -Product, and will not disclose any information about the Product to -any other person without BCCA's prior written consent. You will -not allow the removal or defacement of any confidential or -proprietary notice placed on the Product. You acknowledge that any -breach of this Section 6 will cause irreparable harm to BCCA and -its licensors. - -7. NO WARRANTIES: THIS PRODUCT IS PROVIDED TO YOU BY BCCA IN ORDER TO -ALLOW YOU TO OBTAIN ACCESS TO LEADING ACADEMIC RESEARCH. THE PRODUCT -IS PROVIDED TO YOU ON AN "AS IS" BASIS WITHOUT WARRANTY OF ANY -KIND. NO WARRANTY, REPRESENTATION OR CONDITION EITHER EXPRESS OR -IMPLIED, INCLUDING WITHOUT LIMITATION, ANY IMPLIED WARRANTY OR -CONDITION OF MERCHANTABILITY, NON-INFRINGEMENT, PERFORMANCE, -DURABILITY OR FITNESS FOR A PARTICULAR PURPOSE OR USE SHALL -APPLY. BCCA DOES NOT WARRANT THAT THE PRODUCT WILL OPERATE ON A -CONTINUOUS OR TROUBLE FREE BASIS. - -8. LIMITATION OF LIABILITY: TO THE MAXIMUM EXTENT PERMITTED BY -APPLICABLE LAW, IN NO EVENT SHALL THE AGGREGATE LIABILITY OF BCCA TO -YOU EXCEED THE AMOUNT YOU HAVE PAID TO ACQUIRE THE PRODUCT ("MAXIMUM -AMOUNT") AND WHERE YOU HAVE NOT PAID ANY AMOUNT FOR THE PRODUCT THEN -THE MAXIMUM AMOUNT SHALL BE DEEMED TO BE CDN$100.00. 
IN NO EVENT SHALL -BCCA BE LIABLE FOR ANY INDIRECT, INCIDENTAL, CONSEQUENTIAL, OR SPECIAL -DAMAGES, INCLUDING WITHOUT LIMITATION ANY DAMAGES FOR LOST PROFITS OR -SAVINGS, REGARDLESS OF WHETHER THEY HAVE BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. EXCEPT TO THE EXTENT THAT THE LAWS OF A -COMPETENT JURISDICTION REQUIRE LIABILITIES BEYOND AND DESPITE THESE -LIMITATIONS, EXCLUSIONS AND DISCLAIMERS, THESE LIMITATIONS, EXCLUSIONS -AND DISCLAIMERS SHALL APPLY WHETHER AN ACTION, CLAIM OR DEMAND ARISES -FROM A BREACH OF WARRANTY OR CONDITION, BREACH OF CONTRACT, -NEGLIGENCE, STRICT LIABILITY OR ANY OTHER KIND OF CIVIL OR STATUTORY -LIABILITY CONNECTED WITH OR ARISING FROM THIS AGREEMENT. YOU AGREE -THAT THE FOREGOING DISCLAIMER OF WARRANTIES AND LIMITATION OF -LIABILITY ARE FAIR IN LIGHT OF THE NATURE OF THE RIGHTS GRANTED HEREIN -AND THE AMOUNT OF FEES PAID BY YOU IN RESPECT OF THE PRODUCT. - -9. INDEMNITY: You will indemnify, defend and hold harmless BCCA, its -board of directors, staff and agents from and against any and all -liability, loss, damage, action, claim or expense (including -attorney's fees and costs at trial and appellate levels) in -connection with any claim, suit, action, demand or judgement -(collectively, "Claim") arising out of, connected with, resulting -from, or sustained as a result of Your use of the Product or the -downloading of the Product, including without limitation, any Claim -relating to infringement of BCCA's intellectual property rights or -the intellectual property rights of any third party. - -10. SUPPORT AND MAINTENANCE: You acknowledge and agree that, unless -and to the extent expressly agreed by BCCA in a separate written -document, the Product is provided to You without any support or -maintenance from BCCA and, for greater certainty, BCCA shall have -no obligation to issue any update or upgrade to any Product. - -11. TERM: This Agreement is effective until terminated. 
You may -terminate this Agreement at any time by ceasing use of the Product -and destroying or deleting any copies of the Product. This -Agreement will terminate immediately without notice from BCCA if -You fail to comply with any provision of this Agreement. BCCA may -terminate this Agreement at any time upon notice to you where BCCA -determines, in its sole discretion, that any continued use of the -Product could infringe the rights of any third parties. Upon -termination of this Agreement, and in any event upon BCCA -delivering You notice of termination, You shall immediately purge -all Products from Your computer system(s), return to BCCA all -copies of the Product that are in Your possession or control, and -cease any further development of any Improvements. On any -termination of this Agreement Sections 1, 4, 6, 7, 8, 9, 13 and 14 -shall survive such termination. - -12. GOVERNMENT END USERS: Where any of the Product is used, duplicated -or disclosed by or to the United States government or a government -contractor or sub contractor, it is provided with RESTRICTED -RIGHTS as defined in Title 48 CFR 52.227-19 and is subject to the -following: Title 48 CFR 2.101, 52.227-19, 227.7201 through -227.7202-4, FAR 52.227-14, and FAR 52.227-19(c)(1-2) and (6/87), -and where applicable, the customary software license, as described -in Title 48 CFR 227-7202 with respect to commercial software and -commercial software documentation including DFAR 252.227-7013, -DFAR 252,227-7014, DFAR 252.227-7015 and DFAR 252.7018, all as -applicable. - -13. USE OF THE DOWNLOAD SERVICE: You acknowledge and agree that you -will be responsible for all costs, charges and taxes (where -applicable) arising out of Your use of the Product and the -downloading of the Product. You acknowledge that You are -responsible for supplying any hardware or software necessary to -use the Product pursuant to this Agreement. - -14. 
GENERAL PROVISIONS: -(a) This Agreement will be governed by the laws of the Province of -British Columbia, and the laws of Canada applicable therein, excluding -any rules of private international law that lead to the application of -the laws of any other jurisdiction. The United Nations Convention on -Contracts for the International Sale of Goods (1980) does not apply to -this Agreement. The courts of the Province of British Columbia shall -have non-exclusive jurisdiction to hear any matter arising in -connection with this Agreement. -(b) USE OF THE PRODUCT IS PROHIBITED IN ANY JURISDICTION WHICH DOES -NOT GIVE EFFECT TO THE TERMS OF THIS AGREEMENT. -(c) You agree that no joint venture, partnership, employment, -consulting or agency relationship exists between You and BCCA as a -result of this Agreement or Your use of the Product. -(d) You hereby consent to Your contact information and any other -personally identifiable information that You provide to us being -disclosed to and maintained and used by us and our business partners -for the purposes of (i) managing and developing our respective -businesses and operations; (ii) marketing products and services to You -and your staff; and (iii) developing new and enhancing existing -products. You further agree that we may provide this information to -other persons as required to satisfy any legal requirements and to any -person that acquires some or all of the assets of BCCA. Where any of -the personally identifiable information that You provide to us is in -respect of individuals other than Yourself (such as Your staff) then -You represent and warrant to use that You have obtained all necessary -consents and authorizations from such individuals in order to comply -with this provision. Please see the BCCA website for further -information regarding personally identifiable information. -(e) This Agreement is the entire Agreement between You and BCCA -relating to this subject matter. 
You will not contest the validity of -this Agreement merely because it is in electronic form. No -modification of this Agreement will be binding, unless in writing and -accepted by an authorized representative of each party. -(f) The provisions of this Agreement are severable in that if any -provision in the Agreement is determined to be invalid or -unenforceable under any controlling body of law, that will not affect -the validity or enforceability of the remaining provisions of the -Agreement. -(g) You agree to print out or download a copy of this Agreement and -retain it for Your records. -(h) You consent to the use of the English language in this Agreement. -(i) You may not assign this Agreement or any of Your rights or -obligations hereunder without BCCA's prior written consent. BCCA, at -its sole discretion may assign this Agreement without notice to You. From e0e0e6e9d446483c2388f7714517eb2dffbb2183 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 12:57:08 -0800 Subject: [PATCH 046/137] Fix name of license in manifest --- MANIFEST.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 8b270b97..2ab6e3c1 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,5 @@ recursive-include src *.py *.json include README.md -include LICENSE.txt +include LICENSE prune docs/build prune docs/source/auto From dea842eaf404c5c249edb04a7d41f31492f61e19 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 11:17:47 -0800 Subject: [PATCH 047/137] Install missing openssl library --- .github/workflows/build.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a328130c..b3cbc8bd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,6 +19,8 @@ jobs: name: python-${{ matrix.python-version }} steps: - uses: actions/checkout@v2 + - name: install machine dependencies + run: sudo apt-get install -y 
libcurl4-openssl-dev - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: From f143720a61014fd39de81f778f7a17020d926c23 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 11:11:12 -0800 Subject: [PATCH 048/137] Remove unused dep pyvcf --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 7b2c3a03..7dcb3ae7 100644 --- a/setup.py +++ b/setup.py @@ -87,7 +87,6 @@ def check_nonpython_dependencies(): 'numpy>=1.13.1', 'pandas>=1.1, <2', 'pysam>=0.9, <=0.15.2', - 'pyvcf==0.6.8', 'shortuuid>=0.5.0', 'svgwrite', 'mavis_config>=1.1.0, <2.0.0', From 6af3454075be56172653f640635b38891daf1ab4 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 16:02:05 -0800 Subject: [PATCH 049/137] use pandas instead of pysam to read vcfs resolves: #265 --- .gitignore | 1 + setup.py | 2 +- src/mavis/tools/vcf.py | 143 +++++++++++++++++++++++++++++++---- tests/data/manta_events.vcf | 4 +- tests/unit/test_tools_vcf.py | 9 +++ 5 files changed, 142 insertions(+), 17 deletions(-) create mode 100644 tests/unit/test_tools_vcf.py diff --git a/.gitignore b/.gitignore index 0745a3b2..1f4c4214 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ junit .tox *eggs/ .mypy_cache +.snakemake # aligners blat diff --git a/setup.py b/setup.py index 7dcb3ae7..4afe997a 100644 --- a/setup.py +++ b/setup.py @@ -86,7 +86,7 @@ def check_nonpython_dependencies(): 'networkx==1.11.0', 'numpy>=1.13.1', 'pandas>=1.1, <2', - 'pysam>=0.9, <=0.15.2', + 'pysam', 'shortuuid>=0.5.0', 'svgwrite', 'mavis_config>=1.1.0, <2.0.0', diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index 4ffa1e2b..77a2e22a 100644 --- a/src/mavis/tools/vcf.py +++ b/src/mavis/tools/vcf.py @@ -1,12 +1,56 @@ +import logging import re -from typing import Dict, List, Tuple +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple +import pandas as pd from pysam import VariantFile +from typing_extensions import 
TypedDict from ..constants import COLUMNS, ORIENT, SVTYPE from ..util import DEVNULL from .constants import SUPPORTED_TOOL +PANDAS_DEFAULT_NA_VALUES = [ + '-1.#IND', + '1.#QNAN', + '1.#IND', + '-1.#QNAN', + '#N/A', + 'N/A', + 'NA', + '#NA', + 'NULL', + 'NaN', + '-NaN', + 'nan', + '-nan', +] + + +class VcfInfoType(TypedDict, total=False): + SVTYPE: str + CHR2: str + CIPOS: Tuple[int, int] + CIEND: Tuple[int, int] + CT: str + END: Optional[int] + PRECISE: bool + + +@dataclass +class VcfRecordType: + id: str + pos: int + chrom: str + alts: List[Optional[str]] + info: VcfInfoType + ref: str + + @property + def stop(self) -> Optional[int]: + return self.info.get('END', self.pos) + def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: """ @@ -88,6 +132,7 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: - duplication: 5to3 """ records = [] + for alt in record.alts if record.alts else [None]: info = {} for key in record.info.keys(): @@ -106,7 +151,7 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: if record.id and record.id != 'N': # to account for NovoBreak N in the ID field std_row['id'] = record.id - if info.get('SVTYPE', None) == 'BND': + if info.get('SVTYPE') == 'BND': chr2, end, orient1, orient2, ref, alt = parse_bnd_alt(alt) std_row[COLUMNS.break1_orientation] = orient1 std_row[COLUMNS.break2_orientation] = orient2 @@ -172,6 +217,82 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: return records +def convert_pandas_rows_to_variants(df): + def parse_info(info_field): + info = {} + for pair in info_field.split(';'): + if '=' in pair: + key, value = pair.split('=', 1) + info[key] = value + else: + info[pair] = True + + # convert info types + for key in info: + if key in {'CIPOS', 'CIEND'}: + ci_start, ci_end = info[key].split(',') + info[key] = (int(ci_start), int(ci_end)) + elif key == 'END': + info[key] = int(info[key]) + + return info + + df['info'] = 
df['INFO'].apply(parse_info) + df['alts'] = df['ALT'].apply(lambda a: a.split(',')) + + rows = [] + for _, row in df.iterrows(): + + rows.append( + VcfRecordType( + id=row['ID'], + pos=row['POS'], + info=VcfInfoType(row['info']), + chrom=row['CHROM'], + ref=row['REF'], + alts=row['alts'], + ) + ) + return rows + + +def pandas_vcf(input_file) -> Tuple[List[str], pd.DataFrame]: + """ + Read a standard vcf file into a pandas dataframe + """ + # read the comment/header information + header_lines = [] + with open(input_file, 'r') as fh: + line = '##' + while line.startswith('##'): + header_lines.append(line) + line = fh.readline().strip() + header_lines = header_lines[1:] + # read the data + df = pd.read_csv( + input_file, + sep='\t', + skiprows=len(header_lines), + dtype={ + 'CHROM': str, + 'POS': int, + 'ID': str, + 'INFO': str, + 'FORMAT': str, + 'REF': str, + 'ALT': str, + }, + na_values=PANDAS_DEFAULT_NA_VALUES + ['.'], + ) + df = df.rename(columns={df.columns[0]: df.columns[0].replace('#', '')}) + required_columns = ['CHROM', 'INFO', 'POS', 'REF', 'ALT', 'ID'] + for col in required_columns: + if col not in df.columns: + raise KeyError(f'Missing required column: {col}') + # convert the format fields using the header + return header_lines, df + + def convert_file(input_file: str, file_type: str, log): """process a VCF file @@ -183,18 +304,12 @@ def convert_file(input_file: str, file_type: str, log): err: [description] """ rows = [] - vfile = VariantFile(input_file) - try: - vfile.header.info.add('END', number=1, type='Integer', description='End of the interval') - except ValueError: - pass - for vcf_record in vfile.fetch(): + _, data = pandas_vcf(input_file) + + for variant_record in convert_pandas_rows_to_variants(data): try: - rows.extend(convert_record(vcf_record, log=log)) - except Exception as err: - if file_type != SUPPORTED_TOOL.STRELKA: - raise err - else: - log('Ignoring', vcf_record) + rows.extend(convert_record(variant_record, log=log)) + except 
NotImplementedError as err: + logging.warning(str(err)) return rows diff --git a/tests/data/manta_events.vcf b/tests/data/manta_events.vcf index b1f5200f..ac384904 100644 --- a/tests/data/manta_events.vcf +++ b/tests/data/manta_events.vcf @@ -133,5 +133,5 @@ 7 126098487 MantaINV:4:28281:28286:0:0:0 T . PASS END=126167443;SVTYPE=INV;SVLEN=68956;CIPOS=0,3;CIEND=-3,0;HOMLEN=3;HOMSEQ=ATG;INV5;SOMATIC;SOMATICSCORE=120 PR:SR 42,0:48,0 95,45:104,36 9 28031861 MantaINV:162252:2:3:0:0:0 A . PASS END=28034467;SVTYPE=INV;SVLEN=2606;CIPOS=0,1;CIEND=-1,0;HOMLEN=1;HOMSEQ=C;SVINSLEN=11;SVINSSEQ=TTTTCGGAATT;INV5;SOMATIC;SOMATICSCORE=104 PR:SR 45,0:42,0 41,19:26,19 X 31196943 MantaDEL:290420:0:1:0:0:0 A . PASS END=31216210;SVTYPE=DEL;SVLEN=-19267;SVINSLEN=8;SVINSSEQ=ATGTAGTG;SOMATIC;SOMATICSCORE=124 PR:SR 35,0:25,0 43,32:32,31 -1 17051724 MantaBND:207:0:1:0:0:0:0 C [1:234912188[GCCCCATC 36 PASS SVTYPE=BND;MATEID=MantaBND:207:0:1:0:0:0:1;SVINSLEN=7;SVINSSEQ=GCCCCAT;BND_DEPTH=5;MATE_BND_DEPTH=4 GT:FT:GQ:PL:PR:SR 0/1:PASS:30:86,0,28:1,2:3,1 . . . -1 234912188 MantaBND:207:0:1:0:0:0:1 A [1:17051724[ATGGGGCA 36 PASS SVTYPE=BND;MATEID=MantaBND:207:0:1:0:0:0:0;SVINSLEN=7;SVINSSEQ=ATGGGGC;BND_DEPTH=4;MATE_BND_DEPTH=5 GT:FT:GQ:PL:PR:SR 0/1:PASS:30:86,0,28:1,2:3,1 . . . +1 17051724 MantaBND:207:0:1:0:0:0:0 C [1:234912188[GCCCCATC 36 PASS SVTYPE=BND;MATEID=MantaBND:207:0:1:0:0:0:1;SVINSLEN=7;SVINSSEQ=GCCCCAT;BND_DEPTH=5;MATE_BND_DEPTH=4 GT:FT:GQ:PL:PR:SR 0/1:PASS:30:86,0,28:1,2:3,1 . . . +1 234912188 MantaBND:207:0:1:0:0:0:1 A [1:17051724[ATGGGGCA 36 PASS SVTYPE=BND;MATEID=MantaBND:207:0:1:0:0:0:0;SVINSLEN=7;SVINSSEQ=ATGGGGC;BND_DEPTH=4;MATE_BND_DEPTH=5 GT:FT:GQ:PL:PR:SR 0/1:PASS:30:86,0,28:1,2:3,1 . . . 
diff --git a/tests/unit/test_tools_vcf.py b/tests/unit/test_tools_vcf.py new file mode 100644 index 00000000..cffe9ade --- /dev/null +++ b/tests/unit/test_tools_vcf.py @@ -0,0 +1,9 @@ +from mavis.tools.vcf import pandas_vcf + +from ..util import get_data + + +def test_read_vcf(): + header, df = pandas_vcf(get_data('delly_events.vcf')) + assert len(header) == 63 + assert df.shape[0] == 31 From 034363ce84cfaabd9f8be46993f6b66e9586bac7 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 16:16:06 -0800 Subject: [PATCH 050/137] Drop support for 3.6, add 3.9 and 3.10 --- .github/workflows/build.yml | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b3cbc8bd..23aa2a5a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.7, 3.8, 3.9, 3.10] name: python-${{ matrix.python-version }} steps: - uses: actions/checkout@v2 @@ -92,7 +92,7 @@ jobs: - name: Install workflow dependencies run: | python -m pip install --upgrade pip setuptools wheel - pip install mavis_config pandas + pip install mavis_config pandas snakemake - uses: eWaterCycle/setup-singularity@v6 with: singularity-version: 3.6.4 diff --git a/setup.py b/setup.py index 4afe997a..015bb674 100644 --- a/setup.py +++ b/setup.py @@ -114,7 +114,7 @@ def check_nonpython_dependencies(): }, tests_require=TEST_REQS, setup_requires=['pip>=9.0.0', 'setuptools>=36.0.0'], - python_requires='>=3.6', + python_requires='>=3.7', author='Caralyn Reisle', author_email='creisle@bcgsc.ca', test_suite='tests', From 53535cfe46b7ac9683dd5954fb7e4bc78d7b01e3 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 16:21:36 -0800 Subject: [PATCH 051/137] Fix pointer to old license file --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile 
b/Dockerfile index 8b93e0c0..d3fbd1f4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,7 +29,7 @@ RUN wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat && \ COPY setup.py setup.py COPY setup.cfg setup.cfg COPY src src -COPY LICENSE.txt LICENSE.txt +COPY LICENSE LICENSE COPY README.md README.md # install python package From ecefdd850877a6349f609eef0c1d8226b4746a65 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 16:22:21 -0800 Subject: [PATCH 052/137] Quote python versions --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 23aa2a5a..b5809403 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.7, 3.8, 3.9, 3.10] + python-version: ["3.7", "3.8", "3.9", "3.10"] name: python-${{ matrix.python-version }} steps: - uses: actions/checkout@v2 From ed02f49831b35a20b3ccfc8bd6689e7d2fd7a82c Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 16:23:51 -0800 Subject: [PATCH 053/137] Also update quick tests --- .github/workflows/quick-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/quick-tests.yml b/.github/workflows/quick-tests.yml index c74ae1be..a1d333b4 100644 --- a/.github/workflows/quick-tests.yml +++ b/.github/workflows/quick-tests.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: ["3.7", "3.8", "3.9", "3.10"] name: python-${{ matrix.python-version }} quick steps: - uses: actions/checkout@v2 From 9b3c72c889594cd0cf3b6062b528792badaa7494 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 16:41:39 -0800 Subject: [PATCH 054/137] Only support 3.7/3.8 for now --- .github/workflows/build.yml | 2 +- .github/workflows/quick-tests.yml | 2 +- setup.cfg | 2 +- 3 files changed, 3 
insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b5809403..17b952f1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10"] + python-version: ["3.7", "3.8"] name: python-${{ matrix.python-version }} steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/quick-tests.yml b/.github/workflows/quick-tests.yml index a1d333b4..178eab7b 100644 --- a/.github/workflows/quick-tests.yml +++ b/.github/workflows/quick-tests.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10"] + python-version: ["3.7", "3.8"] name: python-${{ matrix.python-version }} quick steps: - uses: actions/checkout@v2 diff --git a/setup.cfg b/setup.cfg index 1df04ed1..afe52b99 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,7 +10,7 @@ process-timeout=600 [metadata] description-file = README.md -license_file = LICENSE.txt +license_file = LICENSE [bdist_wheel] universal = 1 From 16c3f3bff759c2a8cf0a2ba1573cfa4fe2d62ec0 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 21:49:05 -0800 Subject: [PATCH 055/137] Upgrade networkx to v2 --- setup.py | 2 +- src/mavis/assemble.py | 33 +++++++++++++++++++++------------ tests/unit/test_assemble.py | 19 +++++++++++-------- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/setup.py b/setup.py index 015bb674..fdeeaf77 100644 --- a/setup.py +++ b/setup.py @@ -83,7 +83,7 @@ def check_nonpython_dependencies(): 'biopython>=1.70, <1.78', 'braceexpand==0.1.2', 'colour', - 'networkx==1.11.0', + 'networkx>=2.5,<3', 'numpy>=1.13.1', 'pandas>=1.1, <2', 'pysam', diff --git a/src/mavis/assemble.py b/src/mavis/assemble.py index f7a9d301..0370ea01 100644 --- a/src/mavis/assemble.py +++ b/src/mavis/assemble.py @@ -66,6 +66,15 @@ class DeBruijnGraph(nx.DiGraph): enforces edge weights """ + def 
get_out_edges(self, *args, **kwargs): + return list(self.out_edges(*args, **kwargs)) + + def get_in_edges(self, *args, **kwargs): + return list(self.in_edges(*args, **kwargs)) + + def get_nodes(self, *args, **kwargs): + return list(self.nodes(*args, **kwargs)) + def get_edge_freq(self, n1, n2): """ returns the freq from the data attribute for a specified edge @@ -85,7 +94,7 @@ def add_edge(self, n1, n2, freq=1): nx.DiGraph.add_edge(self, n1, n2, freq=freq) def all_edges(self, *nodes, data=False): - return self.in_edges(*nodes, data=data) + self.out_edges(*nodes, data=data) + return self.get_in_edges(*nodes, data=data) + self.get_out_edges(*nodes, data=data) def trim_tails_by_freq(self, min_weight): """ @@ -95,7 +104,7 @@ def trim_tails_by_freq(self, min_weight): min_weight (int): the minimum weight for an edge to be retained """ ends = sorted( - [n for n in self.nodes() if self.out_degree(n) == 0 or self.in_degree(n) == 0] + [n for n in self.get_nodes() if self.out_degree(n) == 0 or self.in_degree(n) == 0] ) visited = set() @@ -126,16 +135,16 @@ def trim_forks_by_freq(self, min_weight): for all nodes in the graph, if the node has an out-degree > 1 and one of the outgoing edges has freq < min_weight. 
then that outgoing edge is deleted """ - nodes = [n for n in self.nodes() if self.degree(n) > 2] + nodes = [n for n in self.get_nodes() if self.degree(n) > 2] for node in sorted(nodes): if self.out_degree(node) > 1: - outgoing_edges = self.out_edges(node, data=True) + outgoing_edges = self.get_out_edges(node, data=True) best = max([e[2]['freq'] for e in outgoing_edges]) for src, tgt, data in outgoing_edges: if data['freq'] < min_weight and data['freq'] != best: self.remove_edge(src, tgt) if self.in_degree(node) > 1: - ingoing_edges = self.in_edges(node, data=True) + ingoing_edges = self.get_in_edges(node, data=True) best = max([e[2]['freq'] for e in ingoing_edges]) for src, tgt, data in ingoing_edges: if data['freq'] < min_weight and data['freq'] != best: @@ -157,7 +166,7 @@ def trim_noncutting_paths_by_freq(self, min_weight): else: path = [] while self.in_degree(src) == 1 and self.out_degree(src) == 1: - s, t, data = self.in_edges(src, data=True)[0] + s, t, data = self.get_in_edges(src, data=True)[0] if data['freq'] >= min_weight or s in path: break path.insert(0, src) @@ -165,7 +174,7 @@ def trim_noncutting_paths_by_freq(self, min_weight): path.insert(0, src) while self.in_degree(tgt) == 1 and self.out_degree(tgt) == 1: - s, t, data = self.out_edges(tgt, data=True)[0] + s, t, data = self.get_out_edges(tgt, data=True)[0] if data['freq'] >= min_weight or t in path: break path.append(tgt) @@ -193,7 +202,7 @@ def get_sinks(self, subgraph=None): """ nodeset = set() if subgraph is None: - subgraph = self.nodes() + subgraph = self.get_nodes() for node in subgraph: if self.out_degree(node) == 0: nodeset.add(node) @@ -205,7 +214,7 @@ def get_sources(self, subgraph=None): """ nodeset = set() if subgraph is None: - subgraph = self.nodes() + subgraph = self.get_nodes() for node in subgraph: if self.in_degree(node) == 0: nodeset.add(node) @@ -227,7 +236,7 @@ def digraph_connected_components(graph, subgraph=None): List[List]: returns a list of compnents which are lists of node 
names """ if subgraph is None: - subgraph = set(graph.nodes()) + subgraph = set(graph.get_nodes()) g = nx.Graph() for src, tgt in graph.edges(): if src in subgraph and tgt in subgraph: @@ -387,7 +396,7 @@ def assemble( for kmer in kmers_list: assembly.add_edge(kmer[:-1], kmer[1:]) # use the ab min edge weight to remove all low weight edges first - nodes = list(assembly.nodes()) + nodes = assembly.get_nodes() for n in nodes: if assembly.in_degree(n) == 0 and assembly.out_degree(n) == 0: assembly.remove_node(n) @@ -396,7 +405,7 @@ def assemble( subgraph = assembly.subgraph(component) if not nx.is_directed_acyclic_graph(subgraph): log('dropping cyclic component', time_stamp=False) - for node in subgraph.nodes(): + for node in subgraph.get_nodes(): assembly.remove_node(node) # initial data cleaning assembly.trim_forks_by_freq(min_edge_trim_weight) diff --git a/tests/unit/test_assemble.py b/tests/unit/test_assemble.py index fbd5d0cb..73b3c6bf 100644 --- a/tests/unit/test_assemble.py +++ b/tests/unit/test_assemble.py @@ -105,7 +105,7 @@ def test_trim_tails_by_freq_forks(self): g.add_edge(8, 7) g.add_edge(9, 8) g.trim_tails_by_freq(2) - assert sorted(g.nodes()) == [1, 2, 3, 4, 5, 6] + assert sorted(g.get_nodes()) == [1, 2, 3, 4, 5, 6] g = DeBruijnGraph() for s, t in itertools.combinations([1, 2, 3, 4, 5, 6], 2): @@ -117,7 +117,7 @@ def test_trim_tails_by_freq_forks(self): g.add_edge(8, 7) g.add_edge(9, 8) g.trim_tails_by_freq(2) - assert sorted(g.nodes()) == [1, 2, 3, 4, 5, 6, 7, 8] + assert sorted(g.get_nodes()) == [1, 2, 3, 4, 5, 6, 7, 8] g = DeBruijnGraph() for s, t in itertools.combinations([1, 2, 3, 4, 5, 6], 2): @@ -128,7 +128,7 @@ def test_trim_tails_by_freq_forks(self): g.add_edge(7, 8) g.add_edge(9, 8) g.trim_tails_by_freq(2) - assert sorted(g.nodes()) == [1, 2, 3, 4, 5, 6] + assert sorted(g.get_nodes()) == [1, 2, 3, 4, 5, 6] def test_add_edge(self): g = DeBruijnGraph() @@ -151,22 +151,25 @@ def test_trim_noncutting_paths_by_freq_degree_stop(self): for edge in 
g.edges(): print(edge) g.trim_noncutting_paths_by_freq(3) - assert g.nodes() == list(range(1, 9)) + path1[1:-1] + print('g.nodes', g.nodes) + assert g.get_nodes() == list(range(1, 9)) + path1[1:-1] + print('g.nodes', g.nodes) # add an equal weight path to force namesorting path2 = [5, 13, 14, 15, 16, 1] for s, t in zip(path2, path2[1:]): g.add_edge(s, t) - + print('g.nodes', g.nodes) g.trim_noncutting_paths_by_freq(3) - assert g.nodes() == list(range(1, 9)) + path2[1:-1] + print('g.nodes', g.nodes) + assert g.get_nodes() == list(range(1, 9)) + path2[1:-1] # add back the original path with a higher (but still low) weight for s, t in zip(path1, path1[1:]): g.add_edge(s, t, freq=2) g.trim_noncutting_paths_by_freq(3) - assert g.nodes() == list(range(1, 9)) + path1[1:-1] + assert g.get_nodes() == list(range(1, 9)) + path1[1:-1] # add the second path with 1 high weight edge path2 = [5, 13, 14, 15, 16, 1] @@ -175,7 +178,7 @@ def test_trim_noncutting_paths_by_freq_degree_stop(self): g.add_edge(14, 15, freq=6) g.trim_noncutting_paths_by_freq(3) - assert g.nodes() == list(range(1, 9)) + path2[1:-1] + assert g.get_nodes() == list(range(1, 9)) + path2[1:-1] @pytest.fixture From af634de4aafb36e2edeb83855da0ccb91fb8ec81 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 21:50:52 -0800 Subject: [PATCH 056/137] Add python 3.9/3.10 to workflows --- .github/workflows/build.yml | 2 +- .github/workflows/quick-tests.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 17b952f1..b5809403 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.7", "3.8"] + python-version: ["3.7", "3.8", "3.9", "3.10"] name: python-${{ matrix.python-version }} steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/quick-tests.yml b/.github/workflows/quick-tests.yml index 178eab7b..a1d333b4 
100644 --- a/.github/workflows/quick-tests.yml +++ b/.github/workflows/quick-tests.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.7", "3.8"] + python-version: ["3.7", "3.8", "3.9", "3.10"] name: python-${{ matrix.python-version }} quick steps: - uses: actions/checkout@v2 From 21366907cb1706941f77f70a599afde2a3c1f628 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 21:51:08 -0800 Subject: [PATCH 057/137] do not include docs/tests in dist --- MANIFEST.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 2ab6e3c1..c1af92d1 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,5 @@ recursive-include src *.py *.json include README.md include LICENSE -prune docs/build -prune docs/source/auto +prune docs +prune tests From a9415e28d99fa95b70afbb4a4011123db091849b Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 21:52:23 -0800 Subject: [PATCH 058/137] Remove m2r dependency --- setup.py | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/setup.py b/setup.py index fdeeaf77..6cb7600d 100644 --- a/setup.py +++ b/setup.py @@ -1,37 +1,14 @@ import os import re +from pathlib import Path from setuptools import find_packages, setup +this_directory = Path(__file__).parent +long_description = (this_directory / "README.md").read_text() VERSION = '2.2.8' -def parse_md_readme(): - """ - pypi won't render markdown. After conversion to rst it will still not render unless raw directives are removed - """ - try: - from m2r import parse_from_file - - rst_lines = parse_from_file('README.md').split('\n') - long_description = [ - '.. 
image:: http://mavis.bcgsc.ca/docs/latest/_static/acronym.svg\n\n|\n' - ] # backup since pip can't handle raw directives - i = 0 - while i < len(rst_lines): - if re.match(r'^..\s+raw::.*', rst_lines[i]): - i += 1 - while re.match(r'^(\s\s+|\t|$).*', rst_lines[i]): - i += 1 - else: - long_description.append(re.sub('>`_ ', '>`__ ', rst_lines[i])) # anonymous links - i += 1 - long_description = '\n'.join(long_description) - except (ImportError, OSError): - long_description = '' - return long_description - - def check_nonpython_dependencies(): """ check that the non-python dependencies have been installed. @@ -92,7 +69,7 @@ def check_nonpython_dependencies(): 'mavis_config>=1.1.0, <2.0.0', ] -DEPLOY_REQS = ['twine', 'm2r', 'wheel'] +DEPLOY_REQS = ['twine', 'wheel'] setup( @@ -103,7 +80,8 @@ def check_nonpython_dependencies(): package_dir={'': 'src'}, packages=find_packages(where='src'), description='A Structural Variant Post-Processing Package', - long_description=parse_md_readme(), + long_description=long_description, + long_description_content_type='text/markdown', install_requires=INSTALL_REQS, extras_require={ 'docs': DOC_REQS, From dd6f0e1b59c6944311e7f6f7be0557c95a9a6d19 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 21:58:07 -0800 Subject: [PATCH 059/137] Fix mapping import for 3.10 --- src/mavis/schemas/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/mavis/schemas/__init__.py b/src/mavis/schemas/__init__.py index a0568bac..f41bda5c 100644 --- a/src/mavis/schemas/__init__.py +++ b/src/mavis/schemas/__init__.py @@ -1,10 +1,14 @@ -import collections +try: + from collections import Mapping +except ImportError: + from collections.abc import Mapping + import os from snakemake.utils import validate as snakemake_validate -class ImmutableDict(collections.Mapping): +class ImmutableDict(Mapping): def __init__(self, data): self._data = data From b53ff6476a833104df104b8f950bf81b80957041 Mon Sep 17 00:00:00 2001 
From: Jeremy Fan Date: Tue, 28 Dec 2021 14:52:44 -0800 Subject: [PATCH 060/137] added edited toml files --- pyproject.toml | 1 + setup.cfg | 95 +++++++++++++++++++++++++++++++++++++++++++------- setup.py | 83 ++----------------------------------------- 3 files changed, 85 insertions(+), 94 deletions(-) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..9e5a9848 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1 @@ +build-backend = "setuptools.build_meta" diff --git a/setup.cfg b/setup.cfg index afe52b99..2459069c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,23 +1,92 @@ -[nosetests] -with-coverage=1 -cover-package=mavis,tab -cover-html=1 -cover-html-dir=coverage -cover-inclusive=1 -cover-erase=1 -processes=2 -process-timeout=600 - [metadata] -description-file = README.md +name = mavis +version = 2.2.10 +url = https://github.com/bcgsc/mavis.git +download_url = https://github.com/bcgsc/mavis/archive/v2.2.10.tar.gz +description = A Structural Variant Post-Processing Package +author_email = creisle@bcgsc.ca +author = Caralyn Reisle +maintainer_email = creisle@bcgsc.ca +maintainer = Caralyn Reisle +long_description = file: README.md, LICENSE +long_description_content_type = text/markdown license_file = LICENSE +project_urls = mavis = http://mavis.bcgsc.ca [bdist_wheel] universal = 1 [pycodestyle] -ignore = E501,W503,E203 +ignore = E501 + W503 + E203 statistics = True [flake8] -ignore = E501,W503,E203 +ignore = E501 + W503 + E203 + +[options] +packages = find: +python_requires = >=3.2 +dependency_links = [] +include_package_data = True +install_requires = + Distance>=0.1.3 + Shapely>=1.6.4.post1 + biopython>=1.70, <1.78 + braceexpand==0.1.2 + colour + networkx>=2.5,<3 + numpy>=1.13.1 + pysam + shortuuid>=0.5.0 + svgwrite +setup_requires = + pip>=9.0.0 + setuptools>=36.0.0 + +[options.packages.find] +exclude = + tests + +[options.extras_require] +doc = + mkdocs==1.1.2 + markdown-refdocs + mkdocs-material==5.4.0 + 
markdown-include + mkdocs-simple-hooks==0.1.2 +test = + timeout-decorator>=0.3.3 + coverage>=4.2 + pycodestyle>=2.3.1 + pytest + pytest-cov +dev = + black + flake8 + twine + wheel + timeout-decorator>=0.3.3 + coverage>=4.2 + pycodestyle>=2.3.1 + pytest + pytest-cov + mkdocs==1.1.2 + markdown-refdocs + mkdocs-material==5.4.0 + markdown-include + mkdocs-simple-hooks==0.1.2 +deploy = + twine + wheel +tools = + pyensembl + simplejson + +[options.entry_points] +console_scripts = + mavis = mavis.main:main + calculate_ref_alt_counts = tools.calculate_ref_alt_counts:main diff --git a/setup.py b/setup.py index 6cb7600d..853de05b 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,7 @@ import os import re -from pathlib import Path -from setuptools import find_packages, setup - -this_directory = Path(__file__).parent -long_description = (this_directory / "README.md").read_text() -VERSION = '2.2.8' +from setuptools import setup def check_nonpython_dependencies(): @@ -31,79 +26,5 @@ def check_nonpython_dependencies(): print('Found: aligner at', pth) -# HSTLIB is a dependency for pysam. 
-# The cram file libraries fail for some OS versions and mavis does not use cram files so we disable these options -os.environ['HTSLIB_CONFIGURE_OPTIONS'] = '--disable-lzma --disable-bz2 --disable-libcurl' - - -TEST_REQS = [ - 'timeout-decorator>=0.3.3', - 'coverage>=4.2', - 'pycodestyle>=2.3.1', - 'pytest', - 'pytest-cov', -] - - -DOC_REQS = [ - 'mkdocs==1.1.2', - 'markdown_refdocs', - 'mkdocs-material==5.4.0', - 'markdown-include', - 'mkdocs-simple-hooks==0.1.2', -] - - -INSTALL_REQS = [ - 'Distance>=0.1.3', - 'Shapely>=1.6.4.post1', - 'biopython>=1.70, <1.78', - 'braceexpand==0.1.2', - 'colour', - 'networkx>=2.5,<3', - 'numpy>=1.13.1', - 'pandas>=1.1, <2', - 'pysam', - 'shortuuid>=0.5.0', - 'svgwrite', - 'mavis_config>=1.1.0, <2.0.0', -] - -DEPLOY_REQS = ['twine', 'wheel'] - - -setup( - name='mavis', - version='{}'.format(VERSION), - url='https://github.com/bcgsc/mavis.git', - download_url='https://github.com/bcgsc/mavis/archive/v{}.tar.gz'.format(VERSION), - package_dir={'': 'src'}, - packages=find_packages(where='src'), - description='A Structural Variant Post-Processing Package', - long_description=long_description, - long_description_content_type='text/markdown', - install_requires=INSTALL_REQS, - extras_require={ - 'docs': DOC_REQS, - 'test': TEST_REQS, - 'dev': ['black==20.8b1', 'flake8'] + DOC_REQS + TEST_REQS + DEPLOY_REQS, - 'deploy': DEPLOY_REQS, - 'tools': ['pyensembl', 'simplejson'], - }, - tests_require=TEST_REQS, - setup_requires=['pip>=9.0.0', 'setuptools>=36.0.0'], - python_requires='>=3.7', - author='Caralyn Reisle', - author_email='creisle@bcgsc.ca', - test_suite='tests', - entry_points={ - 'console_scripts': [ - 'mavis = mavis.main:main', - 'calculate_ref_alt_counts = tools.calculate_ref_alt_counts:main', - ] - }, - include_package_data=True, - data_files=[('mavis', ['src/mavis/schemas/config.json', 'src/mavis/schemas/overlay.json'])], - project_urls={'mavis': 'http://mavis.bcgsc.ca'}, -) +setup() check_nonpython_dependencies() From 
a646440e548775d86efa6756def4c1448a4a5d14 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 22:33:57 -0800 Subject: [PATCH 061/137] Swap maintainer to mavis email alias --- setup.cfg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 2459069c..c8f751e8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,9 +6,9 @@ download_url = https://github.com/bcgsc/mavis/archive/v2.2.10.tar.gz description = A Structural Variant Post-Processing Package author_email = creisle@bcgsc.ca author = Caralyn Reisle -maintainer_email = creisle@bcgsc.ca -maintainer = Caralyn Reisle -long_description = file: README.md, LICENSE +maintainer_email = mavis@bcgsc.ca +maintainer = mavis +long_description = file: README.md long_description_content_type = text/markdown license_file = LICENSE project_urls = mavis = http://mavis.bcgsc.ca From 0970918a4770c9765ae11cb4f58425bc3290a62f Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 22:42:41 -0800 Subject: [PATCH 062/137] Specify src dir for install --- setup.cfg | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index c8f751e8..8b1a07da 100644 --- a/setup.cfg +++ b/setup.cfg @@ -29,7 +29,9 @@ ignore = E501 [options] packages = find: -python_requires = >=3.2 +package_dir = + = src +python_requires = >=3.7 dependency_links = [] include_package_data = True install_requires = @@ -48,8 +50,8 @@ setup_requires = setuptools>=36.0.0 [options.packages.find] -exclude = - tests +exclude = tests +where = src [options.extras_require] doc = From 2cf3065ab69b99004cc3590f084c3223706e952f Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 28 Dec 2021 22:47:03 -0800 Subject: [PATCH 063/137] Add v3 dependencies --- setup.cfg | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 8b1a07da..b5cfe3a3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,14 +35,16 @@ python_requires = >=3.7 dependency_links 
= [] include_package_data = True install_requires = - Distance>=0.1.3 - Shapely>=1.6.4.post1 biopython>=1.70, <1.78 braceexpand==0.1.2 colour + Distance>=0.1.3 + mavis_config>=1.1.0, <2.0.0 networkx>=2.5,<3 numpy>=1.13.1 + pandas>=1.1, <2 pysam + Shapely>=1.6.4.post1 shortuuid>=0.5.0 svgwrite setup_requires = From f16b17492371557bedd725d93e5ad4e139cac0b7 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 4 Jan 2022 11:34:57 -0800 Subject: [PATCH 064/137] Add typing extensions as dependency --- setup.cfg | 1 + src/mavis/tools/vcf.py | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index b5cfe3a3..a34fc7ef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,6 +47,7 @@ install_requires = Shapely>=1.6.4.post1 shortuuid>=0.5.0 svgwrite + typing_extensions>=4 setup_requires = pip>=9.0.0 setuptools>=36.0.0 diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index 77a2e22a..f87aa764 100644 --- a/src/mavis/tools/vcf.py +++ b/src/mavis/tools/vcf.py @@ -5,7 +5,12 @@ import pandas as pd from pysam import VariantFile -from typing_extensions import TypedDict + +try: + # TypedDict added to typing package directly in later versions + from typing import TypedDict +except ImportError: + from typing_extensions import TypedDict from ..constants import COLUMNS, ORIENT, SVTYPE from ..util import DEVNULL From 967bdb481b49001ba62a1d7473e45b1debd8dd91 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 4 Jan 2022 11:50:34 -0800 Subject: [PATCH 065/137] Try being more explicit about json in manifest --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index c1af92d1..691ef59e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ recursive-include src *.py *.json +include src/mavis/schemas/*.json include README.md include LICENSE prune docs From c3363cc55072035bf3754279f4571e136647b3a0 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 4 Jan 2022 12:16:14 -0800 Subject: [PATCH 
066/137] Copy manifest and pyproject.toml to docker container --- Dockerfile | 2 ++ MANIFEST.in | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index d3fbd1f4..b62ea761 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,6 +28,8 @@ RUN wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat && \ COPY setup.py setup.py COPY setup.cfg setup.cfg +COPY MANIFEST.in MANIFEST.in +COPY pyproject.toml pyproject.toml COPY src src COPY LICENSE LICENSE COPY README.md README.md diff --git a/MANIFEST.in b/MANIFEST.in index 691ef59e..c1af92d1 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,4 @@ recursive-include src *.py *.json -include src/mavis/schemas/*.json include README.md include LICENSE prune docs From 86b046fe9f81e4a5c2ed5e2621093e0f36276ed4 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 4 Jan 2022 12:42:14 -0800 Subject: [PATCH 067/137] Create editable copy of subgraph In networkx v2 subgraphs are frozen and must be copied to be edited --- src/mavis/assemble.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mavis/assemble.py b/src/mavis/assemble.py index 0370ea01..0683dbad 100644 --- a/src/mavis/assemble.py +++ b/src/mavis/assemble.py @@ -418,7 +418,7 @@ def assemble( # pull the path scores path_scores.update( pull_contigs_from_component( - assembly.subgraph(component), + assembly.subgraph(component).copy(), component, min_edge_trim_weight=min_edge_trim_weight, assembly_max_paths=assembly_max_paths, From ca06372fe0d88b257981632b34fb52af4c54420d Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 4 Jan 2022 13:04:10 -0800 Subject: [PATCH 068/137] Increase assembly timeout to account for new graph copy requirement --- tests/integration/test_assemble.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_assemble.py b/tests/integration/test_assemble.py index 8cae4394..6930b685 100644 --- a/tests/integration/test_assemble.py +++ 
b/tests/integration/test_assemble.py @@ -351,7 +351,7 @@ def test_multiple_events(self): assert assemblies[0].seq == expected assert len(assemblies) == 1 - @timeout_decorator.timeout(300) + @timeout_decorator.timeout(600) @long_running_test def test_large_assembly(self, large_assembly_seq): # simply testing that this will complete before the timeout From 94b7630c2d7629dab482fc1495342fbafec6672b Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 4 Jan 2022 23:41:47 -0800 Subject: [PATCH 069/137] cleaned up vcf.py and added unit tests for sniffles and cuteSV --- src/mavis/tools/vcf.py | 1 - tests/data/cuteSV.vcf | 326 ++++++++++++++++++++++++++++ tests/data/sniffles.vcf | 355 +++++++++++++++++++++++++++++++ tests/end_to_end/test_convert.py | 14 ++ 4 files changed, 695 insertions(+), 1 deletion(-) create mode 100644 tests/data/cuteSV.vcf create mode 100644 tests/data/sniffles.vcf diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index f87aa764..eea0fadf 100644 --- a/src/mavis/tools/vcf.py +++ b/src/mavis/tools/vcf.py @@ -4,7 +4,6 @@ from typing import Dict, List, Optional, Tuple import pandas as pd -from pysam import VariantFile try: # TypedDict added to typing package directly in later versions diff --git a/tests/data/cuteSV.vcf b/tests/data/cuteSV.vcf new file mode 100644 index 00000000..d95fb3f4 --- /dev/null +++ b/tests/data/cuteSV.vcf @@ -0,0 +1,326 @@ +##fileformat=VCFv4.2 +##FILTER= +##source=cuteSV-1.0.11 +##fileDate=2021-06-18 18:52:00 5-PDT +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##CommandLine="cuteSV --max_cluster_bias_INS 100 --diff_ratio_merging_INS 0.3 --max_cluster_bias_DEL 100 --diff_ratio_merging_DEL 0.3 -t 72 -s 
5 --max_split_parts 30 --report_readid --genotype /projects/jfan_prj/jfan_prj/Nanopore_Testing/2021_nanopore_sv_testing/scratch/depth_testing/POG/COLO829/minimap2_bam/F24721_merged_sorted.bam /projects/jfan_prj/jfan_prj/Nanopore_Testing/2021_nanopore_sv_testing/scratch/depth_testing/POG/COLO829/hg38_no_alt_phage_lambda.fa F24721_merged_sorted.bam_5_read.vcf F24721_merged_sorted.bam" +##bcftools_viewVersion=1.11+htslib-1.11 +##bcftools_viewCommand=view --regions chr1 F24721_merged_sorted.bam_5_read.vcf.gz; Date=Tue Jan 4 22:11:46 2022 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NULL +chr1 10011 cuteSV.BND.0 N [chr17:41490824[N 0.1 q5 IMPRECISE;SVTYPE=BND;RE=6;RNAMES=ee2ad47c-5065-4825-9697-1aab02c409eb,debd3e06-6b28-4e12-b5a5-2064270d20e7,533ec7a2-4008-4148-87c7-5c5e9a6a7b05,0195fcbc-037a-4026-917c-768b3effe0b1,0163af83-26fa-455b-a6ee-b1070203087a,d8d2eb1f-41b7-4d71-89bd-a78ed9de2a66 GT:DR:DV:PL:GQ 0/0:24:6:0,19,172:19 +chr1 10027 cuteSV.BND.1 N [chr3:198172735[N 0 q5 IMPRECISE;SVTYPE=BND;RE=7;RNAMES=4968b938-ac4d-4e30-90a5-5838c58fad34,1553213b-5917-497f-be48-78dee4b1c26b,d60daf48-6cf9-488d-b9b0-3a4a18cc0340,568571dd-4342-4a25-bdc9-552213bf3765,77368f76-6a14-4bf7-a9fe-f0dede9c19ff,86dfbc36-4f4d-47e0-b9b8-6b38bc96dc72,1f0a2c70-15ac-4553-ae9e-bb7277d2f987 GT:DR:DV:PL:GQ 0/0:28:7:0,23,200:22 +chr1 10468 cuteSV.BND.2 N N]chr17:41490879] 0.1 q5 IMPRECISE;SVTYPE=BND;RE=6;RNAMES=620955cc-9f3e-4849-b2a6-9f20c2550a73,d84b9ae7-9ed2-4e60-8d4f-1e31d4a2e362,88393d96-032b-4781-95e8-900c7c8e2d3c,efb016bc-90bb-49bc-82a1-28bd881f602f,15a3c0f0-28aa-4638-aa6e-bec6b81e1188,12b2ee2b-f19d-4bd5-9b63-fe8c4310b3ee GT:DR:DV:PL:GQ 0/0:24:6:0,19,172:19 +chr1 10469 cuteSV.BND.3 N N]chrX:156030800] 0.1 q5 IMPRECISE;SVTYPE=BND;RE=6;RNAMES=1cde1b1c-7d45-48da-a8e0-78edcae385d1,1553213b-5917-497f-be48-78dee4b1c26b,78301a41-17b9-4c18-92e0-1a2b3dbf2e38,f9aa8b4b-db03-4e80-aef8-fead40f28940,99568c1f-8dec-4d4b-a9e0-404c677aeef5,c2820162-0910-4105-90eb-4abb80bf1b5a GT:DR:DV:PL:GQ 0/0:24:6:0,19,172:19 
+chr1 35143 cuteSV.BND.4 N N[chr20:60000[ 0.2 q5 IMPRECISE;SVTYPE=BND;RE=6;RNAMES=3723fb60-862e-4a3e-804b-744b3f741267,4f2f6804-bf08-4527-8a19-18dba1b89cf1,b2b644f7-d646-4f3b-b262-bbe8880eb088,74860d07-8b65-4643-8881-d64b4ef8b31b,5f6c3321-121c-471b-be42-36559a25227f,fb776225-90b5-4063-9528-0a8623d3297e GT:DR:DV:PL:GQ 0/0:22:6:0,14,153:14 +chr1 136878 cuteSV.INS.5 G GAGTGTGGAGGGCCGGTGTGAGTAAGGCTCACGCTGACCTCTGTCCGCGAGGGCCGGTGCAGACAAGGGGCCCGGCTGACCTCTCTCAGCTGGGAGGGCCAGCAGAAAGCAAGGCTCACACTGACCTCTCTCAGCATGGAGAGGGCCTGGTGTGAGA 190.8 PASS PRECISE;SVTYPE=INS;SVLEN=156;END=136878;CIPOS=-72,72;CILEN=-33,33;RE=20;RNAMES=25796517-ff72-4482-b6c0-2aa7de0174ce,c58bf71a-cc0c-4268-8d89-a2ca5fdb4d9c,f2228c6c-d17a-4262-b2c0-4971299a4e24,0985e9ba-ea9b-40ab-b4da-d1ed9544fb8e,a3395cec-d8bf-486b-9599-7547f716cd59,f8b67d43-8045-461f-ada1-969d92564c23,4b73faff-93bd-4af7-adcd-d96abc1a3b28,3642879a-10de-45c3-9d93-02a5063d3a8c,99251e1c-7d12-4db5-841f-1bf324dfcc15,9ee38079-241a-4368-805f-ba5dc55ceb68,91a0c677-b499-48db-a6f7-081686ec0420,b22476b1-d308-4bb6-8d73-49fafbcf0ee2,37f73b02-d397-489b-ab42-f27037974ec4,8890ae37-c248-4c21-9763-5d6dcf8f7774,97ee061a-3254-45ac-b6e5-3e786ec8450f,81ab4c6a-f5fd-4793-bb3c-fffff54b38ac,580f9c90-35f2-4c5a-b2e8-6e829ff63b20,82344b3a-4a3e-4eaf-852f-23d5706d97b2,9c8a04c0-2b21-4943-a19d-e806a598087a,0ae4e7a8-5a98-4cf8-8220-a546796f6814 GT:DR:DV:PL:GQ 1/1:0:20:191,51,0:51 +chr1 180090 cuteSV.BND.5 N [chr3:198172703[N 0 q5 IMPRECISE;SVTYPE=BND;RE=5;RNAMES=9d16f233-21de-4067-aff5-4002f8b66bfe,3e609e79-bf43-495f-abc5-589e153a386c,30da9670-d745-42cd-8a23-4ab34b593e92,5231d16f-9701-43a6-89da-319ee2ddf6e0,0053b59d-06cc-4ee6-ba75-80d244838eef GT:DR:DV:PL:GQ 0/0:25:5:0,29,191:28 +chr1 180785 cuteSV.INS.6 A ACCACCCTCCCTTTCCACTGCCCTAACCCACCTCACCCTCTGCCCTCACCCTCACCTCACCC 49.2 PASS 
PRECISE;SVTYPE=INS;SVLEN=61;END=180785;CIPOS=-71,71;CILEN=-17,17;RE=7;RNAMES=e3eb0785-89bc-47fe-b010-5507ded6f073,445e45f8-b550-4995-83c5-4101693ca335,97f41f36-3bef-40ad-b76d-86ebb2d4501d,c1433f2c-0190-405a-b4f0-de89b0361a12,80ad0bf2-d144-4b4e-9e93-6dd5e31da974,9d16f233-21de-4067-aff5-4002f8b66bfe,0053b59d-06cc-4ee6-ba75-80d244838eef GT:DR:DV:PL:GQ 1/1:2:7:49,5,1:5 +chr1 181215 cuteSV.DEL.2 AGGCGCAGAGAGGCGCGCCTCGCCGGCGCAGGCGCAGAGAGGCGCGCCGGGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGACACATGCTAGCGCGTCCAGGGGAGGAGGCGTGGCACAGGCGCAGAGACACATGCTAGCGCGCCCAGGGGAGGAGGCGTGGCGCAGGCGCAGAGAGGCGCGCCGTGCTGCCGC A 0 q5 IMPRECISE;SVTYPE=DEL;SVLEN=-238;END=181453;CIPOS=-4,4;CILEN=-1,1;RE=11;RNAMES=c1433f2c-0190-405a-b4f0-de89b0361a12,dc30defc-03f2-4e71-80f0-4cb7ce0c9ded,607fca6a-e4b6-4c1b-b14a-eaa480324b1a,803b38ee-2f2f-472c-b1b3-d1ddd8348fe6,bd08e14e-77b6-4bf5-9f69-8293094292b0,0cc7342e-adaa-4409-a1ec-1a1995a4284a,2be11552-cfea-4e6e-ad37-6b2d629ae1de,f64b79a5-cd0f-42cb-8d25-2c3425a0f91d,6f23d6fc-a187-466e-8bcb-ce6a6bf5eea1,592065f2-2bf2-4b38-b6d3-5cea446e07c0,f6549ae3-8ffe-4eda-be96-82ff76067a4c;STRAND=+- GT:DR:DV:PL:GQ 0/0:41:11:0,28,286:27 +chr1 181229 cuteSV.INS.7 G GCGCAGGGCGCGCGCGCGCGCAGCGCAGAGAGCGCGCGGCGCAGCACCGGCGCAGCGCCGCGCAGCGCAGCGCGCAGAGGGCGCGCCCCCGCGCGCGCA 161.2 PASS 
PRECISE;SVTYPE=INS;SVLEN=98;END=181229;CIPOS=-18,18;CILEN=-12,12;RE=30;RNAMES=1cee1d17-283c-4da6-8235-f37bf60b4c07,3e609e79-bf43-495f-abc5-589e153a386c,445e45f8-b550-4995-83c5-4101693ca335,9d16f233-21de-4067-aff5-4002f8b66bfe,cb388ae1-da56-4217-92f9-ff5c1d0f38f3,598e10ed-9e0f-4b91-9321-cf49d264894a,f2447712-6638-4688-8634-6456e3381ab1,b8c1d59c-7bf5-4771-a26d-18ff414db526,153bb516-869b-4774-922a-ac7a3bfb3819,8635611c-116a-4462-a83d-2ed4adf3d48b,6977b0a3-c47e-4c52-becf-86060ca26a56,71bacfe7-4813-4186-90ed-c5c3c11ceac9,2400200b-b619-4e31-a31f-d5ca55fe43b0,a3eefdd5-15ef-496c-80d1-1c648cb3a460,333b6845-d254-4729-85c6-deff57311b89,5231d16f-9701-43a6-89da-319ee2ddf6e0,50ccacdc-8edc-4e6d-b0aa-f5d572119e74,64ccea5c-2f30-4678-bc7d-d4c38e8b78c7,30da9670-d745-42cd-8a23-4ab34b593e92,34ee1f1f-8477-4f30-b5b6-b8ba0d69fd0e,7bfe3eda-c76e-4d8a-8087-decae5495eb5,01e20e44-6a4f-46cd-9e47-21e4672bd6a1,ccee1775-5c00-4eac-981c-a36ea051336e,b899737e-2777-4b81-a079-f006d634cf81,eea77227-1562-4bb0-9ee6-b1c56c82d8cb,055a3afd-77bc-4577-a665-378d54328dee,4e248870-b369-48c0-9e31-1da436de798d,ce435c54-3e21-464c-9522-4a19bf9ed9ed,97f41f36-3bef-40ad-b76d-86ebb2d4501d,e3eb0785-89bc-47fe-b010-5507ded6f073 GT:DR:DV:PL:GQ 0/1:19:30:161,0,56:56 +chr1 257667 cuteSV.BND.6 N [chr5:181462058[N 0 q5 IMPRECISE;SVTYPE=BND;RE=5;RNAMES=09cdbd93-aa21-4979-8110-75c36774feb8,5204edda-4746-4ea2-8928-dbcf3c36a0ea,15190a47-b61a-4cbe-9183-342e97bed9f7,4ce7a619-865d-4cf2-9b05-6b8b27acdf9e,9992b757-2b9c-4e98-a0ec-14e3605ab7a8 GT:DR:DV:PL:GQ 0/0:25:5:0,29,191:28 +chr1 350807 cuteSV.INS.8 A AACTCACTGAAGGTGGAGGGAAAATGGTGTTGACCTAAG 32 PASS PRECISE;SVTYPE=INS;SVLEN=38;END=350807;CIPOS=0,0;CILEN=-1,1;RE=6;RNAMES=018ed570-e6ef-4162-8426-bccc94c4e150,b2f4c3c1-f080-4a1f-a536-3c22212f9dcd,da720310-699e-4889-ab3b-a3a01122d079,31af45fc-5610-4010-8c97-337ba5b2f823,a25c7fbb-74e3-4c57-985c-159e0e233a27,522ac18e-d321-4dde-b480-3704593b22d2 GT:DR:DV:PL:GQ 0/1:4:6:32,0,13:12 +chr1 368928 cuteSV.DEL.3 
CAGCTCACGGTGTGGAAACTGCGACACTCACGTGGGTGCCATCTCAGCAGCTCACGGTGTAGAAACTGCGACACTCCCATGGGTGCCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACACGGGTGCCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACACGGGTGCCATCTCAGCAGCTCACGGTGTAGAAACTGCGACACTCCCATGGGTGCCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACACGGGTGCCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACACG C 5.6 PASS PRECISE;SVTYPE=DEL;SVLEN=-315;END=369243;CIPOS=-55,55;CILEN=-41,41;RE=9;RNAMES=f770c333-5b0a-4b62-af96-3f7ab5daa662,d826c547-1c7c-4d2f-a379-6b6ee1c5425a,31af45fc-5610-4010-8c97-337ba5b2f823,b2f4c3c1-f080-4a1f-a536-3c22212f9dcd,bcad7e7f-53cc-4b01-a0ff-82552cf16d8f,522ac18e-d321-4dde-b480-3704593b22d2,11b4adf7-f440-4ffb-848d-3bb844f9c5dc,a25c7fbb-74e3-4c57-985c-159e0e233a27,da720310-699e-4889-ab3b-a3a01122d079;STRAND=+- GT:DR:DV:PL:GQ 0/1:23:9:6,1,139:5 +chr1 372668 cuteSV.DEL.4 TAACCGACCACCTTAGGGTCCATTCTGATCTGTATATATGTATAATATATATTATATATGGACCTCAGGGTCCATTCTGATCTGCATATATGTATAATATATATTATATATGGTCCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACATGAGTTCCTGCTGGCATATCTGTCT T 7.6 PASS PRECISE;SVTYPE=DEL;SVLEN=-181;END=372849;CIPOS=-41,41;CILEN=-2,2;RE=9;RNAMES=018ed570-e6ef-4162-8426-bccc94c4e150,da720310-699e-4889-ab3b-a3a01122d079,d826c547-1c7c-4d2f-a379-6b6ee1c5425a,31af45fc-5610-4010-8c97-337ba5b2f823,a25c7fbb-74e3-4c57-985c-159e0e233a27,6433ec8d-025c-403f-8d9f-6ed36291d563,522ac18e-d321-4dde-b480-3704593b22d2,f770c333-5b0a-4b62-af96-3f7ab5daa662,b2f4c3c1-f080-4a1f-a536-3c22212f9dcd;STRAND=+- GT:DR:DV:PL:GQ 0/1:22:9:8,1,132:7 +chr1 374047 cuteSV.INS.9 C CCCCCCTCTCCTTTCTCCTCTCCATCCCCCCTCTCCATCTCCTCTCCTTTCTCCTCTCTCGCCCCCTCTCCTTTCTCCCTCTCTATCCCCCTCTCCTTTCTCCCTCTCTCCCCCTCTCCTTTCTCCTCTCCATCCCCTCTCCATCCCCCTCTCCATCTCCTCTCCTTTCTCCTCTCTAGCCCCTCTCCTTTCTCTCTCCTCCCCCTCTC 24 PASS 
PRECISE;SVTYPE=INS;SVLEN=208;END=374047;CIPOS=-33,33;CILEN=-12,12;RE=10;RNAMES=2025839f-8c5c-44ad-939d-e173d15a60ba,1ca42b59-8622-43bf-ab89-8dcdba70a3ef,da720310-699e-4889-ab3b-a3a01122d079,018ed570-e6ef-4162-8426-bccc94c4e150,f770c333-5b0a-4b62-af96-3f7ab5daa662,31af45fc-5610-4010-8c97-337ba5b2f823,d826c547-1c7c-4d2f-a379-6b6ee1c5425a,522ac18e-d321-4dde-b480-3704593b22d2,b2f4c3c1-f080-4a1f-a536-3c22212f9dcd,a25c7fbb-74e3-4c57-985c-159e0e233a27 GT:DR:DV:PL:GQ 0/1:18:10:24,0,100:23 +chr1 598067 cuteSV.INS.10 A ACAGCAGCTCATGGTGGAAACTGCGACACTCACACGGGTGCCATCTCAGCAGCTCACGGTGGAACTGCGACACTCACACGGGTGCCATCTCAGCAGCTCACGGTGGAAACTGCGACACTCACGTCGTGGTGCCGTCTCAGCAGCTCACGGTGTGGAAACTGCGAAGACCTCAGCGGGTGCCGTCTCCGCCCCCCAGCAGCTCACGGTGTGGAAACTGCGACACTCACGCGGGCAGGTGCCGTTCCCTCAGCAGCTCACGGAAACTGCGACACCACGCGGGTGCCGGCCTCAGCAGCTCACGGTGGAA 2.6 q5 IMPRECISE;SVTYPE=INS;SVLEN=306;END=598067;CIPOS=-187,187;CILEN=-44,44;RE=5;RNAMES=5f2941d5-b81b-43e0-8fbf-40f994c61e82,e5eafbed-6121-4116-87c6-5fcea74eca7c,4d65b4bf-fd57-442c-b4a2-60854df8bb12,6e49ca59-0cc7-4d01-ba69-dd67194f9cb0,0c83858c-eba8-4369-aab3-62ee10f058a2 GT:DR:DV:PL:GQ 0/0:14:5:3,3,89:3 +chr1 606602 cuteSV.DEL.5 GTCAGAGCTGTCCTGGGTCAGAGCTGCCCATGG G 28.4 PASS PRECISE;SVTYPE=DEL;SVLEN=-32;END=606634;CIPOS=-2,2;CILEN=0,0;RE=11;RNAMES=e3961b9e-e239-4d5e-853d-074ce238ba5a,9caef6b7-8947-4df9-8c86-78bdf7b4c050,1a51cfde-0cea-4573-a6ec-ddc8c363913f,3e4d6ee4-a0ec-4624-a4f9-1bc096415618,60af2d8e-6533-4761-a57f-9f5bab88f59b,d7b0fc8f-8fd3-4e73-97fb-de5d4ddd60c6,db625de5-0805-43b2-b461-e64674fec855,dba99022-afad-445d-a6f2-8cba1ad2d85a,dd79d67c-2b2b-468c-a9f4-ffc9e3713644,e6343866-bdcb-4e35-9928-d6a90089192b,83f5c1b9-f820-43f8-809d-453c6d2028ee;STRAND=+- GT:DR:DV:PL:GQ 0/1:19:11:28,0,105:28 +chr1 609585 cuteSV.DEL.6 CTGTGGCCAGCAGGCGGCGCTGCAGGAGAGGAGATGCCCAGGCCTGGCGGCACACGCGGGTTCT C 23.3 PASS 
PRECISE;SVTYPE=DEL;SVLEN=-63;END=609648;CIPOS=-19,19;CILEN=-1,1;RE=11;RNAMES=874f43d4-264a-45ef-8668-1550210369a2,dba99022-afad-445d-a6f2-8cba1ad2d85a,9caef6b7-8947-4df9-8c86-78bdf7b4c050,0b573944-2eb4-467a-b944-dddce54e418b,ddd71e0f-7905-4685-b7c0-05d7f216243f,e6343866-bdcb-4e35-9928-d6a90089192b,1a51cfde-0cea-4573-a6ec-ddc8c363913f,e3961b9e-e239-4d5e-853d-074ce238ba5a,60af2d8e-6533-4761-a57f-9f5bab88f59b,dc1e925e-5165-455b-a29f-5dda1560ad67,dd79d67c-2b2b-468c-a9f4-ffc9e3713644;STRAND=+- GT:DR:DV:PL:GQ 0/1:21:11:23,0,119:23 +chr1 609935 cuteSV.DEL.7 CGGTGCTGCAGGAGAGGAGATGCCCAGGCCTGGCGGCCGGCGCACGCGGGTTCTCTGTGGCCAGC C 1.1 q5 IMPRECISE;SVTYPE=DEL;SVLEN=-64;END=609999;CIPOS=-60,60;CILEN=-1,1;RE=8;RNAMES=0b573944-2eb4-467a-b944-dddce54e418b,db625de5-0805-43b2-b461-e64674fec855,83f5c1b9-f820-43f8-809d-453c6d2028ee,9caef6b7-8947-4df9-8c86-78bdf7b4c050,ddd71e0f-7905-4685-b7c0-05d7f216243f,dd79d67c-2b2b-468c-a9f4-ffc9e3713644,1a51cfde-0cea-4573-a6ec-ddc8c363913f,d7b0fc8f-8fd3-4e73-97fb-de5d4ddd60c6;STRAND=+- GT:DR:DV:PL:GQ 0/0:24:8:1,6,154:6 +chr1 610369 cuteSV.DEL.8 AGGCGGCCGGCGCACGCGGGTTCTCTGTGGCCAGCAGGCGGTGCTGCAGGAGAGGAGATGCCCAGGCCTGGCGGCCGGCGCACGCGGGTTCTCTGTGGCCAGCAGGCGGCGCTGCAGGAGAGGAGATGCCC A 0.1 q5 IMPRECISE;SVTYPE=DEL;SVLEN=-130;END=610499;CIPOS=-30,30;CILEN=-3,3;RE=5;RNAMES=83f5c1b9-f820-43f8-809d-453c6d2028ee,874f43d4-264a-45ef-8668-1550210369a2,60af2d8e-6533-4761-a57f-9f5bab88f59b,e3961b9e-e239-4d5e-853d-074ce238ba5a,dba99022-afad-445d-a6f2-8cba1ad2d85a;STRAND=+- GT:DR:DV:PL:GQ 0/0:20:5:0,16,143:16 +chr1 610490 cuteSV.DEL.9 GGAGATGCCCAGGCCAGGCGGCCGGCGCACGCGGGTTCTCTGTGGCCAGCAGGCGGTGCTGCAGGAG G 0.8 q5 IMPRECISE;SVTYPE=DEL;SVLEN=-66;END=610556;CIPOS=-57,57;CILEN=-2,2;RE=7;RNAMES=0b573944-2eb4-467a-b944-dddce54e418b,9caef6b7-8947-4df9-8c86-78bdf7b4c050,3e4d6ee4-a0ec-4624-a4f9-1bc096415618,1a51cfde-0cea-4573-a6ec-ddc8c363913f,58c69596-3118-4b49-91eb-0b5ad9cc07b0,ddd71e0f-7905-4685-b7c0-05d7f216243f,dd79d67c-2b2b-468c-a9f4-ffc9e3713644;STRAND=+- GT:DR:DV:PL:GQ 
0/0:22:7:1,8,144:7 +chr1 610569 cuteSV.INS.11 G GTTGGCCTGTGAGGTTCTCTGTGGCCAGCAGGCGGCGCTGCAGGGAGGGTACCCAGGCCTGGCGGC 2.9 q5 IMPRECISE;SVTYPE=INS;SVLEN=65;END=610569;CIPOS=-70,70;CILEN=-3,3;RE=8;RNAMES=569f47cd-bf9a-4cda-bd5b-4b855b1a191f,a1b31d74-bce7-4754-84d7-d3e0c9ba4cc1,a90309df-44df-4a58-bd90-a6b6a40d319e,18f659db-5ad3-4c14-99b4-8bc6eb6371ff,bcd37f7e-7f75-4374-8969-c5c5b6847ef1,037d7275-443d-4952-a66f-08e5b32a531e,3d21c734-2938-4a1e-a06c-95f413f7729e,b534f964-ed10-4a02-98c2-5562bf4f6853 GT:DR:DV:PL:GQ 0/0:22:8:3,3,136:3 +chr1 611307 cuteSV.DEL.10 GTGGGTGTGACAGGGTGTGTTCTGTGTGAGAACATGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCTGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGATGGGGTGTGTGCTGTGTGAGAACGTGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCTGTGTGTGATGTTGTGTTCTTGGTGTGAGTTCATGGGTGTGACGGGGTGTGCTGTGTGAGAACGTGTGTGTAGTGTTCACATGTCCTCTGTGCGTGAGTCCCCGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGACGGGGTGTGTGCTGTGTGAGAACGTGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCCGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGACGGGGTGTGTGCTGTGTGAGAACGTGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCTGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGACGGGGTGTGTGCTGTGTGAGAACGTGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCCGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGACGGGGTGTGTGCTGTGTGAGAACGTGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCCGTGTGTGATGTTGTGTTCTCGGTGTGAGTTC G 1.8 q5 IMPRECISE;SVTYPE=DEL;SVLEN=-725;END=612032;CIPOS=-1,1;CILEN=-1,1;RE=8;RNAMES=dba99022-afad-445d-a6f2-8cba1ad2d85a,db625de5-0805-43b2-b461-e64674fec855,ddd71e0f-7905-4685-b7c0-05d7f216243f,3e4d6ee4-a0ec-4624-a4f9-1bc096415618,874f43d4-264a-45ef-8668-1550210369a2,dc1e925e-5165-455b-a29f-5dda1560ad67,1a51cfde-0cea-4573-a6ec-ddc8c363913f,d7b0fc8f-8fd3-4e73-97fb-de5d4ddd60c6;STRAND=+- GT:DR:DV:PL:GQ 0/0:23:8:2,5,145:4 +chr1 626790 cuteSV.INS.12 T TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG 0.1 q5 
IMPRECISE;SVTYPE=INS;SVLEN=33;END=626790;CIPOS=-13,13;CILEN=-1,1;RE=5;RNAMES=03da607d-ec0b-4c16-af37-c7bffd0fde4f,701d440d-d52b-477c-a132-204fc867a873,26f3608e-9c4c-4bf9-bebb-bab47529ad41,5bc4ad69-db16-48c0-9c91-9d61c80e47ea,7fb7bf07-820d-4347-97b9-46e8d81505a0 GT:DR:DV:PL:GQ 0/0:21:5:0,19,153:18 +chr1 744867 cuteSV.INS.13 G GATATATATATATATATATATATATATATATATATA 0.9 q5 IMPRECISE;SVTYPE=INS;SVLEN=35;END=744867;CIPOS=0,0;CILEN=-3,3;RE=6;RNAMES=428523af-d83b-4029-9ba6-02b2e053dad8,69c14cf9-ae64-4646-9a67-871e6ae789c8,9e8e62e7-3bef-4120-b1ed-db57e7644483,c9073453-c215-4531-a6f5-bd03a20d3303,9d7cd991-e98d-4774-af05-9da0fc07f75d,ce2a8020-a552-477f-9df5-b7d97f0a1078 GT:DR:DV:PL:GQ 0/0:19:6:1,7,125:7 +chr1 814585 cuteSV.INS.14 A AAAAAGATGTGAGAACCTATTTTCAGATGTATTTCCCTTCTAAATATCTAACACAACACACAGAAGAGAAAGTCAAGTCAATTTTACATATAAGTT 132 PASS PRECISE;SVTYPE=INS;SVLEN=95;END=814585;CIPOS=-12,12;CILEN=-2,2;RE=24;RNAMES=f8d73f18-e9e3-4117-84b3-25f6cbcccc07,71e7628e-be56-4af8-8cde-d459504c90b8,20f3552c-6685-4f0d-a9f3-b2bced8bdfbf,dea6d8a4-d4a4-46ff-bc8a-29f398b22dd1,748b49cc-0b65-4897-becf-ebe5de749e4c,68668099-f1bd-40fe-ba1b-375c60d26882,27e89a23-a2bd-4eb7-8f63-8be09957d3c5,fc6dd300-5ee5-4cd2-93fa-837c6a4be50a,7a47a58a-763b-4bdb-b38e-9a5b15ca84d9,0f47bd21-3cf1-45e1-a4dd-b3568472f922,35cc5f0c-2f47-44bd-bbc2-748fffaa3548,5a1ac598-afc9-4de2-b58e-6c7766d58dd3,1fc46ec8-a4bc-4c5a-a723-1dc6666ff1e2,3234576d-19b1-47a8-8f6f-33c9ec8a3154,c9aba122-a878-4883-8df2-ba537038e718,249f7119-afa9-400d-802f-b3dedec8dbb7,3112a6c0-ca14-42f1-b167-31cd65936400,c6859633-f656-4ffb-a43c-310e3568d902,2f5c7650-f34e-4dd6-914e-82f56b9da016,7661c62d-827c-49a7-8929-e47f1a2f72bd,d4ccaa89-de65-476c-9724-ff94f9eba74f,6d04f2bb-637c-4d61-b504-201d582a7d9b,22ed4c8d-bcf3-44b1-a3ac-6e1670a1ac02,1ddd235d-02fa-4705-80f9-544ffbf76a46 GT:DR:DV:PL:GQ 0/1:14:24:132,0,37:36 +chr1 820896 cuteSV.INS.15 T 
TCTACACTGCCTGGGCAGTAGTTCACGCAATCTCCCCTACCTGCCTCCTCCTTGAACCAGCCCTATCTATACTACTTGCCTGTCCAGCAGATCCACTCTATCTACACGACCTGCCTGTCCAACAGATCCACCCTGTCTACACTACCTGCTTGTCCAGCAGGTCCACCCTGTCTATACTACCTGCCTGGCCAGTAGATCCACACTATCTACACTGCCTGCCACAGCCAAGATCCACCCTGTCTAC 69 PASS PRECISE;SVTYPE=INS;SVLEN=243;END=820896;CIPOS=-10,10;CILEN=-2,2;RE=19;RNAMES=7d52fe89-e304-42a2-b483-33b0ff06ccbc,0a0295eb-c248-4bed-8d45-06bfa3a76406,5af3910e-b9f4-451a-8710-917ca5fa90b8,cc4589e4-36c0-4e71-8151-b61449bba14a,2bc25ca5-c598-4437-b822-36903e39cd43,c0757315-cc54-4e20-abfc-0292cd3e588b,60f7349e-a821-4ce7-8442-5771681f63b5,428aa746-97d1-4764-95e3-6dd4e6867aee,ed57ee60-11d6-47ac-a344-c795f4f4c269,8783cda6-9447-4cf3-8da0-23207f536755,1a85a944-e29e-4900-94ac-0db738749b1f,236d703b-56fc-4715-9fd3-7d0bad6fb350,bb9b2ee4-a208-479f-87a1-82390847b380,80431cc1-baa5-4287-9918-5269169e50e9,6ba9a389-ec6c-4aba-9462-72e2aaa04ecb,e46d6475-fcbb-4b6e-b8c4-faf7026366dd,13748a33-386b-4d95-af9c-8e12fcd16ec2,e741df71-7c61-4b64-b810-fed29fcf9029,11e05e53-15f3-4fee-a1f5-0169b9a4e390 GT:DR:DV:PL:GQ 0/1:25:19:69,0,126:68 +chr1 820911 cuteSV.DEL.11 CCCTGTCTACACTACCTGCTTGTCCAGCAGGTCCAC C 70.9 PASS PRECISE;SVTYPE=DEL;SVLEN=-35;END=820946;CIPOS=-3,3;CILEN=-1,1;RE=20;RNAMES=dea6d8a4-d4a4-46ff-bc8a-29f398b22dd1,c6859633-f656-4ffb-a43c-310e3568d902,1ddd235d-02fa-4705-80f9-544ffbf76a46,a0d6875e-4ed6-4fd8-8546-73e206e1b988,d4ccaa89-de65-476c-9724-ff94f9eba74f,748b49cc-0b65-4897-becf-ebe5de749e4c,3234576d-19b1-47a8-8f6f-33c9ec8a3154,0f47bd21-3cf1-45e1-a4dd-b3568472f922,249f7119-afa9-400d-802f-b3dedec8dbb7,35cc5f0c-2f47-44bd-bbc2-748fffaa3548,6baa71b3-614c-48cc-80e1-5d35913a2176,7661c62d-827c-49a7-8929-e47f1a2f72bd,9b38ae98-7dcb-45de-94d1-552edb57b5e1,68668099-f1bd-40fe-ba1b-375c60d26882,755c1eb0-0cc3-40a7-9333-9d5a36d75142,5a1ac598-afc9-4de2-b58e-6c7766d58dd3,cf4de677-11bd-42f4-b872-918466e13387,57ca2209-38dd-410f-b078-90fefd7ff16c,d00795b8-c323-4b84-ad84-2633e778c395,9b0c7ca8-7b5f-4cbe-8acc-210aee11213f;STRAND=+- GT:DR:DV:PL:GQ 
0/1:27:20:71,0,138:70 +chr1 822420 cuteSV.DEL.12 TACTACCTCCCTGGCCAGCAGATCCACCCTGTCTA T 68.3 PASS PRECISE;SVTYPE=DEL;SVLEN=-34;END=822454;CIPOS=-8,8;CILEN=0,0;RE=20;RNAMES=3191037b-4588-4ca9-b10d-30f72b22dae0,0a0295eb-c248-4bed-8d45-06bfa3a76406,11e05e53-15f3-4fee-a1f5-0169b9a4e390,cc4589e4-36c0-4e71-8151-b61449bba14a,d3859d16-3b84-42c1-b62d-880e59fe5668,0568638e-ff8a-4c0c-b359-651d5856a156,60f7349e-a821-4ce7-8442-5771681f63b5,428aa746-97d1-4764-95e3-6dd4e6867aee,acd0961e-11d1-44e6-9521-38bf4df5a748,ed57ee60-11d6-47ac-a344-c795f4f4c269,13748a33-386b-4d95-af9c-8e12fcd16ec2,1a85a944-e29e-4900-94ac-0db738749b1f,236d703b-56fc-4715-9fd3-7d0bad6fb350,2bc25ca5-c598-4437-b822-36903e39cd43,6ba9a389-ec6c-4aba-9462-72e2aaa04ecb,80431cc1-baa5-4287-9918-5269169e50e9,8783cda6-9447-4cf3-8da0-23207f536755,e741df71-7c61-4b64-b810-fed29fcf9029,069cec55-5212-49f9-a7bf-4f1538651d94,c0757315-cc54-4e20-abfc-0292cd3e588b;STRAND=+- GT:DR:DV:PL:GQ 0/1:28:20:68,0,145:68 +chr1 839484 cuteSV.DEL.13 ACACCTGGACAAACACACCTGGACACACACACCTAGACAC A 64.5 PASS PRECISE;SVTYPE=DEL;SVLEN=-39;END=839523;CIPOS=-9,9;CILEN=-1,1;RE=18;RNAMES=236d703b-56fc-4715-9fd3-7d0bad6fb350,e9c8a5e0-ae6a-43fc-aa8d-443d0299178a,d3859d16-3b84-42c1-b62d-880e59fe5668,75f1ae85-0230-43d5-ab85-1b3c037a9f9e,13748a33-386b-4d95-af9c-8e12fcd16ec2,5c3ba9cd-3b23-4144-a829-69e31a658eea,d731a5a5-f6ba-484e-a949-a306b48b24a0,4fec7be0-fba7-4412-8a60-2a107ec4be60,b4514bb9-2852-4a77-a4c7-4c3ba323d1de,ce40ecf4-62b7-4557-b021-c1c3930f1bec,319e2bdd-20b7-4b43-a442-8b53e9cf9f45,e88eb53c-dac6-4220-adfc-2ddf08b01e06,f660919f-7fd0-4f1b-85e4-8a84fb7ada46,251ef908-eb88-4b1a-a3b4-7ced2e9c18d4,cff27005-11f2-4e9b-a8d4-fc4b8ab6a17c,c4e74269-fc7e-45a6-bcca-9ed3f426b382,428aa746-97d1-4764-95e3-6dd4e6867aee,b0a25212-295d-4995-9664-74983d034114;STRAND=+- GT:DR:DV:PL:GQ 0/1:24:18:65,0,122:64 +chr1 853529 cuteSV.DEL.14 ATGACCGCCGTGTGGTAAACTGATGAACCCCGACCCTGATGAACGTGAG A 58.9 PASS 
PRECISE;SVTYPE=DEL;SVLEN=-48;END=853577;CIPOS=-19,19;CILEN=-1,1;RE=15;RNAMES=b0a25212-295d-4995-9664-74983d034114,c4e74269-fc7e-45a6-bcca-9ed3f426b382,c78f3f96-0eca-4165-8039-a726409e315f,b1b40cdb-900d-4845-9898-6170d90cd3b2,4a58eaf6-68f1-4fee-a0c4-c7c6566fa5cc,eb6cac52-99f7-4d03-8f98-daccd8fca025,605f1e39-e4c5-4793-8bfa-a49d3a06c231,e741df71-7c61-4b64-b810-fed29fcf9029,72c69633-5575-4780-837f-327eb7e539e3,536a63a2-dfd4-433e-9d9e-5d9b7c94d3f4,251ef908-eb88-4b1a-a3b4-7ced2e9c18d4,f475a1f3-22bf-4b8f-b041-b81e6efb0211,83af1174-2b47-4abd-9718-293c2bbeb232,57e8980a-683c-41fa-b8be-6bf98f105d33,9db3952b-26fb-4232-9af8-58a98f45f237;STRAND=+- GT:DR:DV:PL:GQ 0/1:18:15:59,0,88:58 +chr1 860179 cuteSV.DEL.15 CTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT C 0.4 q5 IMPRECISE;SVTYPE=DEL;SVLEN=-30;END=860209;CIPOS=0,0;CILEN=0,0;RE=7;RNAMES=22085938-8eeb-4b08-8c96-7afe9d261e81,7d61b37a-ec51-4e39-ab16-9262a0dc0754,9c386053-d761-4960-89e6-cd465103bc5d,bb30818e-d2cd-4f4f-a1af-e47bab969faa,8df62003-1bf3-49fa-9b11-c29769761226,cea5332f-05e1-4540-9026-c7895ec31961,f709d9a2-d943-493c-90e3-86b947f99927;STRAND=+- GT:DR:DV:PL:GQ 0/0:23:7:0,10,153:10 +chr1 866742 cuteSV.INS.16 G GCCTGAGCCCTCACGTGGTCCTCCCCTGTGACTCCTGAGTCTCACATGGTCCTCCCCTGCACTCACATCCCTGACATCCTTCCCGTGCCTCACGTGGTCCTCCCCGTGAATTCCACA 85 PASS 
PRECISE;SVTYPE=INS;SVLEN=116;END=866742;CIPOS=-51,51;CILEN=-11,11;RE=18;RNAMES=9b371dff-6873-48fe-b8da-a7064d536115,f475a1f3-22bf-4b8f-b041-b81e6efb0211,a991d6ca-a60b-4607-8762-3ab46be04d0b,8df62003-1bf3-49fa-9b11-c29769761226,28b79b1a-aeec-4ef1-8702-0a96a47277ad,ef6676f0-38c6-4daf-b0ec-af5991af3cde,8ca5b19a-40d7-4c67-a443-2a4c4ad1fd2c,dfdd1f76-41a0-4067-b78a-2ad10449afb2,22d84ee0-219b-458b-8911-886548b6efc7,e741df71-7c61-4b64-b810-fed29fcf9029,83d113a6-1c88-4ed4-9bcd-c0d7486282ee,eb6cac52-99f7-4d03-8f98-daccd8fca025,94a20cbf-3758-4b42-8c6a-da7d7500fd11,4bbda435-5485-4814-be91-34dc058cdc5a,33e89011-37df-42be-a64a-bf3a398c2c39,b8484e0f-d788-42c1-a228-10b46af1b795,b1b40cdb-900d-4845-9898-6170d90cd3b2,530b293c-b9e8-492f-a8af-c8b3529bf79d GT:DR:DV:PL:GQ 0/1:16:18:85,0,66:65 +chr1 872746 cuteSV.INS.17 T TTCTCAGGTTTGACTCTGACAATTCCCTAACAGGGAAGCTGCTGTCCTATAACTCTGGGGGAGGGGTTTCATTTGCTCCCTGGCAGGTTGGCTTCAGTCTCAGGTTTGACTCCTACCTGACTTAATTCCTAACAGAGGCTGCTGTCCTGTGACTCTCTGGAGAAGGGGGTTTCATTTTACTCCACCTGCAGTAGGGTCTGTTAGCCCAGGGGGAGGGGGTTTCATTTTGTACCTGCAGCCAGGGTTAGCCCATCTCAGGTTGGCCTCCTGTTAGTAATTCCTAACAGGGAGGGGAAGCTGCTGTCCTGTGACTCTGGGAGAGGCTTCACTGACTCCTGTAACTCTGGGAGGGGTTCATTTTCTCCACCCCTCCTGGCCGAGGTTAGCCCATCTCAGGTTTGACTCCTGACTTAATTCCT 62.8 PASS PRECISE;SVTYPE=INS;SVLEN=418;END=872746;CIPOS=-94,94;CILEN=-4,4;RE=13;RNAMES=dc2fa176-2e22-4efc-9310-d5e581286c22,b531fd8f-94a3-401d-b47a-cc8d15a8d5d7,fa9da229-e6a6-4179-b397-2d141edd48aa,3b3ec0bb-28b0-47d0-bfb6-9aaf46c33103,99b88d32-1ef1-40c4-b00a-5705363b8477,1ab37623-f1cc-461e-a660-23d807f676b7,22085938-8eeb-4b08-8c96-7afe9d261e81,f9ed5ade-463c-4908-b7de-c9e6d9435c36,d402008b-79c3-4fae-b528-5b6355c2a300,ab958f04-03ad-4eeb-b923-3bd9126b4605,bf7f96a4-1992-465e-afa5-a0be4c87825a,ccecf5ea-f571-442c-95c3-84df5139fa08,7d61b37a-ec51-4e39-ab16-9262a0dc0754 GT:DR:DV:PL:GQ 0/1:11:13:63,0,44:43 +chr1 875831 cuteSV.DUP.1 C 27.1 PASS 
PRECISE;SVTYPE=DUP;SVLEN=509;END=876340;RE=13;STRAND=-+;RNAMES=bf7f96a4-1992-465e-afa5-a0be4c87825a,e9278a48-f13d-452e-b116-b334f56aea9f,2e7f44ed-26c9-4d08-8b97-14cc1bce418d,8a996251-bd12-464d-8702-52627014ab5c,54c716dc-2974-49bb-9da1-7517688122b1,157884a7-c3c2-4e4a-949a-53de16111ee0,0f3c8ebf-04b2-45c9-943e-6aa19617fd25,594f75ad-7c93-4a51-a735-43f49eeaa22f,fa9da229-e6a6-4179-b397-2d141edd48aa,f804818a-f647-4493-942e-29808c2368a9,30a10600-8e17-45f1-ba0a-45befaf2a980,8d5882ef-92c9-4618-835c-e2746bf2976d,833d5e0e-a83a-4d01-95b5-1e1327f581e0 GT:DR:DV:PL:GQ 0/1:25:13:27,0,142:27 +chr1 875973 cuteSV.INS.18 C CAGTGTCCTGTCGTCGAAGGGGACAATCCTTAATTCAGTCCTCAGTTTGGACTCTACCTCCCGATTGGAGCGACGTCCACCTCGTTTACTTTGGGGGGGGAGGGGTCTCAATGTCCTGTCGAAGGACAATCTTAATCTCAGTCCTCAGTTTGGACTCTACCCGATTGTCTGGAGTGACGTCCATCTTGTTTACTTTGGGGAGGGGGCCCTCCCAGTGTCCTGGAACCGAAGGGGACAATCCTTAATTCAGTCCTCAGTTTGGACTCTACCCGATGAATGAGCGACGTCCACCTCGTTTACTTTGGAGGGGGTCTCAATGTCCTGTCGTCGAAGGGGACAATCCTTAATTCAGTCCTCAGTTTGGACTCTACCCGATTGTCTGGAGTGACGTCCACCTCGTTTACTTTGGGGGAAGAGGTCTCAGTGGTCCTGTCGTCGAAGGGGACAATCCTTAATTCAGTCCTCAGTTTGGACTCTGTCCGATTGGAGTGACCCACCTCGTTTACTTTGGGGGAGGGGTCTCAATATCCTGTCGGTGTCGAAGGGGACAATCCTTAATTCAGTCCTCAGTTTGACTCTGCCTCGATTGTCTGGAGTGACGTCCACCTTGGTAACTTTGGGGGAGGGGTCTCAATGTCCTGTGTAAAATCCAGAATCTGAAGTTATAAATTTTATAAACAGAGACCTTAAACTTTTATAGAAGACATGGAACAAATGACCTAATTTCCGTGTTAGTGAATAAGTAGACTAATCATTAGTTTCAAATTTTAAATAAATTATGAACTACAATGAAACAAAATATATTTACACATAAAGATGCTATCAAAATCACAGTAGATTATAATAAAAGTTACAACATTAAAAATAAATATACTTAGAAACACAATATGTAGAATTACAGTGACCATAATTTTGATATACACGTTTATCGAACTTTATCGTGGATTGTGTAGTACCTAAGATACATTTATAAGTGGTGAGATAAAGTAGTGTAAAATTGGTTAAGTATGGTAAGAAGTAAACGATTGTAAAATAAACAGGAGTAAGTTTAAACACAACCACAGGACAGATGTATAATTATTTTCTCTCCTCCATCCGGGTCTGAAGTAGAGCTAATTTATGATTCTGTAAAAAGATCAAGATAAATATGTGTTGTAATGTAAACAGAGTGTGATATTTATACTCATAAGTCAATTCATGTCGTTTTTTTTTTTTTTTTTTTTTTTTAGAATTGGAAGGATGTTGTCTACAACCCTATAAGTGTCTAACAGGTGTTGAAGAAAGAGGGTTGTAATAAAAATAAGAATGTAGAGTTGACGGTACTTTTGGATCTTCGTACTTTATGATGTTTAACTTTGTTTAAGGGACGATATGTCTTTAACATTTCATT
TGGTTCTTCTTTCTCTTATATACGTATGAGTACGTTGTTCTTTATTTTTGTCTAATCTGATTACTTTTGAGAAGTAGTCTTCATTCAGGACCGGTCCGAGGATTTCGAAGTGATCACTTAAGATAATTTGTGAATTACTTATTGGTTAGGAAGTGTTTGTGAACGTTTCGTCTCTTCGTCCTTTTGGAGTTAAGTAAAACTCCGGTCATAATGGTCTATTGTTTAGTCTGTTTTCGTAGTGTTCTTTTTGATTTACACGTCTGATAATAGGAGTACTTACTTCTGTCTTTAGAGGTTTTTGAGATCATTTGACCTTCTTCGTTGTGTATTTTTACAATCTATTCGGACCGTACCACCACGTGTGGACATCACGGTCGATGAATCCTCCGACTCCACCGTCCTAGTGAACTCGGGTCCTCAAACTCCAATATCACTCGACACTGACACGGTGAGGTGGGGTTGGACCCACTGTCTCACTTCTGGAGTAGAGATTATTTTTAATTTCTTTTAATCTTTTTTCAATCTGTTATATTGGTTCACCTTAATAGGATGGTTATGTTTAGATTTTTAGTCACATTATGTGGTATAATTATCTTATTTCCTTTCTGGTACTACTAAAGTTACCTGCAGACTGTTACAGTTACTGTGAGTAAGGACCATTTAGAAGGTCTTTAGATCGTTATCTTTATTGAAGGACTTGGAGGATTTCCTGTAGGT 95.5 PASS PRECISE;SVTYPE=INS;SVLEN=2106;END=875973;CIPOS=-48,48;CILEN=-105,105;RE=11;RNAMES=b531fd8f-94a3-401d-b47a-cc8d15a8d5d7,4bbda435-5485-4814-be91-34dc058cdc5a,9148c8b3-594a-4abe-afe8-9c5f32d76173,4ba6382a-2fce-4dae-80ab-4c1dedd1cec7,3b3ec0bb-28b0-47d0-bfb6-9aaf46c33103,d9720cb5-2fa8-4d15-9221-76bebb8ee477,1ab37623-f1cc-461e-a660-23d807f676b7,f2c202a6-bd7d-4934-9879-7d393feb9dff,8b08274d-9781-4d6f-99a3-717a424483e9,21e9cbd1-d15b-4c76-a89d-aa015299da43,b1b40cdb-900d-4845-9898-6170d90cd3b2 GT:DR:DV:PL:GQ 1/1:1:11:95,21,0:21 +chr1 875973 cuteSV.DUP.2 C 0.1 q5 IMPRECISE;SVTYPE=DUP;SVLEN=455;END=876428;RE=7;STRAND=-+;RNAMES=e9278a48-f13d-452e-b116-b334f56aea9f,57180a95-ccfb-41fc-a26e-95f054ea5da9,8a996251-bd12-464d-8702-52627014ab5c,0f3c8ebf-04b2-45c9-943e-6aa19617fd25,7d61b37a-ec51-4e39-ab16-9262a0dc0754,8b08274d-9781-4d6f-99a3-717a424483e9,e1aef287-8fbf-4811-af38-59948ea55545 GT:DR:DV:PL:GQ 0/0:25:7:0,15,172:15 +chr1 876165 cuteSV.INS.19 C CCCCCACACCTCCCTGCTCCCCCCACACTCCCTCATACTCCCCATACCACCCCAACCTCCCACACTCACCCACTCCCCATACTCCCCAACCTCCCCCATACTCCCCACATTCCCCCATACTCCCCTCATACTCCCCCAAACTCCCCCATACTCCTC 102 PASS 
PRECISE;SVTYPE=INS;SVLEN=155;END=876165;CIPOS=-80,80;CILEN=-75,75;RE=17;RNAMES=22085938-8eeb-4b08-8c96-7afe9d261e81,81e82995-0ffd-447b-af0c-f31a8a115fb1,83d9c279-cb7c-4b06-9b0c-59442e2354da,54c716dc-2974-49bb-9da1-7517688122b1,fa9da229-e6a6-4179-b397-2d141edd48aa,c833ef7f-3c94-419c-96cc-bf4e38a240c2,bf7f96a4-1992-465e-afa5-a0be4c87825a,b0446234-8b99-44e0-b576-1a72d3ef0700,2417740b-2584-4ca1-8f2d-7f6c68abd8f9,99a38a81-7109-4bb4-ad5d-a20a4eee12ef,e1aef287-8fbf-4811-af38-59948ea55545,dfdd1f76-41a0-4067-b78a-2ad10449afb2,b8484e0f-d788-42c1-a228-10b46af1b795,8d5882ef-92c9-4618-835c-e2746bf2976d,0f3c8ebf-04b2-45c9-943e-6aa19617fd25,30a10600-8e17-45f1-ba0a-45befaf2a980,594f75ad-7c93-4a51-a735-43f49eeaa22f GT:DR:DV:PL:GQ 0/1:7:17:102,1,7:6 +chr1 882645 cuteSV.DEL.16 TAATATATTAGCTATTCTAGACTTTATGCATTTATGTAAAGTTTTCTTTGTTGCACTTTAAGTTCTGTGATACATGGGCAGAGCATG T 256.4 PASS PRECISE;SVTYPE=DEL;SVLEN=-86;END=882731;CIPOS=0,0;CILEN=0,0;RE=48;RNAMES=97e774b1-1177-4bdf-9c45-4dbd0f3d8a73,73728239-8ff1-4cd4-ae56-e31e059a758b,056a9e24-0a77-44d1-ab83-a80a75d9d788,6a9a2e7c-07fb-4cf7-8efc-22bea120a93b,046a1725-d02e-438e-9da5-d41fb1ce84b2,0e3cf0c5-c78d-45c7-a6bc-c4233623f360,ae6d687f-f2ff-483b-be58-dd45b9ba58bf,05c54a42-e1a3-4e16-93b9-9ca92d0319dd,08fa5454-6e34-4304-9787-9ed403a4ef53,0c5c0fcf-8c53-415b-9bc9-2ed8a9e4299d,1412da0d-d8b3-4d26-82bd-25b190e8a6a4,16169411-77a4-427a-8aef-7b60074206bd,21421e6f-ff09-4fa3-bc1e-1b17dea7e828,3bd7dc1d-4849-4e40-b64d-3842fc25cb9b,3d6d04de-603a-40d7-8b40-50a320ddd891,3fb08fdb-7f14-4f34-8aff-de9e75d4dd0e,42e33868-1a79-47a6-868e-7d5d389c5d56,46dfd63c-e140-494a-841b-5bd7f64189db,4eeb823d-93f5-4cb9-b4dd-56fbd2852eab,505d5fa2-a923-4c2b-988e-bd1a3019246e,56c7af10-8d0c-4d10-934e-ce832e20bec5,5d0b99df-5c30-4ea4-8385-0bec80e55059,5fbb17d4-0ab0-465e-b3e1-7dc60e96e11f,6169a3b9-e904-46e3-b8bf-b8c315d5c5e5,716637f2-babf-4370-9472-f8f6fa9d744e,7186749e-c3f5-4c2a-b234-3a6837d50fd6,768ccb5b-6be9-41ea-bc3f-279b2e7a8fb0,7d284a1c-633b-40cb-91e5-162f95fa3d66,80b82853-6e69-45ee-9c
a4-e5ad9e8f03a1,90498d40-6caf-43e8-a74d-f1736a1de6c5,9f6cdfba-a21d-4f16-a5e4-14986b2af751,a2397e41-8500-4e12-9aba-12b8497ca4f1,a5e72e50-4e49-4227-b7ff-ee22a24a02f9,a78069fd-c1ba-4e9b-84ac-f8589d7589cc,b3cb0ab9-8a9d-48bf-9136-9eea67132892,b55c1a9a-8db3-465f-9be8-8469cbc48312,bad14ecc-2338-4c54-afcf-9cd59bd90c2a,d6291e72-fb98-406f-80f2-f01885a638c0,db1f015c-ba4f-4d13-a0da-f774b99ce01a,dde71730-62fd-47aa-b397-6f37ace09318,e1de4680-a48f-4701-b35a-9e90e6fd3da1,e2af033c-70d2-4d88-8b6c-e67e2eadb7b5,e530ef5b-f4f8-4800-aae0-eb330134cdaf,ee96a347-de27-4e4f-8152-7cb3d0422d8f,f5434dca-3eba-40df-99f4-1aa2af2c2e9d,ae1e190c-1af4-4119-80d6-6195876f9e64,c085c408-f90b-403c-889c-556a8cf52725,93c9ba0c-2139-4366-8af9-816d2b650eb3;STRAND=+- GT:DR:DV:PL:GQ 0/1:31:48:256,0,94:94 +chr1 883241 cuteSV.BND.7 N N]chr20:29351526] 0.2 q5 IMPRECISE;SVTYPE=BND;RE=19;RNAMES=7d284a1c-633b-40cb-91e5-162f95fa3d66,56c7af10-8d0c-4d10-934e-ce832e20bec5,ee96a347-de27-4e4f-8152-7cb3d0422d8f,505d5fa2-a923-4c2b-988e-bd1a3019246e,73728239-8ff1-4cd4-ae56-e31e059a758b,3fb08fdb-7f14-4f34-8aff-de9e75d4dd0e,a2397e41-8500-4e12-9aba-12b8497ca4f1,46dfd63c-e140-494a-841b-5bd7f64189db,046a1725-d02e-438e-9da5-d41fb1ce84b2,768ccb5b-6be9-41ea-bc3f-279b2e7a8fb0,b3cb0ab9-8a9d-48bf-9136-9eea67132892,5fbb17d4-0ab0-465e-b3e1-7dc60e96e11f,3d6d04de-603a-40d7-8b40-50a320ddd891,4eeb823d-93f5-4cb9-b4dd-56fbd2852eab,1412da0d-d8b3-4d26-82bd-25b190e8a6a4,6169a3b9-e904-46e3-b8bf-b8c315d5c5e5,056a9e24-0a77-44d1-ab83-a80a75d9d788,9f6cdfba-a21d-4f16-a5e4-14986b2af751,db1f015c-ba4f-4d13-a0da-f774b99ce01a GT:DR:DV:PL:GQ 0/0:57:19:0,13,363:12 +chr1 883245 cuteSV.BND.8 N N]chr20:29789175] 0 q5 
IMPRECISE;SVTYPE=BND;RE=14;RNAMES=7186749e-c3f5-4c2a-b234-3a6837d50fd6,e2af033c-70d2-4d88-8b6c-e67e2eadb7b5,16169411-77a4-427a-8aef-7b60074206bd,0c5c0fcf-8c53-415b-9bc9-2ed8a9e4299d,e530ef5b-f4f8-4800-aae0-eb330134cdaf,c085c408-f90b-403c-889c-556a8cf52725,3bd7dc1d-4849-4e40-b64d-3842fc25cb9b,97e774b1-1177-4bdf-9c45-4dbd0f3d8a73,80b82853-6e69-45ee-9ca4-e5ad9e8f03a1,42e33868-1a79-47a6-868e-7d5d389c5d56,a5e72e50-4e49-4227-b7ff-ee22a24a02f9,21421e6f-ff09-4fa3-bc1e-1b17dea7e828,085aed94-1490-4a98-8912-0e3be7f68801,bad14ecc-2338-4c54-afcf-9cd59bd90c2a GT:DR:DV:PL:GQ 0/0:56:14:0,45,401:45 +chr1 886237 cuteSV.INS.20 G GGCCCTTTGGCAGAGCACAGGTGCTGTGCTG 67.2 PASS PRECISE;SVTYPE=INS;SVLEN=30;END=886237;CIPOS=-6,6;CILEN=0,0;RE=14;RNAMES=1b5284d7-0579-4bb8-aa3e-7bc85c40e956,2952f193-7c51-43af-bbc5-c080d80edcd7,833d5e0e-a83a-4d01-95b5-1e1327f581e0,ba1cb437-af9c-47a5-bb4f-7a83b124974c,e25b8c47-f70a-4568-b64b-53d6c821faee,9148c8b3-594a-4abe-afe8-9c5f32d76173,4ba6382a-2fce-4dae-80ab-4c1dedd1cec7,6ebac132-23ad-4bfc-82b9-71ab65b52b87,f8441ce1-4097-40ea-8559-2b09b0b98b05,157884a7-c3c2-4e4a-949a-53de16111ee0,d2c1f991-7bd6-47d9-a6ac-6379b6adcfc4,a095f8e9-9b24-4020-a392-c40aba2f48b5,8b08274d-9781-4d6f-99a3-717a424483e9,1f790562-7d96-4180-b322-9cb561596e5c GT:DR:DV:PL:GQ 0/1:12:14:67,0,48:48 +chr1 893790 cuteSV.DEL.17 AAAAAAAAAAAAAATATATATATATATATATATATA A 229 PASS 
PRECISE;SVTYPE=DEL;SVLEN=-35;END=893825;CIPOS=0,0;CILEN=0,0;RE=25;RNAMES=f8441ce1-4097-40ea-8559-2b09b0b98b05,0bad51cf-85e9-451f-b2b3-1d92289ca318,989f153a-61c4-4600-9c1a-91a4804fe4c1,21e1dfd6-61cf-4be2-9067-82184c38457f,b0168428-ba4f-4c4c-ac1b-7988f4060d1f,c74e5c73-20e4-4e8c-8030-36adb8223044,157884a7-c3c2-4e4a-949a-53de16111ee0,1f790562-7d96-4180-b322-9cb561596e5c,25e64d5c-2313-4e64-8051-3bd8e6003005,1c0aa9ad-f2ba-43e2-86dd-40af939404f4,061f0900-f931-48aa-8864-d586f6d790ee,833d5e0e-a83a-4d01-95b5-1e1327f581e0,b531fd8f-94a3-401d-b47a-cc8d15a8d5d7,3d7a1e2c-1452-47e1-963c-a3cfc6d63b46,46183fcd-acab-4c99-8b0a-fe46ba072d9d,5ad8c07f-30be-4a25-b4bd-17c37679d9e7,6ebac132-23ad-4bfc-82b9-71ab65b52b87,c753e6fa-20f8-4c7e-aa67-9efcb87a27b8,d960c198-fbca-4240-aaec-0524e03219cc,2e5a5c36-81a3-4ef7-a4f6-c7ce89c15fcb,5c1c717f-82b0-498a-a57e-c3a7b5fc52f3,668be2a3-b472-4cf2-ab60-b6e01a18a8bb,a095f8e9-9b24-4020-a392-c40aba2f48b5,a7a20a3f-47f4-4d97-9f5f-68670b283b42,ade5f657-3781-475d-a952-a3a26cef57b7;STRAND=+- GT:DR:DV:PL:GQ 1/1:1:25:229,57,0:56 +chr1 907847 cuteSV.INS.21 C CCTGCCCGGTCCTTCTGACCAGCCGAGAGAGTA 45.6 PASS PRECISE;SVTYPE=INS;SVLEN=32;END=907847;CIPOS=-7,7;CILEN=0,0;RE=12;RNAMES=6442676e-05bd-4d7e-9322-8a10a68af75b,989f153a-61c4-4600-9c1a-91a4804fe4c1,d9abe60f-2349-4c46-8fa8-4ed1d0681b41,cf34ac10-b4ba-4f7e-8eba-7dd1ebad8f3b,197ed372-bfef-4fe2-b0e0-df8b05eba7f3,3a6e4814-546e-4834-a6d9-14ababfe6a69,7c2d5b60-57ab-49b7-9d19-375dbf0cde78,bc69941c-5feb-4e57-aae0-0ee530b36dd9,013e7d51-f095-4b11-8a4e-f077dcc74e71,5ebeaf63-db07-4100-8081-1a1f63e74c8c,729b3db5-7276-4df7-ae39-9f9439c321ad,f86dbc79-ada1-4764-8777-6f2820cffbeb GT:DR:DV:PL:GQ 0/1:15:12:46,0,74:45 +chr1 909163 cuteSV.DEL.18 GGCGCACCTTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGTGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGTGGTCGGGGTCAGGCCCCCGGGCGCATCTTTGCTGGTATATGCGGTGG G 324.4 
PASS PRECISE;SVTYPE=DEL;SVLEN=-292;END=909455;CIPOS=-53,53;CILEN=-26,26;RE=35;RNAMES=de9147d3-cb61-4585-a00d-6bcc0e53e7a2,e19d0ab7-8c9a-4cdf-9fa5-e3b64259bd21,1c0aa9ad-f2ba-43e2-86dd-40af939404f4,48ee0195-0418-4d3e-983e-94295d21d48a,e89b5093-4ee5-4f94-ab68-72b0fb142efa,f5da1070-e436-4e16-9076-0c28a37a8434,32160072-5834-4d3a-a92a-b787ff89d148,6442676e-05bd-4d7e-9322-8a10a68af75b,d34147d4-42e8-4c08-969c-c4a22a1c4b9e,c0470469-f331-492e-a916-713baf1a9e52,8794daf4-3a1e-43ca-80d2-223fe23c7bff,87cfd6de-af07-43af-afc0-3b7956141015,fb3c12ce-a395-4c12-8aad-b83a4db3068c,013e7d51-f095-4b11-8a4e-f077dcc74e71,157884a7-c3c2-4e4a-949a-53de16111ee0,a7a20a3f-47f4-4d97-9f5f-68670b283b42,be47fddc-a6e6-4e11-abdc-ec58059ac9aa,455e63af-8997-471c-a9f4-7538acdc6d3c,ef017d95-eb39-451e-a7a0-433e7c631511,331538b4-6108-49ad-ab42-3a04840bd1d5,8cb61c3c-7e5e-4310-b3ba-d6790a174aa0,45e5bce7-5948-4f24-9979-f624acf818b3,b531fd8f-94a3-401d-b47a-cc8d15a8d5d7,d9abe60f-2349-4c46-8fa8-4ed1d0681b41,3ff85788-7299-4769-b0e2-660d65429540,6097bf31-8314-4ec8-92d1-de170682d868,7c2d5b60-57ab-49b7-9d19-375dbf0cde78,197ed372-bfef-4fe2-b0e0-df8b05eba7f3,cedaf996-5a21-4bdd-bc96-6e12a05959d0,cf34ac10-b4ba-4f7e-8eba-7dd1ebad8f3b,bc69941c-5feb-4e57-aae0-0ee530b36dd9,5ebeaf63-db07-4100-8081-1a1f63e74c8c,dfddfd57-ec74-49a0-8c87-056496f06344,729b3db5-7276-4df7-ae39-9f9439c321ad,f86dbc79-ada1-4764-8777-6f2820cffbeb;STRAND=+- GT:DR:DV:PL:GQ 1/1:1:35:324,82,0:82 +chr1 934050 cuteSV.DEL.19 
GCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGG G 162.3 PASS PRECISE;SVTYPE=DEL;SVLEN=-829;END=934879;CIPOS=-14,14;CILEN=-6,6;RE=22;RNAMES=5e011927-ab9d-474f-91fc-f7ae39f94bc4,cb6fea29-8a74-4f19-97b0-047cd9ff1988,058a2a35-66c8-4e50-a04e-50abc2fee9e0,5439c77a-2a93-4cd8-9b3e-8d0055b43ebe,bdcdd759-18fb-49b0-b78d-c60f959d1e10,c2a57bcb-3194-4d93-8cf9-12c18544cc73,d84b2f57-3f94-4570-a9b9-d441844951d0,039e6e21-87a5-4c62-a83c-a7a6bc777e0e,5092d6a4-17e7-4b3d-ac3e-c22350294e33,6563cf52-9022-416c-b76a-ad4d604ad796,ef738199-2538-4ca3-bd44-4738dae9b64a,25d56d8f-95a3-467c-8971-704b6bd8b014,2ba057e6-2c4d-4d23-b853-edd58ffbbafd,8e0002eb-fff3-45d2-9a67-ae5bfeb455db,8909429c-8eae-4329-99f6-8ae048f53f8e,90759a7f-47ad-4027-a56d-575eb480bb51,f5da1070-e436-4e16-9076-0c28a37a8434,4225f521-f571-483b-8c4e-d899a90db58d,b1f681f8-03d5-4d20-8872-bbb605e83812,8ec4e4c1-9f5a-4bb8-89ab-920aa61ec69d,ed2eb115-3859-4e15-967e-5628f3e55e88,fbd996c8-c74d-43eb-bff2-121a8752eede;STRAND=+- GT:DR:DV:PL:GQ 1/1:5:22:162,21,0:21 +chr1 936287 cuteSV.DEL.20 TAGGGCTCCTGGACGGAGGGGGTCCCCGGTCCCGCCTCC T 257.6 PASS 
PRECISE;SVTYPE=DEL;SVLEN=-38;END=936325;CIPOS=-8,8;CILEN=0,0;RE=28;RNAMES=cb6fea29-8a74-4f19-97b0-047cd9ff1988,a2c53f94-331e-4946-9670-fe7adad6b2b2,ef738199-2538-4ca3-bd44-4738dae9b64a,d84b2f57-3f94-4570-a9b9-d441844951d0,7fdc4490-b48a-4769-94bf-c0c67f477a2f,7ef9e428-6312-4642-a023-f3363c296d5a,8ec4e4c1-9f5a-4bb8-89ab-920aa61ec69d,5e011927-ab9d-474f-91fc-f7ae39f94bc4,b1f681f8-03d5-4d20-8872-bbb605e83812,8e0002eb-fff3-45d2-9a67-ae5bfeb455db,b2d0e51e-2f9d-40ef-910b-15553399360a,25d56d8f-95a3-467c-8971-704b6bd8b014,2ba057e6-2c4d-4d23-b853-edd58ffbbafd,8909429c-8eae-4329-99f6-8ae048f53f8e,f5da1070-e436-4e16-9076-0c28a37a8434,039e6e21-87a5-4c62-a83c-a7a6bc777e0e,2474461a-5501-41d4-8c2f-14c8ea360ca1,6563cf52-9022-416c-b76a-ad4d604ad796,790a1392-a4ed-431a-b852-6b2da93ed0c2,f7500359-a393-40b1-8b08-76a85d7de289,5092d6a4-17e7-4b3d-ac3e-c22350294e33,058a2a35-66c8-4e50-a04e-50abc2fee9e0,90759a7f-47ad-4027-a56d-575eb480bb51,bdcdd759-18fb-49b0-b78d-c60f959d1e10,fbd996c8-c74d-43eb-bff2-121a8752eede,4225f521-f571-483b-8c4e-d899a90db58d,ed2eb115-3859-4e15-967e-5628f3e55e88,5439c77a-2a93-4cd8-9b3e-8d0055b43ebe;STRAND=+- GT:DR:DV:PL:GQ 1/1:1:28:258,64,0:64 +chr1 948693 cuteSV.INS.22 T TACCCTGGTCCCCCTGGTCCCTTTGGCCCTGCCTGGCTGG 257.6 PASS 
PRECISE;SVTYPE=INS;SVLEN=39;END=948693;CIPOS=-9,9;CILEN=-1,1;RE=28;RNAMES=da3d6919-68dc-40dc-9594-f00ad1e2ae52,c1613a03-ea98-4685-b3b9-c020a4c15210,eb762c90-9fcc-4d4c-9e98-debcc6e8261a,8f9eff79-5fa6-41a4-9961-d085eab8d911,6eb90826-9a88-48dd-966a-dd531f44fec5,cb6fea29-8a74-4f19-97b0-047cd9ff1988,4fd75c31-c8cb-4398-b105-5960c3eed53f,1638b6ea-9a32-4f75-bff1-31814803dfc1,2b2cea73-821f-4c3f-a7b0-0ea1e71c4031,35f7d250-ae9a-42da-984a-dacd59aefd87,2474461a-5501-41d4-8c2f-14c8ea360ca1,8a089335-5e21-404e-ac6e-8416b50619df,a58862dd-d98a-43f2-8f92-4a2cc3e44909,59158445-57ca-4f69-838e-b4e69bff3a5c,b39d85b2-e86f-4d3d-b4b9-dc64a04be2ca,23c8c9a1-9837-4401-9b74-d8d0e4d7a170,c4dabb82-0380-460c-9b0a-f94d85750d44,5092d6a4-17e7-4b3d-ac3e-c22350294e33,5623a23e-35c9-4fac-a114-e443f2c8ea83,5cbfd059-3131-4f6e-92ab-5a1c2438b814,c994f939-43ef-4f34-be99-93e3eed4d729,6cb90369-a4f1-4d89-a067-bbb639c431bf,f5da1070-e436-4e16-9076-0c28a37a8434,6ea88abc-babb-4aa0-8e58-1decb03e297f,e75216cb-9060-4279-94f4-739954cd39a6,bdcdd759-18fb-49b0-b78d-c60f959d1e10,c8e8bba4-4b8d-428a-a4b1-4e3c33b4d7ee,d28ba1cb-a79c-4766-995b-25cc0c43384b GT:DR:DV:PL:GQ 1/1:1:28:258,64,0:64 +chr1 964651 cuteSV.DEL.21 TGCTGCCGGGAGGGGGGCGCGGGTCCGCAGTGGGGATGTGCTGCCGGGAGGGGGGCGCGGGTCCGCAGTGGGGAT T 229 PASS 
PRECISE;SVTYPE=DEL;SVLEN=-74;END=964725;CIPOS=-13,13;CILEN=-1,1;RE=24;RNAMES=17e1c98b-f7f1-4b30-a986-81cfbd8f4050,6518475e-ac67-4cf0-a321-471e3a59a87c,ef7fe0c2-9437-4f96-8ea1-88c04ab8147b,68892a2e-c39b-49e2-a491-00cd8f97811c,1d6b0d43-1deb-4376-8159-b695b0485276,c65f4965-ccfe-4d55-be58-b7c37f5cd837,c4d8bf72-de2a-4830-94c0-be8914a95fcf,acaeaf54-d697-48db-9593-cc9cbe17c947,e5c003c4-8c18-4466-b90f-064be3b489a7,65b0344c-fe44-4b34-b994-74450789fb6b,71dc3e8c-1f43-4431-8ee4-ce08314cbd2a,76751550-cc73-4a3c-9ef1-b2065c3b0e30,e4f38bc4-58ac-44db-aee0-dfd64ec30bda,8b821158-c7ae-41f5-a960-eac7fa52b8ae,e6cb51d8-b370-46fa-be4f-6e33f868005b,20afc371-dd60-46ae-b59a-eeb04cff123c,dc13e96d-675b-4a42-b876-179e591ec574,4bb3065b-6927-4086-aae6-68b2e3fba092,1ba4584e-575b-4136-a254-85e5e1fd19d5,49a01f1e-6091-470d-a45b-24c271e07986,c8e6c9f0-34cf-45a9-917a-bb30503fe891,ab9c0652-0fba-4be1-9b36-43ed10bb3d50,703ccb79-8208-4993-9803-7ab17d11da85,c72430c7-3181-4aa3-9f51-4cd251652aac;STRAND=+- GT:DR:DV:PL:GQ 1/1:0:24:229,61,0:61 +chr1 976689 cuteSV.DUP.3 C 0.1 q5 IMPRECISE;SVTYPE=DUP;SVLEN=83;END=976772;RE=6;STRAND=-+;RNAMES=44bbad3a-ec7c-4ad4-9202-70f500f1369b,b641bc48-fbae-40e1-97a6-a8e99aee46ad,c399f294-a2ce-4e1b-8f51-9868513601dc,3dbe75af-7974-428e-8e64-4d2a1e18fa0e,b85c5def-3018-4c59-930b-e3ec0752d39b,f1a145aa-6901-4e7a-a6d7-a3e5a9938f03 GT:DR:DV:PL:GQ 0/0:23:6:0,17,162:16 +chr1 977237 cuteSV.INS.23 A AACAACCCCAGGAACCACCTACCTCCCCGCAACCCCGGGAACCGCCCTCCCCTCCCCCGCAACCCGGAAGGGACACCTCCCACTCCCCCCCCACCCAGCCCAGGAACCTCCCCTCCCCCCATAACCCAGGGCATCACCTCCCTCCACCCCGCGCCCGGGGAACCGCCCCTCCCCTCCCCCGACCAACCCCCGGGAACCTGCTCCCACTCCCCGCAACCAGGGGGCACCTGCACTCCCCCATGGCAGGGCCGCCTCCACTCCCCCACCCAGCAGGAACCGCCCTCCACTCCCCGCAACCCCAGGAACCCTCCTCCCTC 257.6 PASS 
PRECISE;SVTYPE=INS;SVLEN=316;END=977237;CIPOS=-137,137;CILEN=-53,53;RE=28;RNAMES=edc9239e-9688-445a-88e5-423ca07dbe77,5f598d57-f68e-4ecd-b9cc-f9195a998554,d89d2427-3a22-4aab-b99f-2ab3097cfb84,e525aa7b-51be-45f5-9303-b4f8d5c37446,3dbe75af-7974-428e-8e64-4d2a1e18fa0e,44bbad3a-ec7c-4ad4-9202-70f500f1369b,9996ec4f-5be1-4652-bbba-41e823c07ca9,e8f80089-d8a9-4b7e-8c2d-b077fe2c5d61,7f13876d-4fcb-4799-a07d-f152551dfe16,703ccb79-8208-4993-9803-7ab17d11da85,b641bc48-fbae-40e1-97a6-a8e99aee46ad,2b0f7c30-ac53-43d3-bc0e-72dc495e3140,c399f294-a2ce-4e1b-8f51-9868513601dc,e16da602-240a-4f23-a489-553fa60982a3,71dc3e8c-1f43-4431-8ee4-ce08314cbd2a,a28b6e79-70d8-42c6-a115-862aaa959b74,755fcc77-3e17-40a7-be45-2d233eb9dcef,f1a145aa-6901-4e7a-a6d7-a3e5a9938f03,b5ee6a22-b578-4a47-86d9-2a806e7b82fe,22871708-eb90-45cb-8849-95dd335f6da7,3f6fd9c9-8f6f-4598-9e66-d1afbdaa4917,09eeaf48-ecac-4d6b-80df-c4bcbcf72936,6d6f1e38-c400-4faa-b2f2-534d498df5a0,5de6d963-11d3-42b2-835b-88723c55b1b3,b824fb18-679e-4312-ba10-b2a7b665bbca,f0b732cc-4a83-4934-9437-7b7874d37988,2d4ca48a-0ac2-4283-95c3-d49ad086e788,24605ded-9f4a-411a-85ae-4e352d32991a GT:DR:DV:PL:GQ 1/1:1:28:258,64,0:64 +chr1 988837 cuteSV.INS.24 C CGTTTTTGGAGTTCTGGGTTGATTGTTTCTGGAGTTCAGGGTTGGC 248.1 PASS 
PRECISE;SVTYPE=INS;SVLEN=45;END=988837;CIPOS=-7,7;CILEN=-1,1;RE=27;RNAMES=33c8c639-8316-4e93-8a3e-0f4b0c1e4dbb,2d4ca48a-0ac2-4283-95c3-d49ad086e788,741fcfa9-cbc3-4032-9299-2d29d7173087,bbffc87d-e0bc-4623-9165-7976befc5ed8,c09ec4b6-5a8d-4292-bf93-9655137ffb41,d522b452-a778-4b87-bc69-f9c7e28a67f5,fb659d07-1b25-4168-96b8-1eb2519fe0e5,58ee7776-a955-4752-b7da-83e96a10779d,a59d11ba-9ecb-41fc-a6ff-c70e764aacce,71dc3e8c-1f43-4431-8ee4-ce08314cbd2a,0526a88a-c43d-4306-87d1-f8044a4d7250,12920a0b-9f0c-4dff-8033-4bc19fed9b8a,7f13876d-4fcb-4799-a07d-f152551dfe16,205538dd-b6d5-443d-a8f4-d78dfa42fce1,e75e13bb-1f83-4cfe-8799-440edbb3df4e,551ed2f0-44cf-491d-b155-9445d2d9996c,22bfcad0-231a-4d9f-9122-46ef80736dce,9996ec4f-5be1-4652-bbba-41e823c07ca9,5ab823ac-f22f-4ec1-8265-30b737d5b4e7,5688faaa-ee3e-4c00-ac67-60e68aaa1731,57466553-b5b7-4875-b069-b34ed1b5f236,e79c9c43-465b-43fa-ace2-2aac17618d6e,b2ea2333-8b7b-4892-9ede-39b8c68bbb73,81baf47c-0987-4293-94a6-9030b31203d7,bcab629a-fdad-4e0e-93a2-ccffabcff548,01412e3f-65aa-457c-b3b7-c873f69e1c18,b641bc48-fbae-40e1-97a6-a8e99aee46ad GT:DR:DV:PL:GQ 1/1:1:27:248,62,0:61 +chr1 996558 cuteSV.INS.25 A AGGGAGGGCAGAAAGGACCCCCCGCTGGAGGGGGCACCTCACATCTGGGGCCACAGGATGCAGGGTGGGGGAGGATGCCAGAAAGGACCCCCTGCTGGAGGGGGACCCCCCCGCTGGGAGGGGGCACCCCACATCTGGGGGCCATATGCAGGG 353.1 PASS 
PRECISE;SVTYPE=INS;SVLEN=152;END=996558;CIPOS=-56,56;CILEN=-8,8;RE=37;RNAMES=db3a6405-9d82-4cbd-83e6-c0a4ce8ee914,33c8c639-8316-4e93-8a3e-0f4b0c1e4dbb,2d4ca48a-0ac2-4283-95c3-d49ad086e788,8d1b989b-10de-4784-95ef-e3943ebf6309,9e8c7cfd-c151-4a9c-b408-139491f081bb,63b9d932-5152-44b0-96ad-ecbb02ab915e,0526a88a-c43d-4306-87d1-f8044a4d7250,a43d0a23-98d1-4fed-8466-70240231420d,bb4a3545-df5e-4f76-a5cc-d0f7d3c3a921,603147e9-a00a-4f82-8e5d-8b203c6dcac7,d30ce71e-09f5-4f48-adfe-909e7ae658f4,a738069e-5b3a-4636-bed4-0e547fe1a8c4,634bae15-9dca-4fc1-9efb-fd7d940f3a37,01412e3f-65aa-457c-b3b7-c873f69e1c18,815ed88e-2e7c-479d-a3a3-2d7694f08791,57466553-b5b7-4875-b069-b34ed1b5f236,fd39c92e-d02b-4fdd-9f70-4491eee84dcd,586d0898-7c61-4e45-899b-46e169314896,205538dd-b6d5-443d-a8f4-d78dfa42fce1,c09ec4b6-5a8d-4292-bf93-9655137ffb41,fd5ba769-5143-40f6-be53-029ea699fd42,9996ec4f-5be1-4652-bbba-41e823c07ca9,36848b46-b6ad-4a7d-91df-8ff23667ff2b,551ed2f0-44cf-491d-b155-9445d2d9996c,f76e33a2-c83f-420f-9f54-cb2f270559d4,e26c3e7f-c16a-4730-987d-937c2b58025f,7fb8eda5-c691-42ac-951e-ca7e11588193,a59d11ba-9ecb-41fc-a6ff-c70e764aacce,f030743e-41da-4f0d-9e2e-753f8d1c41af,16b51bbe-d01c-40a9-a113-e457f9042688,10b28999-0e46-490c-901a-563958d0ea48,4d2326aa-2e82-4b75-993a-b6ce6dc5cb79,bbffc87d-e0bc-4623-9165-7976befc5ed8,7f13876d-4fcb-4799-a07d-f152551dfe16,5688faaa-ee3e-4c00-ac67-60e68aaa1731,3d1f73fa-a716-4e04-91b0-913c39df6718,e40ecb0a-6e7a-4f69-9220-6edb49e65f06 GT:DR:DV:PL:GQ 1/1:0:37:353,94,0:94 +chr1 998766 cuteSV.INS.26 G GGGGAGGGCTGAGCGGAGGGGAGGGCGCGAGCTGGA 71.7 PASS 
PRECISE;SVTYPE=INS;SVLEN=35;END=998766;CIPOS=-7,7;CILEN=-1,1;RE=15;RNAMES=205538dd-b6d5-443d-a8f4-d78dfa42fce1,10b28999-0e46-490c-901a-563958d0ea48,03736baf-21ff-4edb-8c3a-564a3da1dcbb,57466553-b5b7-4875-b069-b34ed1b5f236,a738069e-5b3a-4636-bed4-0e547fe1a8c4,603147e9-a00a-4f82-8e5d-8b203c6dcac7,5688faaa-ee3e-4c00-ac67-60e68aaa1731,0526a88a-c43d-4306-87d1-f8044a4d7250,9e8c7cfd-c151-4a9c-b408-139491f081bb,db31bc21-d577-4b10-86d1-af75ceebb355,328b5933-11f6-4319-98a8-ace09cd88f21,634bae15-9dca-4fc1-9efb-fd7d940f3a37,8d1b989b-10de-4784-95ef-e3943ebf6309,33c8c639-8316-4e93-8a3e-0f4b0c1e4dbb,16b51bbe-d01c-40a9-a113-e457f9042688 GT:DR:DV:PL:GQ 0/1:13:15:72,0,53:52 +chr1 1028941 cuteSV.INS.27 A AGCAGTGCGCAGGCCAGGGCGCCCACACCCACGCCACTCCGGGAAGAACCAGGCCCCAGCCCCTCGTGGGCCAGGGGCGCCACAGCCACGCCACCCTTTCGAAAGACCGGGCCCCAGCC 1.5 q5 IMPRECISE;SVTYPE=INS;SVLEN=118;END=1028941;CIPOS=-49,49;CILEN=-7,7;RE=6;RNAMES=ecb9a83c-48c1-426e-8ae0-3744d8c9c7c8,8b003e80-f945-4cbc-92d9-e6067d1f4457,99289c42-3c75-4773-8fa8-6dd7d3ea7446,83a65fbd-8ad8-4492-a486-0664dd4bfc6d,6d40d9c0-b42f-4c94-914c-a20ed46d3bda,ef99ad9e-a3e7-4140-b150-d7a743cefb11 GT:DR:DV:PL:GQ 0/0:18:6:1,5,116:5 +chr1 1029413 cuteSV.INS.28 G GTATGCAGGCGGAGGTGGGAGGGGACATCTGAG 21.4 PASS PRECISE;SVTYPE=INS;SVLEN=32;END=1029413;CIPOS=-14,14;CILEN=-1,1;RE=10;RNAMES=25fbbfe6-3bf9-406f-8204-afe00f41e043,765cb5f9-b894-4fe1-bac5-f0b1b9e80d57,44b6e03c-aea8-40a0-b92f-4a941f052200,94fc27c8-7cf0-40c8-9d15-0e18d842c864,95b7b8bc-573e-4eb1-83cd-ea4f10073d20,531f08c5-12d5-4330-beea-e5701486f243,f516c7b2-bfb5-43a4-841e-b29972690940,99289c42-3c75-4773-8fa8-6dd7d3ea7446,a5432370-c974-46f6-ab19-70e62d322628,8623c422-77b8-49d3-8172-a0e3653d29dc GT:DR:DV:PL:GQ 0/1:19:10:21,0,107:21 +chr1 1030853 cuteSV.DEL.22 TGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCAT T 219.5 PASS 
PRECISE;SVTYPE=DEL;SVLEN=-38;END=1030891;CIPOS=-18,18;CILEN=-1,1;RE=28;RNAMES=765cb5f9-b894-4fe1-bac5-f0b1b9e80d57,b6d1f0c3-2990-4ad7-986b-39de4ad3688c,f398b356-cdb7-4c64-8adc-ddd6dae15588,04c8133e-b70e-4755-9b44-7f0ade6a391c,2c763c62-5ac4-4d4d-83e9-79b89a0bfd29,531f08c5-12d5-4330-beea-e5701486f243,6d40d9c0-b42f-4c94-914c-a20ed46d3bda,f516c7b2-bfb5-43a4-841e-b29972690940,37066cab-3888-4261-9767-b874f4df3ade,8e343519-ee17-4d21-8f5a-c9a5b2a7a76c,b494b509-dc88-474c-80bf-12fa0e142343,ecb9a83c-48c1-426e-8ae0-3744d8c9c7c8,0b63842d-c3eb-4ed0-94ba-89576acad33f,25fbbfe6-3bf9-406f-8204-afe00f41e043,56f95b02-edd0-47cb-bab2-fdd45cba961e,5d8225c7-5225-4493-9ef6-d19161771345,5ffce705-8fca-4e75-b044-93d45f1ee9d9,3ee13d15-bd0b-4146-9605-38e698a91c49,97c4f3a9-d9ac-4ecb-9d09-24c865004f62,99289c42-3c75-4773-8fa8-6dd7d3ea7446,83a65fbd-8ad8-4492-a486-0664dd4bfc6d,ebf6d94a-7c83-4085-8bba-a3741da16f8e,d26a8d3b-e125-4b20-88b7-f9eceab15ddc,44b6e03c-aea8-40a0-b92f-4a941f052200,8b003e80-f945-4cbc-92d9-e6067d1f4457,4d408313-3977-4014-8e6c-8db62eda4673,a5432370-c974-46f6-ab19-70e62d322628,a6f31b98-32bb-410f-a717-120e935e8002;STRAND=+- GT:DR:DV:PL:GQ 1/1:5:28:219,37,0:36 +chr1 1041775 cuteSV.DEL.23 CTCCGGCCAGTGCCAGGGTCGAGGTGGGCGGCTCCCCCGGGGGAGGG C 181.3 PASS 
PRECISE;SVTYPE=DEL;SVLEN=-46;END=1041821;CIPOS=-14,14;CILEN=-1,1;RE=22;RNAMES=a64a03dd-185a-4818-8bee-7fcdb0319756,4b17d8e6-9217-4e2a-9faa-9e3f74f0c362,99289c42-3c75-4773-8fa8-6dd7d3ea7446,ba90f173-0dc5-4466-9200-a2a2297397ab,cf6dbca0-6268-43ee-9fcf-e9629260af48,65d2b66b-d923-40be-9fec-4fbbeea91c78,6f2a05fa-6b69-4e4a-ae27-3eabdf8970d9,51c54121-152d-4fd6-a888-847105530576,2c763c62-5ac4-4d4d-83e9-79b89a0bfd29,d26a8d3b-e125-4b20-88b7-f9eceab15ddc,3cb621ef-1675-4fed-96a7-65a09493297f,59c98262-5546-4735-aa25-67dc3c50d9f8,77bb4e80-06a6-4d19-b8b9-a286c2a7f2f6,b3cfd68b-0a60-4513-b081-af5f432a6fb0,8354b887-a636-4b9f-b963-24715918b6a1,6d40d9c0-b42f-4c94-914c-a20ed46d3bda,d5d77fef-ed0e-4f4b-a783-f38dce9f86ac,54547fe9-b935-4472-a5cf-83945666a995,e65fd9dc-95c3-45a8-9730-1cfb8ba8b300,d5ec49d3-bd52-427e-b561-07a09e66f612,5ffce705-8fca-4e75-b044-93d45f1ee9d9,e03bcc49-ddec-4177-8eea-8f4ed9cbed20;STRAND=+- GT:DR:DV:PL:GQ 1/1:3:22:181,35,0:35 +chr1 1068795 cuteSV.INS.29 G GTCCACAGCAGGGCAGGCCCGCCAGTGCCGCATGATGCCCCAGGTGGCGGGCAGGGCCTGGGCTGTGGGTGCAG 324.4 PASS 
PRECISE;SVTYPE=INS;SVLEN=73;END=1068795;CIPOS=-12,12;CILEN=-1,1;RE=34;RNAMES=edb11067-c404-4807-ab1c-cbc750ccebef,6ebded49-5c28-4243-b8eb-26e45f3ee5ff,c634c64c-2fd8-4a0c-a8db-ef436dfc17bf,616f68f1-5dfc-4f82-ad0b-2ee28ffc17a1,54547fe9-b935-4472-a5cf-83945666a995,32d2f0f7-c1cf-466d-8726-0e6caa00c35a,6255065a-9f26-4a69-97e5-8ebce1d1f382,87850298-2c1d-4a7c-9d6b-f0e096cfa397,bef325ad-0c6e-49ee-8cd3-191c11197e67,a64c2fc6-2da8-423e-9201-1533f81e5eda,a8ff5a7e-a988-4af5-9557-5e7807c00289,3050e181-e4df-4883-8daf-cc2453d15d9c,61c46a1a-9e73-4299-92ac-6d94dd6b23a0,a89a1947-ec8b-46df-8319-aa19f25e73b1,1a14603e-0571-4ce0-8f26-89b0c7074a73,ac1de16b-6802-4fe2-9672-3e548e18b9c2,bdcaebd1-c85b-41ff-b9f8-eff7af676070,7929258b-7dc7-4c71-9e22-7a9439666a19,6c973e59-e33a-4619-b7d2-6334d50c629a,82438b9f-6bb7-466e-9d6d-2c1d669b24ac,6278fc73-6dfe-45a7-a469-22aebe240a26,c5cbc961-a06a-4355-a28b-80dad6325418,0aef4258-425f-4230-a17d-f9a12bc13358,550ccb26-086e-4b4b-b92b-fff688fc873d,4ee3b15d-ca09-4598-89da-1b09e9d4d7fd,d0b569a4-5f75-4966-b266-eeb4008f5ed9,037ee0d1-b4ed-4e93-b41a-2e680fda7009,8716c5bf-5ea1-46d9-b434-901e8c1a50e9,e92f2f51-6f24-49ef-a43b-14f9e5abf9f2,bf8f78d4-dab1-4044-a6b2-f1af5d391de1,c3114829-11c5-49da-ab1a-cf227e963e2e,b9012afd-987c-4ad4-b1a5-aa7c3fe6bc4a,d227156d-8748-4569-b546-1f67b04ffcd4,99289c42-3c75-4773-8fa8-6dd7d3ea7446 GT:DR:DV:PL:GQ 1/1:0:34:324,87,0:86 +chr1 1076281 cuteSV.DEL.24 TGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGCTGGGAGGCTGAGGCTATGGGGACTCCGTCGGGGGAGGCTGAGTCTATGGGGACTCCGTGGGGGGAGGCTGAGGCTATGGGGACTCCGTGCCGGGAGGCTGAGGCTATGGGGACTCCGTGGGGCAGGCTGAGGCTATGGTGACTCCGTGCAGGGCTGTGAGGCTACGGGGACTCCGTGGGGGGTGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGG T 152.7 PASS 
PRECISE;SVTYPE=DEL;SVLEN=-393;END=1076674;CIPOS=-34,34;CILEN=-61,61;RE=18;RNAMES=4999d51d-7728-4398-a04f-90d5d1f14259,6300ef61-230c-4fd9-a137-56e522b02d83,bf8f78d4-dab1-4044-a6b2-f1af5d391de1,a89a1947-ec8b-46df-8319-aa19f25e73b1,54547fe9-b935-4472-a5cf-83945666a995,e463ee38-1282-45ce-8dfe-721cd844f72c,6c9cc18f-75a9-4fa7-b7c9-f8c6500e9fab,c3114829-11c5-49da-ab1a-cf227e963e2e,aaf82e32-f7f4-4122-9309-08e94f165e1b,268013e1-c660-424b-ba46-ecda97faef65,ac1de16b-6802-4fe2-9672-3e548e18b9c2,87850298-2c1d-4a7c-9d6b-f0e096cfa397,cd7bbea6-6627-4ec4-8449-58831ae76422,17bdd25a-2e86-4cc9-bd83-360694e61966,bef325ad-0c6e-49ee-8cd3-191c11197e67,c5cbc961-a06a-4355-a28b-80dad6325418,6c973e59-e33a-4619-b7d2-6334d50c629a,037ee0d1-b4ed-4e93-b41a-2e680fda7009;STRAND=+- GT:DR:DV:PL:GQ 1/1:2:18:153,32,0:31 +chr1 1078872 cuteSV.DEL.25 AGGCTATGGGGACTCCGTGGGGGGAGGCTGAGGCTATGGGGACTCCGTGCGGGGAGGCTGAGGCTATGGGGACT A 3.6 q5 PRECISE;SVTYPE=DEL;SVLEN=-73;END=1078945;CIPOS=-48,48;CILEN=-16,16;RE=6;RNAMES=bf8f78d4-dab1-4044-a6b2-f1af5d391de1,f3451d58-eea4-4590-bdf2-31223f246593,87850298-2c1d-4a7c-9d6b-f0e096cfa397,6c9cc18f-75a9-4fa7-b7c9-f8c6500e9fab,5220f2c0-179e-4605-9f57-cf23c1449820,8cf3c101-6a61-4820-b2ee-91166a6dfa97;STRAND=+- GT:DR:DV:PL:GQ 0/1:16:6:4,2,99:3 +chr1 1080864 cuteSV.INS.30 C CCCCCACTCCCGGTCCCTGTCTCCTTCCCTCCGCCCCCACCTCGGTCCCTGCTTTCCTCCGCCCCCACCTCGGTCCCCTTGTCTCCTTCCCTCCGCCCCACTCGGTCCCTGTCTCCTTCCCTCTCCGCCCCCACCTCGGTCCCTGTCTCCTTCCTCCGCCCCCACCTCATCCTGTCTCCTTCCTCTCCGCCCCCACCTCGGTCCTTTTCGTCTCTCCTTCCCTCCGCTCCCCACCTCGGTCCCTGTCTCCTTCCTCCGCCCACCTCGGTCCCTGTCTCCTTCCTCCGCCCCCACCTCGGTCCCCTGTCTCTTCCT 219.5 PASS 
PRECISE;SVTYPE=INS;SVLEN=314;END=1080864;CIPOS=-53,53;CILEN=-45,45;RE=23;RNAMES=af0b84e0-4359-450b-9d92-62eadaa7d2b8,7134ddb6-573e-4a8f-b7d5-8790178f9fa1,f3451d58-eea4-4590-bdf2-31223f246593,6afc9f30-36b8-4d0b-8ed7-92f955a1db08,6ebded49-5c28-4243-b8eb-26e45f3ee5ff,c31fb5b7-a8ef-4c62-b90e-450f936bae4a,e463ee38-1282-45ce-8dfe-721cd844f72c,6c9cc18f-75a9-4fa7-b7c9-f8c6500e9fab,5220f2c0-179e-4605-9f57-cf23c1449820,aaf82e32-f7f4-4122-9309-08e94f165e1b,a8ff5a7e-a988-4af5-9557-5e7807c00289,8012b87e-4103-49cb-979e-9e0fbcba6769,09e46a61-482f-4f8c-bd98-f51d32cb239d,cd6bc3de-11f8-4b66-b39c-86719785ed87,c3114829-11c5-49da-ab1a-cf227e963e2e,6cd7ddfb-f119-404a-ab96-926853e2c497,17bdd25a-2e86-4cc9-bd83-360694e61966,5c7b6a09-da87-4a62-a166-1af09aa3f5a4,de5e48ae-831f-4790-9744-345b96efddf4,6f3d4f11-2168-4edf-8922-eac14761fb72,8cf3c101-6a61-4820-b2ee-91166a6dfa97,e819ddd5-c668-4730-bfb4-4f082d30cab7,cd7bbea6-6627-4ec4-8449-58831ae76422 GT:DR:DV:PL:GQ 1/1:0:23:219,59,0:58 +chr1 1139113 cuteSV.DEL.26 GAAGGTGGGGGTGTCAACGTCGAACCGGGGGACCTGGGTCCTGGGGAGTTTCCTGGGGTCAGAAGGTAGGGGTGTCAATGTCGAACCGGGGGACCTGGGTCCTGGGGAGCTTCCTGGGGT G 111 PASS PRECISE;SVTYPE=DEL;SVLEN=-119;END=1139232;CIPOS=-12,12;CILEN=-1,1;RE=21;RNAMES=a4e2c6c8-24f5-415e-9d2d-6267b6650bb1,5a46ef88-dc8e-441d-b4f8-9dccbb2834c4,2f5dceab-4822-4348-955f-096a60b31b8e,ba3206c6-3e43-48e2-972c-8491a3188ea1,c4179c44-4791-4cb3-b520-e1fe81ab3748,2b838716-d558-4c11-bd39-1f8859acf4e9,454b9f78-d1c3-424e-bc78-923da02f01ae,5c697790-beff-40e7-9f60-2ea6819adc81,21b1ff19-a677-4984-ad6e-a9a1ebf7c1ed,57114dc7-8b5b-4199-9c74-878b96c79fa6,dbf0367c-5f33-47c0-9508-6d142504fd68,84e227f7-009c-4870-994c-82bf93a66718,c5642ff8-7392-45b5-98fd-0c64e782221d,80d5e617-b844-443e-b5bd-f221374e76cf,b4951356-ca81-42c9-9826-ac0e0f14cdc8,b57825a6-97f9-4fc5-bfab-a0961dffa9ad,ee0d9263-d299-4379-bf92-ccb062c13ae9,b6d1b7f1-19b8-4776-b6df-6fb949e0e0db,70c17be1-8aa7-4a06-bea1-b58fa5f6c4c5,c81c027c-07df-4de8-93ca-b163ddb87fed,cacdc6be-7a4f-49c8-ad8b-c8bbe2c80b38;STRAND=+- 
GT:DR:DV:PL:GQ 0/1:14:21:111,0,44:44 +chr1 1140208 cuteSV.DEL.27 TCAGAAGGTGGGGGTGTCAACGTCGAACCGGGGGGCCTGGGTCCTGGGGAGCTTCCTGGGG T 74.8 PASS PRECISE;SVTYPE=DEL;SVLEN=-60;END=1140268;CIPOS=-12,12;CILEN=-1,1;RE=18;RNAMES=ba3206c6-3e43-48e2-972c-8491a3188ea1,21b1ff19-a677-4984-ad6e-a9a1ebf7c1ed,c81c027c-07df-4de8-93ca-b163ddb87fed,454b9f78-d1c3-424e-bc78-923da02f01ae,5c697790-beff-40e7-9f60-2ea6819adc81,a4e2c6c8-24f5-415e-9d2d-6267b6650bb1,57114dc7-8b5b-4199-9c74-878b96c79fa6,84e227f7-009c-4870-994c-82bf93a66718,b6d1b7f1-19b8-4776-b6df-6fb949e0e0db,dbf0367c-5f33-47c0-9508-6d142504fd68,80d5e617-b844-443e-b5bd-f221374e76cf,c4179c44-4791-4cb3-b520-e1fe81ab3748,cacdc6be-7a4f-49c8-ad8b-c8bbe2c80b38,ee0d9263-d299-4379-bf92-ccb062c13ae9,5a46ef88-dc8e-441d-b4f8-9dccbb2834c4,70c17be1-8aa7-4a06-bea1-b58fa5f6c4c5,2b838716-d558-4c11-bd39-1f8859acf4e9,c5642ff8-7392-45b5-98fd-0c64e782221d;STRAND=+- GT:DR:DV:PL:GQ 0/1:20:18:75,0,94:74 +chr1 1140221 cuteSV.INS.31 G GGTCAGCATCGAACCGGGGACCTGGGTCCTGGGGAGCTTCCCTGGGGTCAGAAGGTGGGGGTGTCAACATCGAACCGGGGACCTGGGTCCTGGGGAGCTTCCTGGGGTCAGAAGGTGGG 0.4 q5 IMPRECISE;SVTYPE=INS;SVLEN=118;END=1140221;CIPOS=-90,90;CILEN=-46,46;RE=7;RNAMES=552a6e53-6d8f-4b59-9a27-92f32bc5d55a,63dff088-63aa-495d-9a83-7ead3bac9ad6,a96b4308-b4a8-4862-ab51-4d246ffa4856,5d272fd2-da4d-451d-8ca4-885acaad84e7,13395d40-9538-461e-ac5c-ea462c9e55fe,26953a70-aec9-4b3e-8c56-f0de5a9c3101,aa7d5ab5-720b-4edd-8c13-1f643b0e0c87 GT:DR:DV:PL:GQ 0/0:23:7:0,10,153:10 +chr1 1140336 cuteSV.INS.32 G 
GGGGGACCCGTCCTGGGAGCTTCCTGGGGTCAGAAGGTGGGGGTGTCAACATCGAACCGGGGGACCTGGGTCCTGGGGAGCTTCCTGGGGTCAGAAGGTGGGGTGTCAGCATTCGAACCGGGGACCTGGGTCCTGGGGAGCCTGGGGTCAGAAGCCGTAGGGGTGTCAGCATCGAACCGGGGGACCTGGGTCATGGGAGCTTCCTGGGGTCAGAAGTGGGGTGTCAACGTCGAACCGGGGGCCTGGGTCCTGGGGCCTTCCTGGGGTCAGAAGGTAGGGGTGCCGAATCCGGGGACCTGGGTCCTGGGAGCTCTGGGGTCAGAAGGTGGGGTGTCAACGTCGAACCGGGGGACCTGGGTCCTGGGGAGCTTCCTGGGGTCAGAAGGTGGGGGGGTCAACGTCGAACCGGGGGACCTGGGTCCTGGGGAGCTCCTGGGTTCAGAAGGTGGGGGTGTCAGCACGAACCGGGGGACCTGGGTCCTGGGAGCTTCCCTGGGGTCAGAAGGTGGGGGTGTCGTTATCGAACCGGGGGACCTGTCCTGGGAGCTTCCTGGGGTCAGAAGGTGGGGGTGTCAACATCGAACCG 0.5 q5 IMPRECISE;SVTYPE=INS;SVLEN=585;END=1140336;CIPOS=-110,110;CILEN=-40,40;RE=6;RNAMES=68113a8c-4d8d-4600-ac9f-1da02a5a8fe9,7594e3bc-88ee-4af4-816b-a1f7cf6c6d8a,3a3f9b2d-0ad2-4c86-bb32-06b8dadb0fa3,d23f3ab5-8a68-4d47-80bf-2ab949052219,3e352bf2-2d6d-4536-b218-cab154bdfb7b,a83cb84f-f8de-497f-bc4b-243637a2f5e5 GT:DR:DV:PL:GQ 0/0:20:6:1,10,134:9 +chr1 1141403 cuteSV.INS.33 C CCCCATCCCCGCCCCGTCCACAACCCCATCCTTACCTCTATCCCCACCCACATCCTTA 108.5 PASS PRECISE;SVTYPE=INS;SVLEN=57;END=1141403;CIPOS=-13,13;CILEN=-2,2;RE=21;RNAMES=70c17be1-8aa7-4a06-bea1-b58fa5f6c4c5,21b1ff19-a677-4984-ad6e-a9a1ebf7c1ed,57114dc7-8b5b-4199-9c74-878b96c79fa6,c81c027c-07df-4de8-93ca-b163ddb87fed,5c697790-beff-40e7-9f60-2ea6819adc81,ee0d9263-d299-4379-bf92-ccb062c13ae9,454b9f78-d1c3-424e-bc78-923da02f01ae,2b838716-d558-4c11-bd39-1f8859acf4e9,ba3206c6-3e43-48e2-972c-8491a3188ea1,c5642ff8-7392-45b5-98fd-0c64e782221d,b731b06b-1457-48a3-91fe-a228dcc63878,5a46ef88-dc8e-441d-b4f8-9dccbb2834c4,c4179c44-4791-4cb3-b520-e1fe81ab3748,b4951356-ca81-42c9-9826-ac0e0f14cdc8,a4e2c6c8-24f5-415e-9d2d-6267b6650bb1,84e227f7-009c-4870-994c-82bf93a66718,17ced63c-a1cd-4f0a-8bec-4aa7d1c613cb,80d5e617-b844-443e-b5bd-f221374e76cf,b6d1b7f1-19b8-4776-b6df-6fb949e0e0db,cacdc6be-7a4f-49c8-ad8b-c8bbe2c80b38,0ed75539-2679-496f-aeed-96a6daada7ed GT:DR:DV:PL:GQ 0/1:15:21:108,0,51:51 +chr1 1168040 cuteSV.DEL.28 GGGTGAGGGCGGAGGGCCGAGCGGGGCCAGCAGA G 56.3 PASS 
PRECISE;SVTYPE=DEL;SVLEN=-33;END=1168073;CIPOS=-12,12;CILEN=0,0;RE=15;RNAMES=0ceaad89-27a9-4a33-a481-fe5621a4bfad,3c221b00-3ecd-4eea-86ec-7a5e0d35424a,5c697790-beff-40e7-9f60-2ea6819adc81,3f0bcb38-bcc3-4ae4-a09b-3b9ef5ae387d,7e11412a-2876-4661-a708-479a9e787bc6,53f4dbfa-7c0b-4162-8d31-76f1e8bf91cc,f874e077-0684-412f-a39c-cc807bdbd0aa,88d73175-6061-400b-b9d4-94b05d743d8d,b30d1d9d-050a-4860-8d21-fbb3c551a66f,489c9583-a48b-417e-bbc8-63a31dfa62c5,34f64651-ce28-4440-8ac3-fa57ddd39181,360ce83e-dcd5-42da-9e0f-12d9a805fef8,d3da996c-3053-4a28-abc1-00ecfe8c81ee,6f4e3272-6e0f-4161-a5d1-91687eaaf225,7d7d2399-1fa6-4b55-9224-91734b7627e4;STRAND=+- GT:DR:DV:PL:GQ 0/1:19:15:56,0,95:56 +chr1 1202080 cuteSV.INS.34 C CCCAGTACCCCAGCCCGAGCCCAGTACCCAGCCTCCAGCCCAGTACCCAGCGACCCAGTA 76.1 PASS PRECISE;SVTYPE=INS;SVLEN=59;END=1202080;CIPOS=-87,87;CILEN=-9,9;RE=16;RNAMES=ced88eab-a646-4014-8aeb-c06ca21b5705,5add0203-d6e5-47a6-b90e-54d02d78d164,a4f3c7c6-0a66-4857-9665-11ae06c59a93,59590150-a13f-454c-8f37-279758d415fa,e4c629fb-92bb-41f6-a405-35c447ac2edb,1f107017-da31-4485-9ad0-f3131da1a2de,3acb4a98-1d7c-4ba0-a464-d7c716eff5aa,50f71109-b11a-4485-8ee5-0baff6cbb7cb,1371d4e0-de73-428d-a0b1-980f0172eed1,b0062959-3629-493f-926a-3c85b33f8ca7,408cfac5-0094-4446-a5d6-3dcb62642b1c,36bb353d-1fdc-40e9-bfcc-a7910a981285,d0649435-0e93-4a6f-a275-a83678f6b1c7,9a3e6750-b61c-4e3b-862d-d81b66d85836,d792ab01-3364-4f58-afdf-1f38aaac168b,5e1e811e-35f6-4628-b23a-480dbb46fdfc GT:DR:DV:PL:GQ 0/1:14:16:76,0,57:56 +chr1 1202304 cuteSV.INS.35 C CGAGCTCCGGTACCCAGCCCCGAGCTTCCAGTGCTCCAGCTCCCGAGCCCAGTACCCAGTCTCCAGCCCAGTACCCAGCCCCGAGCCCAGTACCCAGCCTCCAGCCCAGTACCCATCCGAGCCCTGTACCCAGCCCCGAGTGCAGTACCCAGCCT 8.1 PASS 
PRECISE;SVTYPE=INS;SVLEN=154;END=1202304;CIPOS=-115,115;CILEN=-12,12;RE=8;RNAMES=ccf6e32b-ba2c-4dec-a585-8b1d12802c3e,e44d31f6-e6b9-4427-87b5-73fe396c6bf8,46e7c4a0-de0a-4bb5-a9c7-3205b7ea5b62,a952409f-0adf-4d89-9164-1410df306be7,8c4bb810-3b30-46a1-a980-29dd77349014,72815391-ed34-4f91-83da-5c39287f281b,bd0adee6-8e6e-4457-babe-4698cd7b3958,8afecac0-f47b-4de8-97c7-d4460518f153 GT:DR:DV:PL:GQ 0/1:19:8:8,1,113:8 +chr1 1202446 cuteSV.INS.36 C CTGAGCCCAGTACCCAGCCCCCAGCCCAGTACACTCCAGCCCCGAGCCCAGTACCCAGCCTCCAGCCCAGTACCCAGCCCCGAGCCCAGTACCCAGCCTCCAGCCCAGTACCCAGCCCCGAGCCCAGTACCCAGCCCCGAGCCCAGTACCCAGCCCCCGAGCCCAGTACCCAGTCTCCAGCCCAGTACTCCAGCTCCCGAGCCCAGTACCCAGCCTCCAGCCCAGTACCCATCCCCCGAGCCCAGTACCCAGCCCCGAGCGCAGTACCCAGC 0.1 q5 IMPRECISE;SVTYPE=INS;SVLEN=271;END=1202446;CIPOS=-111,111;CILEN=-18,18;RE=5;RNAMES=b3beae97-47d9-48d6-a6b3-977c0a15b299,9c07ac5e-c8b6-46cc-9b83-81cd59589aca,2a300411-fa05-4afd-831a-3549a73f954f,ffd5cb31-03a8-46e9-bde4-29e85a43b70f,064ee272-8481-41a8-8b39-13d1ff904462 GT:DR:DV:PL:GQ 0/0:20:5:0,16,143:16 +chr1 1212611 cuteSV.INS.37 C CCCTCTGCCCCCCTCAAGCCCCTCCCAGCCTAGC 15.2 PASS PRECISE;SVTYPE=INS;SVLEN=33;END=1212611;CIPOS=-6,6;CILEN=-1,1;RE=8;RNAMES=a61f1fbe-b0a4-456d-8358-7d54b86c228d,1ef7159e-e106-46c9-81a1-f37295f1b6ee,cc29de15-7bd7-46f5-8fd5-69288395f465,064ee272-8481-41a8-8b39-13d1ff904462,8c4bb810-3b30-46a1-a980-29dd77349014,4b846ddd-587b-41f6-b62c-8c9450e1d5ca,7600aefb-e592-4790-a089-3937ec206e38,0db3db16-3b86-4131-8c6d-3fdb246e89ab GT:DR:DV:PL:GQ 0/1:16:8:15,0,92:15 +chr1 1226332 cuteSV.DEL.29 TCAACCCTGTACGGTCAGGAGGAAACATGGCACCTCCCCTCTGGGGGCTCTTTCCAGAAACCC T 51.2 PASS 
PRECISE;SVTYPE=DEL;SVLEN=-62;END=1226394;CIPOS=-4,4;CILEN=0,0;RE=15;RNAMES=00730119-6bf3-4259-b731-b4788ef5b314,96717086-fe48-49ab-a513-1910742d076a,d0e8fc4c-db47-4700-b095-c457edcbfdb7,a61f1fbe-b0a4-456d-8358-7d54b86c228d,26456d45-de06-4be1-9295-a4838e2521a6,4005744b-cbe1-4af1-b72b-4c92e4c46f69,9b47cf78-0455-4c56-a614-c2a96cabc3f4,cc29de15-7bd7-46f5-8fd5-69288395f465,9f25ff8c-0f0c-4102-9755-3eaa49ad6cc8,cfe81581-5685-41b4-b2e3-6cc759dde6b6,54056604-4d7e-472b-a0d1-e0b629bfb695,e4f8a90d-15f2-4e84-a954-296c3499000c,37fc26b7-bc9c-44c4-b02f-2bba900b5be9,dceef018-aba6-4d50-b9a3-784e280b362e,b2704107-e0ea-41b2-9395-da38d79b046c;STRAND=+- GT:DR:DV:PL:GQ 0/1:21:15:51,0,108:51 +chr1 1227298 cuteSV.DEL.30 GAAGGCGAGCTCGTGGCCAGGCCCTGCGGGAAGGCGAGCTCGTGGCCAGGCCCGGCGGGAAGGCGAGCTCGTGGCCAGGCCCGGCGGGAAGGCGAGCTCGTGGCCAGGCCCGGCGGGAAGGCGAGCTCGTGGCCAGGCCCTGCGGGAAGGCGAGCTCGTGGCCAGGCCCTGCGG G 58.2 PASS PRECISE;SVTYPE=DEL;SVLEN=-173;END=1227471;CIPOS=-4,4;CILEN=-1,1;RE=16;RNAMES=d0e8fc4c-db47-4700-b095-c457edcbfdb7,00730119-6bf3-4259-b731-b4788ef5b314,96717086-fe48-49ab-a513-1910742d076a,9f25ff8c-0f0c-4102-9755-3eaa49ad6cc8,54056604-4d7e-472b-a0d1-e0b629bfb695,dceef018-aba6-4d50-b9a3-784e280b362e,cfe81581-5685-41b4-b2e3-6cc759dde6b6,26456d45-de06-4be1-9295-a4838e2521a6,39428341-3093-4633-b634-f5ce3af9d111,4005744b-cbe1-4af1-b72b-4c92e4c46f69,b2704107-e0ea-41b2-9395-da38d79b046c,cc29de15-7bd7-46f5-8fd5-69288395f465,384fd9e4-08ed-465a-8ab2-9bb997cc6cf2,e4f8a90d-15f2-4e84-a954-296c3499000c,a61f1fbe-b0a4-456d-8358-7d54b86c228d,9b47cf78-0455-4c56-a614-c2a96cabc3f4;STRAND=+- GT:DR:DV:PL:GQ 0/1:21:16:58,0,106:58 +chr1 1240674 cuteSV.INS.38 C CCCCATTCCACCCCGACAGGTCCCTGCCCCAGCCCCGCCGCCC 362.6 PASS 
PRECISE;SVTYPE=INS;SVLEN=42;END=1240674;CIPOS=-6,6;CILEN=-1,1;RE=39;RNAMES=e27f81b3-f37c-46a9-927d-1704051865f4,e95f67b5-2b6c-4d2b-8fdf-bb3bcf80380f,21a10068-b408-49c2-8591-bc56c13f8bc8,d8f83f59-c3ca-442b-8234-c87f5fd5b9d8,e8912883-f9ee-481e-9033-7ac822537778,115402c3-2363-4260-8185-088cb2efe813,d0e8fc4c-db47-4700-b095-c457edcbfdb7,6c6f5904-5902-49e4-aecc-9f7107aaaf32,00c871e6-9240-44ea-afd7-76260fb1ff8f,3ab10396-251a-4ab9-83e1-5fed03fd0fb4,48364c74-7e0e-449b-92e1-89c06dba0835,24fe21a5-7ddc-40db-8d2b-3536b5a4ea72,47d0af8c-17a1-436f-8839-ade7ab6de78a,cc60c1e0-54ff-4699-a10a-f7783be37055,9b47cf78-0455-4c56-a614-c2a96cabc3f4,3eba1c5e-e793-46d3-aacb-82377666e35e,08fd6d74-57bf-4e04-bd81-f2de084960a4,f09a7634-58be-4251-98d0-2d4f88afd783,421d61bb-197c-46a0-abc1-0ffc0e08958b,7f548ef8-bf8f-4a61-b9ce-43038c6ed4f1,00730119-6bf3-4259-b731-b4788ef5b314,cc29de15-7bd7-46f5-8fd5-69288395f465,95c3ccf4-0adf-43e3-9054-45c562be67f2,1fcf688b-b67c-4193-93d9-73465de7ffc7,5f57573e-7b14-4b1a-899c-87843755ef38,0de9cf66-122b-45e4-81a2-0e5f79607368,72780d85-8ccf-4898-b2b2-11cf986d0491,0ddf623b-87fe-4f8e-a9d6-bdb595c3afc4,8e84b06f-693a-4208-add6-ca0bebaf3b40,a32985bb-648d-4ee1-9572-7d4e2160a1f1,d22b9311-0646-440d-b6e4-59b9d2385746,8f02f628-554d-47ef-8855-2d0e61e87721,6aa87ec7-edab-4aed-8c84-d1ce656ee421,440b5c72-9c1a-42d2-93b7-ed47bf9c4ceb,e9628e82-8506-4ac7-b680-5640081d3a40,ba940b8e-7984-468d-9a9c-d9543dfd4501,21b2165e-219c-44c9-b583-eaaa6f9cfa4d,1cac9230-18ec-4c42-8b75-5b8edc0a46e0,f80b54d1-313c-4b3c-821e-2486f5ac1987 GT:DR:DV:PL:GQ 1/1:1:39:363,93,0:92 +chr1 1245147 cuteSV.INS.39 T 
TCCACCTTCATCCCATTCTTCCCCCACTATCTCCCTCCTCCCCCACCTCTGCCCTCCTCCCTTCCCCCTCCCTCTGCTCCTCTCCTTCCCCCTTCTCCCCCGACCCTTCCCCACTCATCTCCCTCCTCCCACCTCCTCTCCTCCCTCCCTCTCCCCCACTCCTCCCCCTCCTCCCCCACTCTCCCCCACTGCTCTCCCTCTCCCCCACTCCCCTCCCCCACTCCTCTCCCCCTCCTTCTCCACTCTCCTCCCCCTCCCACCCCTCCCCCACTCCCCAACCCTCCTCCCCCTCTTCCCTCCTCCCCACTCCTCATCTCCCTCCTCCCCACTCCTCCCTCTCCCTCTTCCCCTGCCTCACTCCTCCCCCACCTTCCCCTCTTGCTCCTCTCTTCCCCACTCCCTCCCCCACCCTCTCCCTCCTCCCCGCTCCCCTCTCC 219.5 PASS PRECISE;SVTYPE=INS;SVLEN=436;END=1245147;CIPOS=-12,12;CILEN=-8,8;RE=24;RNAMES=8f02f628-554d-47ef-8855-2d0e61e87721,a7eccd01-b383-4757-9497-19a633fc484a,fba85188-6836-422d-b3dd-be62d3ee3746,21b2165e-219c-44c9-b583-eaaa6f9cfa4d,d22b9311-0646-440d-b6e4-59b9d2385746,701b1d5e-6f8b-4582-83b3-bebdcd378d01,7f548ef8-bf8f-4a61-b9ce-43038c6ed4f1,1fcf688b-b67c-4193-93d9-73465de7ffc7,e9628e82-8506-4ac7-b680-5640081d3a40,6aa87ec7-edab-4aed-8c84-d1ce656ee421,cc29de15-7bd7-46f5-8fd5-69288395f465,00730119-6bf3-4259-b731-b4788ef5b314,3ab10396-251a-4ab9-83e1-5fed03fd0fb4,e7e1f47d-0cd9-4d0b-9259-5cb8d10e0754,aecdc501-f8cf-4c74-be34-7fbd6c71d3a8,5f57573e-7b14-4b1a-899c-87843755ef38,440b5c72-9c1a-42d2-93b7-ed47bf9c4ceb,d171ef72-b23e-432d-a798-df3db714a490,9b47cf78-0455-4c56-a614-c2a96cabc3f4,8e84b06f-693a-4208-add6-ca0bebaf3b40,3feb7f84-5f44-4b84-9010-37250a11d37e,115402c3-2363-4260-8185-088cb2efe813,6c6f5904-5902-49e4-aecc-9f7107aaaf32,95c3ccf4-0adf-43e3-9054-45c562be67f2 GT:DR:DV:PL:GQ 1/1:1:24:219,54,0:54 +chr1 1248059 cuteSV.DEL.31 CTGGATCTCCAACTCTGACCTACAGGCAGGAAAGTGGGCAGCCCTGGGAGGCTGGACTGAGGGAGGCTGGACTTCCCACTCAGGCCTACACGCAGGAAAATGGGCAGCCCTGGGAGGCTGGACCGAGGGAGGCTGGGCCTCCCACTCCACCCTACAGGCCAGGACACGGGCAGCCCTGGGAGGCTAGACCGAGGGAGGCTGGGCCTCCCATCTACCCTACAGGCCGGGACACAGGCAGCCCTGGGAGGCTGTACCGAGGGAGGC C 3.2 q5 
PRECISE;SVTYPE=DEL;SVLEN=-263;END=1248322;CIPOS=-2,2;CILEN=-1,1;RE=7;RNAMES=cc29de15-7bd7-46f5-8fd5-69288395f465,a7eccd01-b383-4757-9497-19a633fc484a,bf5b0c80-4f39-4320-b98a-1e7ceb6af60c,701b1d5e-6f8b-4582-83b3-bebdcd378d01,95c3ccf4-0adf-43e3-9054-45c562be67f2,e9628e82-8506-4ac7-b680-5640081d3a40,9b47cf78-0455-4c56-a614-c2a96cabc3f4;STRAND=+- GT:DR:DV:PL:GQ 0/1:19:7:3,3,118:3 +chr1 1248987 cuteSV.DEL.32 CCTCCCACTCCGCCCTACAGGCCGGGACACGGGCAGCCCTGGGAGGCTGGAGCGAGGGAGGCTGGGCCTCCCACTCCGCCCTACAGGCCGGGACACGGGCAGCCCTGGGAGGCTGGACCGAGGGAGGCTGGGCCTCCCACTCCGCCCTACAGGCCGGGACACGGGCAGCCCTGGGAGGCTGGACCGAGGGAGGCTGGGCCTCCCACTCCGCCCTACAGGCCGGGACACGGGCAGCCCTGGGAGGCTGGACCGAGGGAGGCTGGGCCTCCCACTCCGCCCTACAGGCCGGGACACGGGCAGCCCTGGGAGGCTGGACCGAGGGAGGCT C 0.1 q5 IMPRECISE;SVTYPE=DEL;SVLEN=-326;END=1249313;CIPOS=-54,54;CILEN=-1,1;RE=5;RNAMES=cc29de15-7bd7-46f5-8fd5-69288395f465,701b1d5e-6f8b-4582-83b3-bebdcd378d01,9b47cf78-0455-4c56-a614-c2a96cabc3f4,e9628e82-8506-4ac7-b680-5640081d3a40,a7eccd01-b383-4757-9497-19a633fc484a;STRAND=+- GT:DR:DV:PL:GQ 0/0:20:5:0,16,143:16 +chr1 1249297 cuteSV.INS.40 C CCACAGGCCTCCCACACTCCGCCCTACAGGCCGGGACACGGGCAGCCCTGGGAGGCTGGACCGAGGGAGGCTGGGCCTCCCACTCCGCCCCACAGGCCGGGACACGGGGGCAGCCCTGGGAAGTTCCGAGGGAGGTCTGGGCCTCCCACTCCGCCTACAGGCCGGGACACGGGCAGCCCTGGGAGGCTGGACCGGACCGTAGACTCCACT 109.7 PASS 
PRECISE;SVTYPE=INS;SVLEN=209;END=1249297;CIPOS=-67,67;CILEN=-28,28;RE=18;RNAMES=ba65d04d-c548-41af-a1ae-2f92555cdec9,fba85188-6836-422d-b3dd-be62d3ee3746,21b2165e-219c-44c9-b583-eaaa6f9cfa4d,93793f8f-88f0-4d1c-bbc7-f4ff305b38db,9889e0f9-b111-466b-b4a7-cc49d18e799d,1fcf688b-b67c-4193-93d9-73465de7ffc7,202ac4cc-be0f-46d4-b6a1-6b44d0f40ca9,50a99aef-78bd-4c3d-a301-ce79ad502d13,e75b832f-5543-4dfb-93fa-4c7f935c6920,36a68d4a-51c0-45f9-bbb7-21e8d6595a74,5fcf5f40-970f-456e-a6c8-6d2e644cc05d,abc80d9a-83a8-491f-bda0-ef94c8b186fc,e7e1f47d-0cd9-4d0b-9259-5cb8d10e0754,8f02f628-554d-47ef-8855-2d0e61e87721,7f548ef8-bf8f-4a61-b9ce-43038c6ed4f1,645bdabf-aba6-4f5c-9a5a-aac5300ec7f2,8108df60-0733-4ded-ab77-efdf8bbf4d25,d171ef72-b23e-432d-a798-df3db714a490 GT:DR:DV:PL:GQ 0/1:7:18:110,2,5:4 +chr1 1284150 cuteSV.INS.41 G GTGGGGTGAGGGGTGGTTGGGGCTGGGAGCGAGGGTGGGGCTAAGGCTGGGTGAGGGGGTGGGGTGGGGGTTGGGTGAGGGGGTGGGGTGGGGGGTTGGAGTGAGGGGGTGTTGGTGAGGGGTTGGGGTTGGGTGAGGGGGTGGGGCTAGCAGGGAGGGGGTGGGTGGGGGTTGAGTGAGGGGGTGGGGGTTGGTGAGGGTGGGGTTGGGTGAGGGGGTGGGTGGTGGCTGAGTGAGGGTGGGGGGAAGAAGAGGGGTGGGGGTTGGGTGAGGGGGTGGGGGTGGGTTGTGGGGTGGGAGGGGGGGTGGGGGCTGGGTGAGGGGTGGGCTCGGGGGGGTTGAGTGAGGGTGAGGGGTGGGGTGGGGGTTGAGGAGGGGTTGGGGTGTTGG 334 PASS 
PRECISE;SVTYPE=INS;SVLEN=389;END=1284150;CIPOS=-11,11;CILEN=-42,42;RE=35;RNAMES=b0abcfbb-61fd-4728-81a1-43fa323e396b,f28c127d-baf6-42e2-8aca-5e4956bf6f3b,d9c4eb81-e133-4197-8e8e-e206e57bd810,fdf0a32a-0337-491d-a75e-34098f30517a,d1cbe0e6-eca8-4c73-a51f-46ab546d1e8e,f173546d-38f4-44f6-97b6-91d0e45a2d14,95404675-b417-4a52-a407-b2ce2852ad4b,33f38ec0-cd95-4f95-832e-80af657c2ecf,b9c5bedc-f363-4539-bc47-7b0f412a4bbd,e694bcb5-46e8-43d0-b0d6-3fd9647ffe7e,45b3e9c7-9541-4afe-b291-ae5bc5e5bfb1,3133cf21-e3fe-42a8-a1a9-abfe57860c7e,34b9f7e2-6ab9-4c30-8f04-825c28b41bc3,57b4f996-d753-4cff-a038-96e61ad4c4a1,da1bdeb0-11ca-41f7-9977-64a5f9b6f43d,fe9586e4-a814-4585-b5e9-0a70f38dd05e,48f59671-4881-42ed-a9a3-aed4c41a8a27,bd8c4b8e-a603-47e0-ae1a-bd0c4f7c768c,1e8fccd3-10da-4fbb-abd3-696fc95a7449,4e830313-41e9-4df0-aa80-fd044e1941f6,cba077c7-f898-4622-9da4-5b16316d0245,0338319c-9648-432b-a5e8-d100191dbd20,7453e3f8-6ef3-4524-bec9-f9eac86f3421,06524921-db14-4b52-9dce-ed08e1523255,1660464c-9193-48b8-9078-8c477520a960,174f1aff-b0b3-407d-939e-eba9860b4ba1,45316b56-a0a4-4f50-bcbf-99c85507665f,91688ed7-ae4f-48d4-9550-45a4903f9d70,b8227d89-a8f7-4fb5-a3e2-42174694c6b6,ba8acd78-c80f-49ff-93fc-1a99ffa7e077,c1019b1c-b966-4bb8-b129-a172a18b289f,87a3d636-a864-4fe2-b799-df74b0cbef67,ddc30ebd-ceb6-4337-94f9-058444486c91,723260c2-5676-4623-acd8-87e2d5ff2445,b437894e-54e9-47cd-ab32-9813a0cc594f GT:DR:DV:PL:GQ 1/1:0:35:334,89,0:89 +chr1 1289207 cuteSV.INS.42 C CTGTTCGTCCCCAGAGTCTCGGTTCCGTCCTGGTGTCTCTGTTCATTCCCCGTGTTCTGTCTGTCCCGTGTTCTGTTCGTCTCCAGAAACCCTGTTCATTCCCCGTGGTTCTCTGTCTGGCCCCGTGTCTCTGTTCGTCTCGTTGTCCCTGCTCCGCCCCGTGTCTCTGTTCCCAGCCCCCGTGGTGCCCTTGTCTCTGTTCGGGTCCTGTCCCTGTTCGTCTCCTCTCGAAGTCTCTGCTCCGCGTCCTGTGTCTCTGCCTGTCCCTAGGCTCCTGTTCGTCCCGTGTCTCTGCCCCTGCTGTCTCGTGGTTCTTGCCCCGTCCTTGTTCTGTCTACCCCTCTTGTCTGTTCCTGGCCCTGTCTCCTGTTCAGTCTTGGCAGGTCCCTGTTCGGTCTCTGTGTTCCTGTTCGTCTATCCCCTG 229 PASS 
PRECISE;SVTYPE=INS;SVLEN=423;END=1289207;CIPOS=-234,234;CILEN=-85,85;RE=26;RNAMES=3e1eb897-72e7-4ae0-a0d2-2627466a830c,2aa3c5b0-51b8-4dde-b323-6756439be524,5330950b-0332-411a-9811-65f6cd72fbd7,57d01459-e734-4df0-b3f6-a19c8816ad89,1660464c-9193-48b8-9078-8c477520a960,fb75b63c-b8bc-4e80-9b67-bc3421d23077,fdf0a32a-0337-491d-a75e-34098f30517a,4e830313-41e9-4df0-aa80-fd044e1941f6,7daa0ceb-1da3-4a2f-9d3e-292ad11d4e0d,91688ed7-ae4f-48d4-9550-45a4903f9d70,320fb35f-589e-4586-a6f4-658a187b6466,1e8fccd3-10da-4fbb-abd3-696fc95a7449,89301ea0-b509-45e9-a23d-9d482685b2d9,7453e3f8-6ef3-4524-bec9-f9eac86f3421,cba077c7-f898-4622-9da4-5b16316d0245,b8227d89-a8f7-4fb5-a3e2-42174694c6b6,45242ace-d289-4d59-b664-05f2cc0fe66e,48f59671-4881-42ed-a9a3-aed4c41a8a27,4240aded-2e15-41cf-8e86-5b4eb4c25db8,e34f1c89-c242-4a46-b19b-50deaba9e44f,7559d883-c5e9-4b07-86c4-daa5b48a64f0,ba8acd78-c80f-49ff-93fc-1a99ffa7e077,b9c5bedc-f363-4539-bc47-7b0f412a4bbd,ddc30ebd-ceb6-4337-94f9-058444486c91,b437894e-54e9-47cd-ab32-9813a0cc594f,da1bdeb0-11ca-41f7-9977-64a5f9b6f43d GT:DR:DV:PL:GQ 1/1:2:26:229,52,0:52 +chr1 1324165 cuteSV.INS.43 G GGCTGAGGGGCTGGGGGGCTGGGAGGCTGAGAGGCTGGGGAGCTGGGAGCTGGGGGGCTGG 324.4 PASS 
PRECISE;SVTYPE=INS;SVLEN=60;END=1324165;CIPOS=-10,10;CILEN=-3,3;RE=35;RNAMES=b607649d-c7ea-4932-9d33-12dcae50810e,e0e831cb-c095-4d3a-9a06-b2dd44d616ea,f5cceb4b-93c9-4b51-8096-7e0cd64ae466,b2acb702-b045-418b-82e9-84fb23740ee1,5add4e04-e468-4944-9803-ca71ad5e067c,e5961070-c2ef-42e2-bba4-5a4b29a04fad,b9ba05fc-bdd4-4b2f-92fb-90aefa2c680c,d5be04cd-8350-47d4-9024-0bbf5ae19de9,69ab5885-79b5-4bfc-9a75-be19a85673d9,8769fc7d-1fd4-4f70-8eb0-8d9a33b81acb,516db773-824b-4b59-8e23-aa5f399b028e,33722165-439f-455f-b763-83b7ef59275b,92f48072-887e-4dcf-9305-f8cea44945b2,419aed38-9d49-4493-859c-0983e2f06856,f6392148-86ea-4ac4-aa20-64bc81271553,40f48fb3-c138-4c40-935c-59936303dbab,c4febaca-032a-45e4-9d7b-ddffa3049f24,7dec19f4-8014-4062-8905-a0391b78a028,9cc9d9bd-3d4e-4c32-a10d-96dc5fb4374a,abdbcf55-98e2-4ad0-a689-b98c0af04706,9af4109b-0bbb-49ea-af89-7b40b62ee955,e8736cdd-fdb3-49c7-ad15-153cf640722a,09708d95-dd04-43d5-b47a-5e6601726993,7b5578be-5d71-42df-b818-22f669246aaa,d48b9361-395a-4463-819b-deb1427cd07d,57d01459-e734-4df0-b3f6-a19c8816ad89,ed17aad3-7a33-4220-b67b-4bf5a6e44ddb,08e6fb86-3192-40c2-b7fa-d2ccd498e453,091c6d44-03dc-4651-8218-818c1d23a93d,85cef6cd-d396-416f-9e45-d02e88061185,aa90c795-27e9-4b82-bb90-5963d30a64fa,1fda58af-a114-4202-8c53-90149c2d2ed3,d1068b23-9f55-40f9-8b26-0027b0d88fe3,dbc135c4-8e41-41ea-9db0-a1299ef02dbe,218d7b4b-3b50-409f-8fe2-a82e03ddc201 GT:DR:DV:PL:GQ 1/1:1:35:324,82,0:82 +chr1 1339914 cuteSV.INS.44 A AGTCCCCAGCAGCCCCACAGACCCACCGCAGCCGCATG 219.5 PASS 
PRECISE;SVTYPE=INS;SVLEN=37;END=1339914;CIPOS=-10,10;CILEN=-1,1;RE=24;RNAMES=e0e831cb-c095-4d3a-9a06-b2dd44d616ea,15dc9de7-45aa-4fe9-8890-d2cca94af899,d61fe4e6-eeee-4a3c-a2a6-8318ada41fd1,091c6d44-03dc-4651-8218-818c1d23a93d,eb6dedc7-def4-412d-8942-3e67e3a089af,0601fb29-eaa7-4c91-b112-db4c34a23f91,b7aea0b0-9f3f-4bda-ac4f-45ddfa7e3eff,3ba7d7a0-df2a-4871-b569-45db3fd2521b,b2acb702-b045-418b-82e9-84fb23740ee1,08fbcfb4-62e2-40d6-b323-b8fea89ab44a,376261d2-c01e-43b3-9924-bbbe58c5aa63,b9ba05fc-bdd4-4b2f-92fb-90aefa2c680c,e4ba9d23-c3be-4d82-88fc-8f59596bf8e7,fb9c1378-c2b8-4d1e-be76-8ff0321d4554,bc09aba3-dd95-4a94-99c9-5ad7c6912ba9,9a1b7bc6-5e6e-4f8e-8083-2604ef3b1d1f,7d1bde11-580f-44a6-9ce0-279b9d606688,db20f2bc-8216-4d64-a548-cb476cdbaf82,3422bb03-1d59-4f56-bcf8-52524a1a3f4b,a7b6b556-1844-4003-ac40-86a454b26de7,ac227aea-fadc-407b-aa52-ff70fbf3c32e,237b09c3-d628-4f57-98ce-8bd2378ba2ad,d38f14a1-a826-46af-ba7e-d9954408e6f3,681524fc-d8d4-4fae-b24b-fa515d8ce749 GT:DR:DV:PL:GQ 1/1:1:24:219,54,0:54 +chr1 1350113 cuteSV.DEL.33 
GGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCTGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCTGGAGCGACGGGGGGAGTGAGGA G 248.1 PASS 
PRECISE;SVTYPE=DEL;SVLEN=-1076;END=1351189;CIPOS=-29,29;CILEN=-1,1;RE=28;RNAMES=c759666a-78d1-4e4e-85e9-49f14c92cb02,b2c9e453-3cae-45b5-823f-a06accb28bda,d600c2be-262b-467c-b9fd-72d4d65e3e7a,bf77a3ce-4a60-47a5-9bb5-2c13067c86e0,befb0e46-69bf-44a0-9a06-1316fa491269,237b09c3-d628-4f57-98ce-8bd2378ba2ad,0950a236-9718-441c-9710-ab2aba6ee96f,e4ba9d23-c3be-4d82-88fc-8f59596bf8e7,ba555f0d-01d6-4b77-bb53-fb5602de2be3,e7a09d9c-3f36-45eb-b50b-f42d8b8a35e9,be8eadd1-4343-42de-9b4e-d998c4b14ca1,b3af5871-92fa-4fd6-849c-bf7a8ac51a0e,e28879d9-907b-47f0-82db-ebb2f3f31103,bc443e1f-204d-461b-8e69-e0b21369bd10,fdad49e6-a6ad-4be1-9f95-1a3dc66aad68,bcd87627-7572-45e7-bf8f-9261520b21b6,d9caad9d-b11e-4c3b-8c1b-44bf7b369001,fa1decc9-64be-4c96-9390-bb11efe8f144,84021b5e-e408-44fd-83ab-eed7da232e97,fc27f63f-4a8f-4cab-89d7-f5156f1944b7,55da8ccf-279d-4963-b3e3-8bbe90b36620,8a106af9-82b5-46d8-b907-35be6dedfcda,2bdca843-6153-45b0-8766-782ee3635507,ed58bbcf-2808-4104-b38f-0f6a6f8cb40c,b7aea0b0-9f3f-4bda-ac4f-45ddfa7e3eff,93745b8a-6e5d-4f25-ae47-f8c35baaafd4,9a1b7bc6-5e6e-4f8e-8083-2604ef3b1d1f,d61fe4e6-eeee-4a3c-a2a6-8318ada41fd1;STRAND=+- GT:DR:DV:PL:GQ 1/1:2:28:248,57,0:57 +chr1 1366923 cuteSV.DEL.34 GAGTTGGTGTGAATTGAATTGTGTGAATGAGTGGATTGGTGAGTGAATTGGTGAGTTGAATTGGTGTGTGTAGTGGATGAGTGTGGATGAATGTGAATTGGCGAGTATGGATGTGTGAATTGGTGAGTGTGAATGTG G 16.4 PASS PRECISE;SVTYPE=DEL;SVLEN=-136;END=1367059;CIPOS=-8,8;CILEN=-16,16;RE=10;RNAMES=aa88905e-5130-4723-9afd-4f5c3d70dd16,53ccdbe6-ac6d-4e16-a1e8-b277be973a9e,67772cfc-e5e8-4d09-9bfa-0819a87da557,cb74559a-face-4bc5-a20f-85224a2b7cff,79c6d5aa-e931-424a-b086-4a5f560efe83,39131799-9d54-4644-8077-ec715475d10c,4e414224-9269-42df-b33d-2feba304ad4e,4cbf911c-b176-4222-9802-7ba2334f570a,9d38c592-3987-4463-9fc5-b77d2f97c536,18988950-fe75-4b3c-a50d-8fff5a9e362e;STRAND=+- GT:DR:DV:PL:GQ 0/1:21:10:16,0,121:16 +chr1 1382232 cuteSV.DUP.4 G 0.1 q5 
IMPRECISE;SVTYPE=DUP;SVLEN=671;END=1382903;RE=6;STRAND=-+;RNAMES=9d38c592-3987-4463-9fc5-b77d2f97c536,4f00962c-12f2-4ff5-b18b-d08a4118c893,9d57d1ac-8ef1-42cb-9f8a-1952ebd70438,f0a7a3af-0ea1-47a8-a52d-90b077dd5311,a5a3a1b3-0284-49ed-a518-dc0ab1b32e10,0697a1f6-c749-4ec4-a0bb-1320fe1bc916 GT:DR:DV:PL:GQ 0/0:23:6:0,17,162:16 +chr1 1382606 cuteSV.INS.45 G GTAACAACTAGAGGCTCACCCTTCCCAACAATCCAGTAACAATCCAGAGGTCACCACCCTTCCCAACAATCCAGTAACAATCTAGAGGTCACCACCCTTCCCAACAATCCAGTAACAACAATCCAGAGGCCACCACCCCTTCCCAACAATCCAGCTTCCCAACAACTAGTAACAATCCAGAGGTCACCACCCCTTCCCAACAATACAGTAACAATCCAGAGGCCACCACCCCTTCCCCAACAATCCACTAACAATCCAGAGGTCACCACCTCTTCCTAACAATACAGTAACAATCCAGAGGCCACCACCCCTTCCCAACAATCCAGTAACAATCCAGAGGTCACCACCCTTCCCAACAATCCAGTAACAATCCAGAGGTCACCACCCTTCCCAACAATCCGCAACAATCCAGAGGTCACCACCCCCTTCCCCCAACAATACAGTAACAATCCAGAGGCACCACCCCCTTCCCAACAATCCACTACCAATCCAGAGGTCACCACCCTTCCCAACAATACAGTAACAATCCAGAGGCCACCTCACCCCCTTCCCAACAATCCAGTAACAATCCAGAGGTCACCACCCTTTCCAACAACTTCACTACCAATCCAGAGGTCACCACCCTTTCCCAACAATCCACTAACAATCCAGAGGTCACCACCCCTTCCCCAACAATCCAGTAACAATCCAGAGGTACCACCCCTTCCCAATAATCCAGTAACAATCCAGAGGTCACCACCTTCCAACAATCCACTAACAATCCAGAGGTCACTCACCCCCTTCCCCAACAATCCACTAACAATCCAGGAGTCGCCACCCCTTCCCAACAATCCAGTAACAATCCAGAGGTACCACCCCCTTCCCAACAATCCAGTAACAATCCAGAGGTCACCAATTCCTTCCCAACAATCCAGTAACAATCCAGAGGTTACCACCCTTCCCAACAATCCACATCAATCCAGAGGCCACCACCCTTCCCAACAATCCGGCAAGGACCCAGAGGCCACCACCCCCTTCCCAACAATCCAGTAACAATCCAGAGGTCACCACCCTTCCCCAAAATCCAGTAACATCCAGAGGTCACCACCCCTTCCCCAACAATCCAGTAACAATCCAGAGGCCACCACCCCTTCCCCAACAATCCAGTAACAATCCAGAGGTACCACCCCTTCCCAACAATCCAGTAACAATCGAGGCCACCACCCCTTCCTTAACAATCCAGTAACAATCCAGAGGACACCACCCTTCCCAACAATCCACTAGCAATCCAGAGGCCACCACCCTTCCCAACAATCTGGCAACGACCCAGAGGCCACCACCCCTTCCCAACAAATCCAGTAACAATCCAGAGGTCACCACCCCCTTTCCCAACAATCCAGTAACAATCCAGAGGTCA 20.9 PASS 
PRECISE;SVTYPE=INS;SVLEN=1395;END=1382606;CIPOS=-41,41;CILEN=-5,5;RE=7;RNAMES=77b12407-8c9b-4d88-a023-aac52eb7d8a0,f0eab809-506a-46ee-b4e8-962ea3bda1ff,d936d231-dd4d-4254-ac7a-c92933cd3268,3a92fd10-7806-4547-8570-259f291e70c4,33c9bbab-aba2-437b-90a1-6f991ec4c38c,67559dcc-d09b-4387-a2ec-c9075b98d81d,df6fb7c2-1567-443f-af3b-29e57c98eb0d GT:DR:DV:PL:GQ 0/1:11:7:21,0,59:20 +chr1 1382641 cuteSV.INS.46 C CAGGAGTCACCACCTTCCCAACAATCCAAGTAACAATCCAGAGGTTACCACCTTCCCAACAATCCACTAACAGTCCAGAGCCACCACCCCTTCCCAACAATCTGACCAAGGACAGAGGCCACCACCCCTTTAACAATCCAGTAACAATCCCCAACAATCCAGTAACAATCCAGGGTACCACCCTTCCCAACAATCCAGTAACAATCAGGAGCCACCACCCCTTCCCAACAATCAGTAACAGTCAGAGGACACCACCCTTCCCAACAATCCACACTAGCAATCCAGAGGCCACCACCCTTCCCCAACAATCTGGCAACGACCCAGAGCCTACCCCTTCCCAACAATCCAGTAACAATCCAGAGGTCACACCCCTTCCCAACAATCCAGTAACAATCAAGGGGATCCCACCCCTTCCCAACAATCTGGTAACAATCCAGAGGTCACCACCCTTCCCAACAATCAAAGTA 134.9 PASS PRECISE;SVTYPE=INS;SVLEN=466;END=1382641;CIPOS=-39,39;CILEN=-132,132;RE=21;RNAMES=f0a7a3af-0ea1-47a8-a52d-90b077dd5311,468ee76e-3ed4-4648-bfe1-d98510c064cb,fafc5c55-1207-4ec9-ad82-c384b1ca16fa,0697a1f6-c749-4ec4-a0bb-1320fe1bc916,a9944842-b9cf-42b4-83d7-345c82404ac7,a5a3a1b3-0284-49ed-a518-dc0ab1b32e10,2f0183eb-547a-46f9-a17a-2d1cdd9e5757,9d57d1ac-8ef1-42cb-9f8a-1952ebd70438,9d38c592-3987-4463-9fc5-b77d2f97c536,922f26c5-3206-4434-b317-1afbdb2a1a7f,4f00962c-12f2-4ff5-b18b-d08a4118c893,358e9370-bc32-41ba-8770-34330ce6995e,271f004d-0105-4403-98f2-d1e3bcbfb1ed,3bce85ca-852f-4a5d-aafd-33a558214eb1,a6b702f4-af87-46b0-8258-dd0080dba5de,9fd33fe0-7f99-4554-b196-b8ec37fa5ebd,a36c1728-9d07-43a0-8e83-8e5bb1e27014,06ae3af0-e88c-4c02-92cc-d45624993d5f,36491f59-cbff-4115-b4f6-653d792fcc66,f123340d-ccd5-4643-9cda-aeea14c2f41e,c104db4f-c8b0-44f8-9748-972c777861bb GT:DR:DV:PL:GQ 1/1:7:21:135,6,1:5 +chr1 1427436 cuteSV.INS.47 C 
CTCCTCCATCATCCGCCCGCTCCCCTCTCACCTCCCCTTCCCCTCCATCCCACCCTGCCCAGCCCCCTCCCCTCCATCACCTGCCCTGCCCCTCCCTCCATCCATCCCGCCCCGCTCCCCTCTCCACCCCTCCCCCTTCCCCTGCATCACACCCTGCCCAGCCCCCACCCCTCCATCACCCTGCCCTGCCCCCCTCCCCTCATCACCCTGCCCTGGCCCCCTCCCCTCCATCACCCTGCCCAGCCCCCCTCCCCCTCCATCACCCTGCCCTGCCCCCTCCCCTCCATCACCCTGCCCAGCCCCTCCCTCCATCACCCTGCCCTGCCCCTTCACCTCCATCACCTGCCCTGCCCCTTCCCCTCCATCACCCTGCCCTGCCCCTCCCTCCATCACCCTGCCCTGCTCCCCTCTTCTCCCCCTTCCCTCCATCATCCCGCCAGCTCCCCTTTCCCACCCCTCCCCTCCCCTCCATCACCCTGCCCAGCCCCCTCCTCATCACCCTGCCCTGCCCCCCCTCCCCTCCATCACCCTGCCCTGCCCCCTCCCCTCCATCCATCCTTCCCGCTCCCTCTCCCACCCCTCCCCTTCCCCTCCATCACCCTGCCCTGCCCCCACCCCTCCATCACCCTACCCCTGCCCCCACCCCATCACCCTGCCCTGCCCCCTTCCCTCCATCATCCCGCCCGCTCCTCTCCCACCTCCACTTCCCTGCATCACCCTGCCCACACTGCCCCTTCCCCTCCATCACCCTGCCCTGCCCCCTCCCCTCCATCACCCTGCCCTGCCCCCTCCCTCCTCACCCTGCCCTGCCCCCACCCCTCCATCATCCCGCCCGCTCCTACTCACCTCCCCTTCCCCTGCATCACCCTGCCCTGCCCCTTCCCTCCATCCCCCTGCCCTGCCCCCTCCCCTCCATCACCCTGCCTGCCTCCCTCCATCACCCTGCCCTGCCCCCCCCTCCATTGTCCCGCCCGCTCCCACTCTCCACCCCTCCCCTTCCCCTGCATCACCCTGCCCTGCCCCTTCCCCTCCATCACCTGCCCGACCCCCTCCCCCTCCATCACCCTGCCCT 28.4 PASS PRECISE;SVTYPE=INS;SVLEN=1071;END=1427436;CIPOS=-23,23;CILEN=-16,16;RE=11;RNAMES=9029b9c6-09de-484d-83ff-7a20e6b313c8,2934cba4-0b4b-4aee-b14b-0d0e3546b541,54e98c4b-5a5f-4113-9304-7a5c0aea7c63,d2031098-3567-4d27-be20-c8e182f7c4b4,68845987-0249-4222-b2eb-04cb83059a27,733b07d2-662e-4b95-b959-2b5d0a115483,12932aa3-c4fb-4966-905c-488f025cb743,fde476d8-2aac-42ed-9c46-752e5d14207f,3c70ee06-5212-40a3-8f97-e23cf4bf063d,74e3c21f-dad1-4e75-9848-a6a85d837b12,f99f1f38-6f26-485c-8aad-b3bad4a2596f GT:DR:DV:PL:GQ 0/1:19:11:28,0,105:28 +chr1 1427458 cuteSV.INS.48 T 
TCCCTTCCCTCCATCACCCTGCCCAGCCCCCTCCCCTCCATCACCCTGCCCTGCCCCCTCCCCTCCATCATCCCGCCCGCTCCCTCTCCACCCCTCCTTCCCCTGCATCACCCTGCCCAGCCCCCACCCCTCCATCTTCCCTGCCCTGCCCCTCCCCTCCATCCTGCCCTGCCCCCTCCCCTCCATCACCCTGCCCAGCCCCTCCTCCATCACCCTGCCCTGCCCCCTCCCCTCCATCACCCTGCCCTGCCCCCACCCCATCACCCTGCCCTGCCCCCTTCCCTCCATCATCCCGCCCGCTCCCCTCTCCACCCCTCCCCTTCCCCTGCATCACCCTGCCCTGCCCCTTCCCCTCCATCACCCTGCCCTGCCCCCTCCCCTCCCTCCATCACCCTGCCCTGCCCCCTCCCCTCCATCACCCTGCCTGCCCCCACCTCCATCATCCCGCCCGCTCCCCTCTCCACCCCTCCCTTCCCTGCATCACCCTGCCTGCCCCTTCCCCTCCATCACCTGCCCTGCCCCCTCCCCTCCATCACCCTGCCCTGCCCCTCCCCTCCATCACCCTGCCCTGCCCCCACCCCTCCATCATCCCGCCGCTCCCCTCTCCACCCCTCCCTTCCCCTGCATCACCCTGCCCT 111.7 PASS PRECISE;SVTYPE=INS;SVLEN=637;END=1427458;CIPOS=-24,24;CILEN=-32,32;RE=20;RNAMES=f5204bf0-46df-4baf-964b-8c839f046200,2329bd50-c50b-40fb-8da8-8a3b33bd241e,a569e6c2-b324-4efd-9a08-13076cbbcbd5,c226539f-904d-474c-a18e-49b18f7cb378,ff84c73b-00ed-4477-949c-4902317bb4f0,102cfb6a-f6b2-4542-b618-563ef2e3ed6d,05911855-6bbc-4bb3-979c-479c8273eeaf,ed59a62c-2964-4d7a-ad2f-4e7b142d8463,b472de9d-b5fe-4b3d-8923-13fe8969ad76,d8e1b071-4b75-4442-9426-407a431e0b92,68a7b331-b649-402a-ac0e-b65b85655d04,77268fa6-8721-44f0-b8e3-ecf9fb06b924,623f61f6-f014-46d0-b4de-08151f38ff0d,640d7d27-bb3a-494b-838d-44ee7c85f12c,69c90976-a291-4c14-b803-a85f27f6fb6a,cf41e38b-2b70-40b9-9323-03a1f73be7c6,cedb80be-37e8-47fe-ad60-645973a12955,ea545945-4684-404b-aee5-f96da073730d,59ec017f-83ff-4284-a55f-d4fdec1caedd,31ccfe43-1c0d-4756-a1bb-acc9e62b69f6 GT:DR:DV:PL:GQ 0/1:11:20:112,0,26:25 +chr1 1428035 cuteSV.INS.49 G GGAGAGGGGAGGAGGGAAGAGGGAGGGGAGGGGGTAGGGAGGGGAGAGAGGGGAGGGAGGGGGAGAGAGGGAGGGAGGGGAGAGAGGTGGGAGGAGGAGAGAGGGGGAGTGGGAGGAGGAAGAGGGAGGGGGGAGGGAAGAGAGGGGACGGGAGGGGAAGAGAGGGAGGGGAGGGGAGGGGGGAAGAGAGGGCAGGGGAGGGTGGGAGAGGAGGGCATGGAGGGGTGGGAGAGAGGGGAGGGAGGGAGAGGAGAGGGGAGGGGGAGAGAGGGGAGGGAGGGGGAGAGAGGGAGGGAGGGAGGGGAAGAGGGGGAGGGGAGGGGAAGAGGAGGAGGGTGGAGAGAGGCAGGAGAGAGGGCAGTGGGGAGGG 324.4 PASS 
PRECISE;SVTYPE=INS;SVLEN=369;END=1428035;CIPOS=-17,17;CILEN=-22,22;RE=34;RNAMES=733b07d2-662e-4b95-b959-2b5d0a115483,f5204bf0-46df-4baf-964b-8c839f046200,d8e1b071-4b75-4442-9426-407a431e0b92,54e98c4b-5a5f-4113-9304-7a5c0aea7c63,2329bd50-c50b-40fb-8da8-8a3b33bd241e,68a7b331-b649-402a-ac0e-b65b85655d04,12932aa3-c4fb-4966-905c-488f025cb743,9029b9c6-09de-484d-83ff-7a20e6b313c8,d2031098-3567-4d27-be20-c8e182f7c4b4,f99f1f38-6f26-485c-8aad-b3bad4a2596f,c226539f-904d-474c-a18e-49b18f7cb378,623f61f6-f014-46d0-b4de-08151f38ff0d,a569e6c2-b324-4efd-9a08-13076cbbcbd5,05911855-6bbc-4bb3-979c-479c8273eeaf,77268fa6-8721-44f0-b8e3-ecf9fb06b924,69c90976-a291-4c14-b803-a85f27f6fb6a,59716984-39a5-40aa-af8c-7859e47e8dd6,cf41e38b-2b70-40b9-9323-03a1f73be7c6,2934cba4-0b4b-4aee-b14b-0d0e3546b541,fde476d8-2aac-42ed-9c46-752e5d14207f,b472de9d-b5fe-4b3d-8923-13fe8969ad76,3a22e315-09d8-4146-8de1-6a077f83e794,68845987-0249-4222-b2eb-04cb83059a27,74e3c21f-dad1-4e75-9848-a6a85d837b12,3c70ee06-5212-40a3-8f97-e23cf4bf063d,ff84c73b-00ed-4477-949c-4902317bb4f0,ea545945-4684-404b-aee5-f96da073730d,cedb80be-37e8-47fe-ad60-645973a12955,ed59a62c-2964-4d7a-ad2f-4e7b142d8463,59ec017f-83ff-4284-a55f-d4fdec1caedd,640d7d27-bb3a-494b-838d-44ee7c85f12c,91dab2f4-f212-4619-be08-03d8bc49e2d9,31ccfe43-1c0d-4756-a1bb-acc9e62b69f6,8e2d8882-2af9-4c06-b864-5cb41183c2e5 GT:DR:DV:PL:GQ 1/1:0:34:324,87,0:86 +chr1 1442880 cuteSV.INS.50 G GTTTCAGCAGATTTTGGGCCAAACCAAGTCAAT 190.9 PASS 
PRECISE;SVTYPE=INS;SVLEN=32;END=1442880;CIPOS=-11,11;CILEN=0,0;RE=27;RNAMES=9131a259-bac1-4478-91c8-74315ef48c01,a1ecbbe0-44ce-4b79-b89d-d41f41d6e980,a79e0f63-7054-4a36-8a7f-910b6bd38fc6,cedb80be-37e8-47fe-ad60-645973a12955,a569e6c2-b324-4efd-9a08-13076cbbcbd5,0451a570-a9c6-46b0-b66c-f07a921085d2,b472de9d-b5fe-4b3d-8923-13fe8969ad76,8037c6cd-9a78-4bb3-bc0f-876835c66397,9e1af1c2-ae7e-4af6-80c8-f04db6b8da09,56cf6573-38f4-40a4-975a-9eafb54e4772,3644ce7c-04b1-4adc-8da0-9bcfc52fea97,81c77fd1-7f7a-45b5-bbe9-d4ca5259b06b,12932aa3-c4fb-4966-905c-488f025cb743,640d7d27-bb3a-494b-838d-44ee7c85f12c,ea545945-4684-404b-aee5-f96da073730d,5c443079-8bd5-490c-999e-a89b4a7c2e4a,4d0d4c92-bd33-4cbf-b887-24500de6aa7c,6df546c9-ca86-42a4-b5d4-76f5ecb1a301,59ec017f-83ff-4284-a55f-d4fdec1caedd,88227abe-84b7-4e0c-9df5-b1f9c19a7db7,98c04ccb-fed1-4d09-b2ab-2237950eade2,07f7818c-3008-485b-a72a-bf23f8a9bd85,b75c54f4-1363-419e-aaec-ccd650302bab,299d3562-7f7c-453f-89f6-92cc818e2539,76fd3da7-0cfb-4d60-972d-4902f7c7579b,3114c99a-222d-4864-a130-dad4fa8ee72a,59716984-39a5-40aa-af8c-7859e47e8dd6 GT:DR:DV:PL:GQ 1/1:7:27:191,20,0:20 +chr1 1443673 cuteSV.DEL.35 CCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT C 132 PASS 
PRECISE;SVTYPE=DEL;SVLEN=-33;END=1443706;CIPOS=0,0;CILEN=-1,1;RE=24;RNAMES=59716984-39a5-40aa-af8c-7859e47e8dd6,76fd3da7-0cfb-4d60-972d-4902f7c7579b,74e3c21f-dad1-4e75-9848-a6a85d837b12,a79e0f63-7054-4a36-8a7f-910b6bd38fc6,1f5d20d9-283f-4121-b344-08e677666733,c8f9bea9-6094-4db4-a62e-87eb8ca3b85a,ea545945-4684-404b-aee5-f96da073730d,81c77fd1-7f7a-45b5-bbe9-d4ca5259b06b,5adb0f11-712e-43de-8296-47af85507bfc,12932aa3-c4fb-4966-905c-488f025cb743,4d0d4c92-bd33-4cbf-b887-24500de6aa7c,59ec017f-83ff-4284-a55f-d4fdec1caedd,5a0fc793-fc3b-4878-bb46-e770c6bf95ae,b472de9d-b5fe-4b3d-8923-13fe8969ad76,a188eb2e-3338-4a2a-a859-c22662ded131,a569e6c2-b324-4efd-9a08-13076cbbcbd5,07f7818c-3008-485b-a72a-bf23f8a9bd85,3644ce7c-04b1-4adc-8da0-9bcfc52fea97,a0d28591-5dfb-4680-9140-21843e82d50b,cedb80be-37e8-47fe-ad60-645973a12955,299d3562-7f7c-453f-89f6-92cc818e2539,9e1af1c2-ae7e-4af6-80c8-f04db6b8da09,c3895732-a3e2-4839-a5d3-409942045275,b75c54f4-1363-419e-aaec-ccd650302bab;STRAND=+- GT:DR:DV:PL:GQ 0/1:14:24:132,0,37:36 +chr1 1469102 cuteSV.DEL.36 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA A 0.7 q5 IMPRECISE;SVTYPE=DEL;SVLEN=-34;END=1469136;CIPOS=-6,6;CILEN=-1,1;RE=8;RNAMES=92dc4377-d417-453f-9af9-12426f178cf8,574e03e0-4c27-464c-95e2-7e359ad18011,a9c7f5b9-d9b4-416c-8249-e95da7419fcc,7c25be75-6172-4f7f-9c0b-b80389b1f8d9,857e3690-3e8e-4943-bbb1-28cd73e53bf8,d9b425de-6457-4cd7-9a17-8a9690806f29,05606178-17ba-4773-8989-dc90172265ae,bbd9dd0a-252d-4d40-b27a-5bd9eec4f1b3;STRAND=+- GT:DR:DV:PL:GQ 0/0:25:8:1,9,163:8 +chr1 1477901 cuteSV.INS.51 C CGCCACCACGCCTGGCTAATGTTGTATTTGTAGAGACGGGGTTTCTCCATGTTGGTCGGGCTGGTCTCGAACTCCCGACCTCAGGTGATCCACCGCCTGTCTCTCAAATCAGTGCTGGGATTACAGGCATGT 305.4 PASS 
PRECISE;SVTYPE=INS;SVLEN=131;END=1477901;CIPOS=-15,15;CILEN=-1,1;RE=32;RNAMES=92dc4377-d417-453f-9af9-12426f178cf8,6d230858-7403-4a6e-8fe6-561a53d8bb08,9304e5e6-4053-44dc-bd61-14dbbbacfe80,55db679c-a93c-4d4b-b081-6be6ddf8d32d,0c4d0a70-ff5e-4891-99b9-05ed59a23eca,4d0846d2-c77a-4f04-ab9d-68eb548cc199,7724e954-c223-4614-8ee1-452e727afe21,7c25be75-6172-4f7f-9c0b-b80389b1f8d9,b73d7779-9769-42f0-b383-896a2cf597d8,b996975f-7ea0-452e-9a01-15d57bacd531,6e95792e-cae3-4246-9227-5e4c3c8c457e,93c9a19c-23df-4132-8455-678153d0eecd,6a20ecdf-0172-4385-bf2c-7fc05a077871,21e92a97-e96a-45b9-ab20-149fd0adae33,f2d3f807-f651-402a-a686-ad37157d5956,6bf99bdb-e2f4-4d9e-a74d-277aa8418d5f,a370a155-fb2e-479c-a49b-0efa0e7a96b7,c50b11cd-e5f8-4448-bd61-e85b50c30308,18de7f98-d2e4-4e3c-b9a5-4100435aed4a,65b0e4a5-f9ec-4ae5-bc34-c79385df8859,e06bbc72-2c56-480f-b3ba-289a320fa5ac,a9c7f5b9-d9b4-416c-8249-e95da7419fcc,576d01a5-9b52-4982-9e22-4953786abad1,55845941-6f72-4a68-a4e5-b51f39132aed,aa6d1629-603c-4ff4-8b60-6e5e3310bbb1,a37daeca-ebbe-49df-b89d-f662704b024c,d638acf6-2965-4be2-818a-f4b169c90606,80fc4697-1048-4150-a38a-6032b39aca24,96e41d89-8690-4e83-888a-3d096b4cf675,d9b1ec8b-d331-4b36-8367-c14aee545173,301af76c-5c8c-4f8f-a7de-60de6dce6843,37af4c00-18d0-4eee-911f-aa036824902a GT:DR:DV:PL:GQ 1/1:0:32:305,82,0:81 +chr1 1497112 cuteSV.INS.52 C CGCCTGGGCGCAGCGGTCCCATGGGAGAGCAGACCCTCG 78.6 PASS 
PRECISE;SVTYPE=INS;SVLEN=38;END=1497112;CIPOS=-8,8;CILEN=-1,1;RE=16;RNAMES=c920d579-a175-4d4e-98b8-5483456734fc,0d2b73fa-6365-4899-8a27-0b4f8e133d21,a370a155-fb2e-479c-a49b-0efa0e7a96b7,757d110c-2087-4f31-99f9-984675f8afc2,f2da6824-3384-45bb-a431-9637afa6833a,de71b88d-4247-40b5-8f20-18ad4ba91948,727733a3-f662-4da2-b1b6-6bb591a498e1,e002b396-4646-469c-b2f5-31418b4a55c5,7994bd08-7dbf-4d35-9313-ad00aede507d,6cdcaa04-0dad-4d56-a1c1-16fca9d411fb,5c5434d0-7e94-4961-86bf-5ed0be05a901,b20f81d6-0e81-482a-b814-9f2895aad339,96e41d89-8690-4e83-888a-3d096b4cf675,729d0ffb-0e5a-444a-9e5c-25b39d7500b1,ae45098f-0cbd-4ff4-8622-d1ad73658aa0,df37dd1c-d52d-40b9-b817-b885543054d4 GT:DR:DV:PL:GQ 0/1:13:16:79,0,50:50 +chr1 1540143 cuteSV.DEL.37 CGGGGAGGGGAGCGCAGGCCGGGGAGGGGAGCGCAGGCC C 171.8 PASS PRECISE;SVTYPE=DEL;SVLEN=-38;END=1540181;CIPOS=-6,6;CILEN=0,0;RE=24;RNAMES=d211c770-b648-412e-826b-1c8f6b8c7d33,5fe8d78f-4365-4398-a6fd-c2f2375b038c,1fcf68f1-2ebd-432b-aad6-0baaa1e1fc25,2d213b39-1d81-4c16-b70a-600d7f0e1329,3e6d8ba0-6225-42ff-8c4f-b71ca8b33fcf,bf39c1d7-160d-462f-95d2-c8eb809c430b,c940e79b-9607-442d-aaa3-efdaa61625b8,056a7033-c4e8-4b0a-a9e1-63acfa17c278,ee817bac-930e-4d3c-a14b-b1b2646e6bf5,edc22f9f-3ca3-42a8-9fb8-607cac4d842b,222aed17-ae32-4e79-8f2e-6a047c6f0175,a38b178e-45de-49a5-894a-bcd312ffd3e0,e06d8d91-cb08-43d9-8502-0beb8815aff6,e55fb060-f89b-40c2-9ad3-237a85b372b1,30c45a20-8403-4f8e-9b9c-71840c473bd9,6e9c3ff9-386f-44dc-b9d3-5df3baf75fc7,69af0fc2-5e0e-4ebf-b788-3843b85e1017,c2b3bda1-e3f7-40ff-b1e9-22da462fff0c,3a7272a0-2ebd-48ec-81c7-449f116f9638,e7d62aaf-f433-4d15-a8b5-f99f4aa67a17,133c6253-34ea-4e09-af98-2727ae8c2a7b,73b74963-da1a-4249-aeba-9c309d098918,98d935e6-1078-4aa3-bdb0-b2c6b9bda83d,5e8f62d8-cace-44cf-a70e-de5ddbdddcfc;STRAND=+- GT:DR:DV:PL:GQ 1/1:6:24:172,19,0:19 +chr1 1554172 cuteSV.DEL.38 CTAAGGGGTCCCCACGAAGCTGAGCACGAGGCGGATCCGGACCA C 57 PASS 
PRECISE;SVTYPE=DEL;SVLEN=-43;END=1554215;CIPOS=-12,12;CILEN=-1,1;RE=14;RNAMES=222aed17-ae32-4e79-8f2e-6a047c6f0175,6e9c3ff9-386f-44dc-b9d3-5df3baf75fc7,c2b3bda1-e3f7-40ff-b1e9-22da462fff0c,69af0fc2-5e0e-4ebf-b788-3843b85e1017,41e3acae-1239-4690-aaa5-104c76c52dbb,df9ed2d4-72ac-4afe-bcba-c3391664b31a,c5abd0d0-0f77-4306-b8e2-693cb97de6ca,81f0b908-59ab-41a6-a075-a4bd92c97b16,c755a07f-1e64-43bb-9430-dba434f89b26,d211c770-b648-412e-826b-1c8f6b8c7d33,8b4ebcb8-8c9e-4280-9157-ffb0e9018584,f7ebc27f-ae67-453c-b618-2cfa8fa6f3f6,133c6253-34ea-4e09-af98-2727ae8c2a7b,f493efa3-011f-47ea-a3ae-a13a3ec516e9;STRAND=+- GT:DR:DV:PL:GQ 0/1:16:14:57,0,76:56 +chr1 1594855 cuteSV.INS.53 A AAGCAGGGTGGGGAGAGACAGACACAGAGAGAGAGCAGAACGGGAAGGAAGAGACAGAGAGAGGCAGACAGAGAGAGAGAGAGAGACAGACAGACAGACACAGAAGAGCAGAACAGGGAGAGACAGAGAGAGTGAGACAGAACCCGGAGACAGAGGAGGCAGACACACAGAGAGAGAGAGAGAGAGAGACAGACAGACACAGAGGCAGAACAGGGAGAGACAGAGAGACAGAGAGAGAGAGAGTGAGACAGAGACAGGGAATTGAGAGGCAGACAGAGAGAGACAGACAGACAGACAGACACAGAGAGAACCAGGGCAAGGGACAGACAGAGAGAGAGAGACAGACAGAGAGCAGAACAGGGAGAGACAAAGAGACAGAGAGAGAAGACACAGAGAGAGAGAGACAGAGAGAGGCAGACAGAGACAAGAGAGACAGACAGACACAGAGCAGAACAGGGAAGACAGAGAGAGAGAGAGACAGAGAGAGGCAAGACAGAGAGAGAGAGAGACAGACACAGAGAGAGCAGAACAGGGAGAAACAGAGACAGAGAGCGAGAGAGACAGAGACAGAGAGAGAGGCAGACAGAGACAGAGAGAGAGAGAGACAGACAGACACAGAGAGAGCAGAGCAGGGAGGAGAGACAGAGAGAGAGAAAGAGACAAAGACAGAGACAGAGACAGAGAGGCAGAGACAGAGACACAGAGAGAGCAGAACAGGGAGAGACAGCAGACAGAGAGAGAGAGAGAGACAGAGAGAGGCAGACAGAGACAGAGAGAGAGACAGACAGACACAGAGAGAGCAGAACAGGGAGAGACAGAGACAGAGAGAAACAGAGACAGAGACAGAGACAGAGAGGCAGACAGAGAGAGAGACAGACACAGAGAGAGCAGAACAGGGAGAGACAGAGAGACAGAGAAGGTAGAGACAGAGACAGAGACAGAGGCAGACAGAGAGAGACAGACAGACAGACAGACAGACACAGAAGAGAGCAGAACAGGGACAGACAGAGAGAGAGAGAGAGAGACAGAGACAGAGACAGAAGGCAGAGAGAGACACAGAGAGAGCAGAACAGGGAGAGACAGAGAGACAGAGAGAGAGAGAGACAGAGAGGAGCCAGACAGAGACAGAGACAGACAGACAGACACAGAGAGAGCAGAACGGGGAGAGACAGAGAGAGAGAGAGAGAGAGAGACAGAGAGAGAGAGCAGAACAGGGAGAAACAGAGAGACAGAGAGCGAG 1 q5 
IMPRECISE;SVTYPE=INS;SVLEN=1239;END=1594855;CIPOS=-13,13;CILEN=-45,45;RE=5;RNAMES=592727b7-8b4e-41a3-835b-3e0ff258c59c,bb65e009-c11a-467f-8d23-d620c0605d7b,2ef6b649-4918-488d-9a44-500e03fbebaf,57e2bd9a-3426-42c1-ad31-09f88e493158,64d8dedb-1adb-469e-9cee-aba7b1e24411 GT:DR:DV:PL:GQ 0/0:16:5:1,7,106:6 +chr1 1594953 cuteSV.DUP.5 A 0.4 q5 IMPRECISE;SVTYPE=DUP;SVLEN=45;END=1594998;RE=8;STRAND=-+;RNAMES=feb42a4c-bff2-47e6-bc2b-32908590ae4a,e0aa3efa-33ee-4d1e-ab4c-e75b4cc6c632,8c7063be-6b73-4d7b-8f08-3e861f2ca50d,46de2f85-a73a-4539-8cc8-08ca1c95653b,3b0211e3-7775-44ad-b4a3-e42a1b9745c8,bba10337-5aad-46a2-883d-8f081186d727,0124b41f-8dd8-426e-94c4-efca467e11eb,68023d5d-2b80-4da6-8dea-6c6e3fb0f7e5 GT:DR:DV:PL:GQ 0/0:26:8:0,11,172:10 diff --git a/tests/data/sniffles.vcf b/tests/data/sniffles.vcf new file mode 100644 index 00000000..700df87a --- /dev/null +++ b/tests/data/sniffles.vcf @@ -0,0 +1,355 @@ +##fileformat=VCFv4.1 +##FILTER= +##source=Sniffles +##fileDate=20210607:16:05 PMef_minus +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##bcftools_viewVersion=1.11+htslib-1.11 +##bcftools_viewCommand=view --regions chr1 F24721_merged_sorted.bam_5_read_sorted.vcf.gz; Date=Tue Jan 4 22:45:21 2022 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT /projects/jfan_prj/jfan_prj/Nanopore_Testing/2021_nanopore_sv_testing/scratch/depth_testing/POG/COLO829/minimap2_bam/F24721_merged_sorted.bam +chr1 10006 35777 N ]chr3:198172735]N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr3;END=198172735;STD_quant_start=32.4628;STD_quant_stop=44.8237;Kurtosis_quant_start=2.29519;Kurtosis_quant_stop=-0.995353;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=--;STRANDS2=0,6,6,0;RE=6;REF_strand=72,102;Strandbias_pval=0.0824618;AF=0.0344828 GT:DR:DV 0/0:168:6 +chr1 10030 36832 N ]chr17:41490827]N . STRANDBIAS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr17;END=41490827;STD_quant_start=48.6107;STD_quant_stop=4.67516;Kurtosis_quant_start=0.545103;Kurtosis_quant_stop=1.53121;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=--;STRANDS2=0,7,7,0;RE=7;REF_strand=24,9;Strandbias_pval=0.000613617;AF=0.212121 GT:DR:DV 0/0:26:7 +chr1 10312 35780 N ]chrX:449436]N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chrX;END=449436;STD_quant_start=117.156;STD_quant_stop=68.302;Kurtosis_quant_start=-1.29786;Kurtosis_quant_stop=-0.029231;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=-+;STRANDS2=0,6,0,6;RE=6;REF_strand=45,128;Strandbias_pval=0.33926;AF=0.0346821 GT:DR:DV 0/0:167:6 +chr1 10466 35781 N N[chrX:156030800[ . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chrX;END=156030800;STD_quant_start=81.1924;STD_quant_stop=134.17;Kurtosis_quant_start=1.34083;Kurtosis_quant_stop=1.99911;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=++;STRANDS2=5,0,0,5;RE=5;REF_strand=7,8;Strandbias_pval=0.0546956;AF=0.333333 GT:DR:DV 0/1:10:5 +chr1 10467 35779 N N[chr3:10002[ . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr3;END=10002;STD_quant_start=106.244;STD_quant_stop=161.729;Kurtosis_quant_start=0.552508;Kurtosis_quant_stop=2.99076;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=+-;STRANDS2=6,0,6,0;RE=6;REF_strand=4,5;Strandbias_pval=0.043956;AF=0.666667 GT:DR:DV 0/1:3:6 +chr1 10467 35782 N N[chr3:198174376[ . 
STRANDBIAS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr3;END=198174376;STD_quant_start=17.5865;STD_quant_stop=297.518;Kurtosis_quant_start=0.324147;Kurtosis_quant_stop=0.886959;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=++;STRANDS2=7,0,0,7;RE=7;REF_strand=57,112;Strandbias_pval=0.000675389;AF=0.0414201 GT:DR:DV 0/0:162:7 +chr1 10468 35778 N N[chr17:41490879[ . STRANDBIAS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr17;END=41490879;STD_quant_start=41.208;STD_quant_stop=1.92354;Kurtosis_quant_start=3.02235;Kurtosis_quant_stop=0.961601;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=++;STRANDS2=11,0,0,11;RE=11;REF_strand=4,6;Strandbias_pval=0.00386997;AF=1 GT:DR:DV 1/1:0:11 +chr1 35143 35783 N N[chr20:60001[ . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr20;END=60001;STD_quant_start=0;STD_quant_stop=0.632456;Kurtosis_quant_start=nan;Kurtosis_quant_stop=-0.5;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=+-;STRANDS2=3,2,3,2;RE=5;REF_strand=0,3;Strandbias_pval=0.196429;AF=1 GT:DR:DV 1/1:0:5 +chr1 136637 0 N GTGTCGGCTGACCCTCTGTCCGCGTGGAGGCCGGTGGGGTGTGGAGGC . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=136638;STD_quant_start=20.4524;STD_quant_stop=21.8541;Kurtosis_quant_start=-1.28738;Kurtosis_quant_stop=-1.27278;SVTYPE=INS;SUPTYPE=AL;SVLEN=46;STRANDS=+-;STRANDS2=10,8,10,8;RE=18;REF_strand=20,20;Strandbias_pval=0.780391;AF=0.45 GT:DR:DV 0/1:22:18 +chr1 136956 1 N TGACCTCTCTCAGTGTGGGAGGGGGCCGGTGTGAGGCAAGGGGCTCACGCGCGGCCTCTGTCCGCGTGGGAGGGGCCGGTGTGAGACAAGGGGCTCAGGCTGACCTCTCAGCGTGGGAGGGGCCGGTGTGAGGCAAAGGGCTCGGGCTGACCTCTCTCAGCGTGGGAGGGCCAGTGTGAGGCAGGGCTCACATGACCTCTCAGCATGGGAGGGGCCGGTGTGAGACAAGGGCTCGGG . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=136989;STD_quant_start=36.1248;STD_quant_stop=37.4166;Kurtosis_quant_start=0.21249;Kurtosis_quant_stop=1.96205;SVTYPE=INS;SUPTYPE=AL;SVLEN=195;STRANDS=+-;STRANDS2=6,8,6,8;RE=14;REF_strand=18,20;Strandbias_pval=1;AF=0.368421 GT:DR:DV 0/1:24:14 +chr1 180694 35784 N ]chrX:449444]N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chrX;END=449444;STD_quant_start=296.777;STD_quant_stop=10.3923;Kurtosis_quant_start=0.018679;Kurtosis_quant_stop=0;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=-+;STRANDS2=0,6,0,6;RE=6;REF_strand=26,30;Strandbias_pval=0.0354297;AF=0.107143 GT:DR:DV 0/0:50:6 +chr1 181262 2 N CCGGCAGGCGCAGAGAGGCGCGGGCCGGGGTCGGGCGCAGGCGCAGAGAGCGCGGCCGGCGCAGAGGCGCAGAGAGGGCGCAGCA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=181262;STD_quant_start=28.5441;STD_quant_stop=24.1125;Kurtosis_quant_start=2.54835;Kurtosis_quant_stop=1.686;SVTYPE=INS;SUPTYPE=AL;SVLEN=93;STRANDS=+-;STRANDS2=19,15,19,15;RE=34;REF_strand=56,46;Strandbias_pval=1;AF=0.333333 GT:DR:DV 0/1:68:34 +chr1 257667 35785 N ]chr5:181462060]N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr5;END=181462060;STD_quant_start=0.894427;STD_quant_stop=0.447214;Kurtosis_quant_start=2;Kurtosis_quant_stop=2;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=--;STRANDS2=0,5,5,0;RE=5;REF_strand=19,18;Strandbias_pval=0.0532252;AF=0.135135 GT:DR:DV 0/0:32:5 +chr1 350806 3 N ACTCACTGAAGGTGGAGGGAAAATGGTGACCTAAGTC . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=350807;STD_quant_start=1.22475;STD_quant_stop=2.82843;Kurtosis_quant_start=3;Kurtosis_quant_stop=-0.65625;SVTYPE=INS;SUPTYPE=AL;SVLEN=37;STRANDS=+-;STRANDS2=4,2,4,2;RE=6;REF_strand=10,4;Strandbias_pval=1;AF=0.428571 GT:DR:DV 0/1:8:6 +chr1 368936 4 CCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACACGGGTGCCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACGTGGGTGCCATCTCAGCAGCTCACGGTGTAGAAACTGCGACACTCCCATGGGTGCCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACACGGGTGCCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACACGGGTGCCATCTCAGCAGCTCACGGTGTAGAAACTGCGACACTCCCATGGGTGCCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACACGGGTGCCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACACGGGT N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=369307;STD_quant_start=139.183;STD_quant_stop=119.892;Kurtosis_quant_start=-0.086052;Kurtosis_quant_stop=-0.157727;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-371;STRANDS=+-;STRANDS2=7,5,7,5;RE=12;REF_strand=16,20;Strandbias_pval=0.511217;AF=0.333333 GT:DR:DV 0/1:24:12 +chr1 372679 5 CTTAGGGTCCATTCTGATCTGTATATATGTATAATATATATTATATATGGACCTCAGGGTCCATTCTGATCTGCATATATGTATAATATATATTATATATGGTCCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACATGAGTTCCTGCTGGCATATCTGTCTATAACCGACCA N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=372858;STD_quant_start=74.4439;STD_quant_stop=102.876;Kurtosis_quant_start=-0.974097;Kurtosis_quant_stop=1.36116;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-179;STRANDS=+-;STRANDS2=6,4,6,4;RE=10;REF_strand=14,12;Strandbias_pval=1;AF=0.384615 GT:DR:DV 0/1:16:10 +chr1 374100 6 N CCCCCTCTCCTTTCTCCTCTCCATCCCCCCTCTCCATCTCCTCTCCTTTCTCCTCTCTCGCCCCCTCTCCTTTCTCCCTCTCTATCCCCCTCTCCTTTCTCCCTCTCTCCCCCTCTCCTTTCTCCTCTCCATCCCCTCTCCATCCCCCTCTCCATCTCCTCTCCTTTCTCCTCTCTAGCCCCTCTCCTTTCTCTCTCCTCCCCCTCTCCTTTCTCCCTC . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=374100;STD_quant_start=57.1456;STD_quant_stop=79.9085;Kurtosis_quant_start=0.684282;Kurtosis_quant_stop=-0.413029;SVTYPE=INS;SUPTYPE=AL;SVLEN=227;STRANDS=+-;STRANDS2=4,4,4,4;RE=8;REF_strand=14,10;Strandbias_pval=0.703493;AF=0.333333 GT:DR:DV 0/1:16:8 +chr1 606600 7 GGTCAGAGCTGTCCTGGGTCAGAGCTGCCCAT N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=606632;STD_quant_start=2.98329;STD_quant_stop=2.54951;Kurtosis_quant_start=2.61341;Kurtosis_quant_stop=3.87685;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;STRANDS2=7,4,7,4;RE=11;REF_strand=28,28;Strandbias_pval=0.516721;AF=0.196429 GT:DR:DV 0/0:45:11 +chr1 609583 8 GTGGCCAGCAGGCGGCGCTGCAGGAGAGGAGATGCCCAGGCCTGGCGGCACACGCGGGTTC N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=609647;STD_quant_start=21.6956;STD_quant_stop=18.4174;Kurtosis_quant_start=-0.340189;Kurtosis_quant_stop=0.435423;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-64;STRANDS=+-;STRANDS2=9,4,9,4;RE=13;REF_strand=30,34;Strandbias_pval=0.223523;AF=0.203125 GT:DR:DV 0/0:51:13 +chr1 611309 9 TGTGGGTGTGACAGGGTGTGTTCTGTGTGAGAACATGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCTGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGATGGGGTGTGTGCTGTGTGAGAACGTGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCTGTGTGTGATGTTGTGTTCTTGGTGTGAGTTCATGGGTGTGACGGGGTGTGCTGTGTGAGAACGTGTGTGTAGTGTTCACATGTCCTCTGTGCGTGAGTCCCCGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGACGGGGTGTGTGCTGTGTGAGAACGTGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCCGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGACGGGGTGTGTGCTGTGTGAGAACGTGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCTGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGACGGGGTGTGTGCTGTGTGAGAACGTGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCCGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGACGGGGTGTGTGCTGTGTGAGAACGTGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCCGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCA N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=612033;STD_quant_start=78.5303;STD_quant_stop=59.8415;Kurtosis_quant_start=-0.363;Kurtosis_quant_stop=0.0992;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-724;STRANDS=+-;STRANDS2=7,6,7,6;RE=13;REF_strand=29,33;Strandbias_pval=0.763359;AF=0.209677 GT:DR:DV 0/0:49:13 +chr1 744867 10 N TATATATATATATATATATATATATATATATA . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=744867;STD_quant_start=1.34164;STD_quant_stop=4.07431;Kurtosis_quant_start=2;Kurtosis_quant_stop=-0.953985;SVTYPE=INS;SUPTYPE=AL;SVLEN=35;STRANDS=+-;STRANDS2=4,1,4,1;RE=5;REF_strand=24,16;Strandbias_pval=0.635332;AF=0.125 GT:DR:DV 0/0:35:5 +chr1 814584 11 N AAAAAAAGATGTGAAACCTATTTTCAGAATTAACATTTCCTTCCTAAATATCTAACACAACACTGAAGGAGAAAGTCCAGTCAATTTTATGTAGTT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=814585;STD_quant_start=17.8792;STD_quant_stop=15.8719;Kurtosis_quant_start=-1.85801;Kurtosis_quant_stop=-1.90018;SVTYPE=INS;SUPTYPE=AL;SVLEN=96;STRANDS=+-;STRANDS2=13,11,13,11;RE=24;REF_strand=40,44;Strandbias_pval=0.64659;AF=0.285714 GT:DR:DV 0/0:60:24 +chr1 820880 12 N TCTACACTACCTGCCTGGCCAGCAGATCCACCCTGTCTACACTACCTGCCTGGGCAGTAGTTCCACGCAATCTCCCTTACCTGCCTCTCCAGCAGACCCGCCCTATCTATACTACTTGCCTGTCCAGCAGATCCACTTCCCATTCACACGACCTGCCTGTCCAGCAGATCCACCCTGTCTACACTACCTTCCTGCTTGTCCAGCAGGTCCACCCTGTCTATACTACCTGCCTGGCCAGTAGATCCACACACTA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=820881;STD_quant_start=6.70075;STD_quant_stop=12.2963;Kurtosis_quant_start=5.29053;Kurtosis_quant_stop=5.17296;SVTYPE=INS;SUPTYPE=AL;SVLEN=245;STRANDS=+-;STRANDS2=11,9,11,9;RE=20;REF_strand=54,40;Strandbias_pval=1;AF=0.212766 GT:DR:DV 0/0:74:20 +chr1 820906 13 TCCACCCTGTCTACACTACCTGCTTGTCCAGCAGG N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=820941;STD_quant_start=2.56905;STD_quant_stop=2.70185;Kurtosis_quant_start=-1.38237;Kurtosis_quant_stop=-1.08812;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-35;STRANDS=+-;STRANDS2=11,9,11,9;RE=20;REF_strand=54,40;Strandbias_pval=1;AF=0.212766 GT:DR:DV 0/0:74:20 +chr1 822428 14 CCTGGCCAGCAGATCCACCCTGTCTATACTACCTG N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=822463;STD_quant_start=2.98329;STD_quant_stop=2.91548;Kurtosis_quant_start=-1.2983;Kurtosis_quant_stop=-1.31531;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-35;STRANDS=+-;STRANDS2=11,9,11,9;RE=20;REF_strand=54,42;Strandbias_pval=1;AF=0.208333 GT:DR:DV 0/0:76:20 +chr1 839479 15 ACACACACCTGGACAAACACACCTGGACACACACACCTAG N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=839519;STD_quant_start=11.7558;STD_quant_stop=10.8904;Kurtosis_quant_start=-1.43253;Kurtosis_quant_stop=-1.20587;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-40;STRANDS=+-;STRANDS2=8,7,8,7;RE=15;REF_strand=42,42;Strandbias_pval=1;AF=0.178571 GT:DR:DV 0/0:69:15 +chr1 853534 16 GCCGTGTGGTAAACTGATGAACCCCGACCCTGATGAACGTGAGATG N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=853581;STD_quant_start=21.0143;STD_quant_stop=21.4033;Kurtosis_quant_start=-1.50523;Kurtosis_quant_stop=-1.48919;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-47;STRANDS=+-;STRANDS2=7,8,7,8;RE=15;REF_strand=30,36;Strandbias_pval=1;AF=0.227273 GT:DR:DV 0/0:51:15 +chr1 866801 17 N CGCTCCTGGCCGTCTCCGAGCCCTCCACATGTCTCCTGCCTCATCCCTGACGTCCTCCCAGGCCCTCGTGGTCACTCCCCCTGCACTC . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=866839;STD_quant_start=45.3707;STD_quant_stop=37.0162;Kurtosis_quant_start=-2.21189;Kurtosis_quant_stop=-1.85872;SVTYPE=INS;SUPTYPE=AL;SVLEN=49;STRANDS=+-;STRANDS2=6,12,6,12;RE=18;REF_strand=32,44;Strandbias_pval=0.598063;AF=0.236842 GT:DR:DV 0/0:58:18 +chr1 872837 18 N GGGGAGGTTTCATTTGCTCCACCTGCAGCGAGTAAGTAGCCCATCTCAGGTTTGACTCCTGACTTAATTCCTAACAGGGGAAGCCAAGGTCCTGTGACCCTCCCGGGGGAGGGGTTTCATTTGTTCTACCTGCAGTGAGGTCTGTTAGCCCATCTCAGGTTTGACTCCTGACTCTAATTCTAACAGGAAGCTGTCCTGTAACTCTGGGGAGGGGGGGGTTTCATTTGCTCCACCTGCAGCGAGGTTAGCCCTCCATCTCAGGTTTGACTCCTGACTTAATTCCTAACAGGGGAAGCTGCTGTCCTGTGACTCTGGGAGAAGGGGTTTCATTTGCTCCACCTGCAGTGAGGTCTGCTAGCCCATCTCAGGTTTGACTCTGACTTAATTCCTAAACAGGGGAAGCTGCTGTCCTGTAACTT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=872837;STD_quant_start=87.716;STD_quant_stop=108.899;Kurtosis_quant_start=-1.21956;Kurtosis_quant_stop=-1.1229;SVTYPE=INS;SUPTYPE=AL;SVLEN=416;STRANDS=+-;STRANDS2=7,7,7,7;RE=14;REF_strand=28,30;Strandbias_pval=1;AF=0.241379 GT:DR:DV 0/0:44:14 +chr1 876112 19 N CCCCATACTCCTCCCCCATACTCCCCCATACCCCCCCACACTCCCCCCATACTCCTCCCCCATACTCCCCCATACTCCCCCACACTCCCCCATACTCCTCCCCCATACTCCCCTATACTCCCCACACTCCCCCCAAACTCCCCCCATACTCCTCCCCCATACTCCCCATACTCCCCCACACTCCCCCACACTCCCCCATACTCCCCCACACTGTTCCCCCCATACCTCCCCCATACTCCCCCACACTCCCCCACACTCCCCCACGCTCCTCCCCCACACCCTCCCACACTCCCCCACACTCCCCTACTGCCTTCCCCCACACTCCCCCACACTCCTCCCCATACTCCCCCACACTCCCTCATACTCCCCATACTACCCCAACCTCCCCCATACTCCCCCATACTCCCCACACACTCCCCCCACACTCCCCCCAAACTCCCCCATACTCCTCCCCCAGTACTCCCCCATACTCCCACACTCCCACACTCCCCCACACTCCCCCCATACTCCCCCACACTCCCCCACACTCACTCCACACTCCCCATACTCCCCAAATCTCCCCCATACTCCCACATTCCCCCACACTCCCCACACTCCCCCATACTCCCCCACACTCCCCACACTCACCCACACCCCCCCATACTCCCCAACCTCCCCCATACTCCCCACATTCCCCCATACTCCCCCATACTCCTCCCCCATACTCCCCCCATACTCCCCCACACTCCCCACACTCCCCCATACTCCCCCACACTCCCCATACTCCCCCTGCATCCTCCCCATACTCCCCACATTCCCCCATACTCCCCATACTCCCCACACTCCCCCACACTCCCCCATACTCCCCCTCACACTCCCCCCATACTCCCCAACCTCCCAAACTCCCCCACATTCCCCCATACTCCCCATACTCCCCCAAACTCCCCATACTCCTCCCCTCAATACTCCCCATACTCCCCCATACTGCCCAACCTCCCCATACCCCCCACACTCCCCCCATACTCCCCCCACACCCCCCCCATACTCCCCCACACTCCCCTGCAACTCCCCTTATACTCCTCCCCCATACTCCCCATACTCCCCCCACACTCCCCAAACTCCCCATACTCCTCCCCATACTCCCCATACTCCCCCACACTCCCCCATACTCCTCCCCCATACTCCCCATACCCCCACACTCCCCCATACTCTCCCCATACTCCCCATACTCCCCACACTCCCCCAAACTCCCCCATACTCCTCCCCCATACTCCCCATACTCCCCCACACTCCCCCACACTCCCCCATACTCCCCACACTCCCCCATACTCCCCCAACCTCCCCATACTCCCCCACATTCCCCTATTACTCCCCATACTCCCCAAACTCCCCACATTCCCCCATACTCCCCCATACTCCCCAAACTCCCCCATACTCCTCCCCCACACTCCCCATACTCCCCCATACTCGCCCAACCTCCCCATACTCCCCCACTCCCCCATACTCCCCCACAGTCCCCCACACTCCCCCACACACTCCCCAACCTCCCCCATACTCCCCATACTCGCCCACACTCGCCCACACCCCCCCATACTCCCCACACTCCCCCACACTCCCCCACACCCCCCATACTCCCCCATACTCCCCATACTCCCCCACACCCCCACACT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=876112;STD_quant_start=160.964;STD_quant_stop=281.694;Kurtosis_quant_start=-1.48637;Kurtosis_quant_stop=-0.804806;SVTYPE=INS;SUPTYPE=AL,SR;SVLEN=1649;STRANDS=+-;STRANDS2=7,6,7,6;RE=13;REF_strand=48,38;Strandbias_pval=1;AF=0.151163 GT:DR:DV 0/0:73:13 +chr1 876433 35786 N N[chr4:189980733[ . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr4;END=189980733;STD_quant_start=373.501;STD_quant_stop=193.312;Kurtosis_quant_start=-0.601023;Kurtosis_quant_stop=-0.696578;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=++;STRANDS2=4,3,3,4;RE=7;REF_strand=15,15;Strandbias_pval=1;AF=0.233333 GT:DR:DV 0/0:23:7 +chr1 878423 35787 N ]chr3:198124405]N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr3;END=198124405;STD_quant_start=0.632456;STD_quant_stop=0.316228;Kurtosis_quant_start=4.00716;Kurtosis_quant_stop=2.14525;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=-+;STRANDS2=12,5,12,5;RE=17;REF_strand=34,32;Strandbias_pval=0.182341;AF=0.257576 GT:DR:DV 0/0:49:17 +chr1 878423 36833 N ]chr3:198124405]N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr3;END=198124405;STD_quant_start=2.72029;STD_quant_stop=0.632456;Kurtosis_quant_start=1.9394;Kurtosis_quant_stop=6.69527;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=-+;STRANDS2=3,8,3,8;RE=11;REF_strand=34,32;Strandbias_pval=0.19555;AF=0.166667 GT:DR:DV 0/0:55:11 +chr1 882645 20 ATATATTAGCTATTCTAGACTTTATGCATTTATGTAAAGTTTTCTTTGTTGCACTTTAAGTTCTGTGATACATGGGCAGAGCATGC N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=882732;STD_quant_start=2.70801;STD_quant_stop=2.51661;Kurtosis_quant_start=0.409091;Kurtosis_quant_stop=3.73961;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-87;STRANDS=+-;STRANDS2=8,1,8,1;RE=9;REF_strand=77,90;Strandbias_pval=0.0153702;AF=0.0538922 GT:DR:DV 0/0:158:9 +chr1 883246 35788 N N[chr20:29351529[ . 
STRANDBIAS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr20;END=29351529;STD_quant_start=6.0208;STD_quant_stop=8.59506;Kurtosis_quant_start=4.94502;Kurtosis_quant_stop=2.53006;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=++;STRANDS2=8,0,0,8;RE=8;REF_strand=31,35;Strandbias_pval=0.00564375;AF=0.121212 GT:DR:DV 0/0:58:8 +chr1 883246 35789 N N[chr20:29789177[ . STRANDBIAS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr20;END=29789177;STD_quant_start=1.73205;STD_quant_stop=2.05481;Kurtosis_quant_start=4.74074;Kurtosis_quant_stop=5.09003;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=++;STRANDS2=9,0,0,9;RE=9;REF_strand=30,34;Strandbias_pval=0.00272312;AF=0.140625 GT:DR:DV 0/0:55:9 +chr1 886250 21 N TGTGCTGGCCCTTTGGCAGAGCAGGTGTGGGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=886250;STD_quant_start=14.922;STD_quant_stop=15.3351;Kurtosis_quant_start=-0.359429;Kurtosis_quant_stop=-0.424765;SVTYPE=INS;SUPTYPE=AL;SVLEN=32;STRANDS=+-;STRANDS2=4,2,4,2;RE=6;REF_strand=20,38;Strandbias_pval=0.186216;AF=0.103448 GT:DR:DV 0/0:52:6 +chr1 893790 22 AAAAAAAAAAAAATATATATATATATATATATATAT N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=893826;STD_quant_start=0.738549;STD_quant_stop=0;Kurtosis_quant_start=-1.74362;Kurtosis_quant_stop=-1.42857;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-36;STRANDS=+-;STRANDS2=13,10,13,10;RE=23;REF_strand=28,24;Strandbias_pval=1;AF=0.442308 GT:DR:DV 0/1:29:23 +chr1 907836 23 N CTGCCCGGTCCTTCTGACCAGCCGAGAGAGTA . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=907836;STD_quant_start=11.7346;STD_quant_stop=12.1491;Kurtosis_quant_start=-0.460251;Kurtosis_quant_stop=-0.470373;SVTYPE=INS;SUPTYPE=AL;SVLEN=32;STRANDS=+-;STRANDS2=6,6,6,6;RE=12;REF_strand=34,32;Strandbias_pval=1;AF=0.181818 GT:DR:DV 0/0:54:12 +chr1 909140 24 TTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGTGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGTGGTCGGGGTCAGGCCCCCGGGCGCATCTTTGCTGGTATATGCGGTGGTCGGGGTCAGGCCCCCCGGGCGCACT N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=909494;STD_quant_start=64.8764;STD_quant_stop=64.2294;Kurtosis_quant_start=-2.03504;Kurtosis_quant_stop=-1.70278;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-354;STRANDS=+-;STRANDS2=20,14,20,14;RE=34;REF_strand=39,33;Strandbias_pval=0.680851;AF=0.472222 GT:DR:DV 0/1:38:34 +chr1 934067 25 GGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAG N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=934880;STD_quant_start=19.0606;STD_quant_stop=20.0499;Kurtosis_quant_start=1.46688;Kurtosis_quant_stop=-1.88984;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-813;STRANDS=+-;STRANDS2=12,14,12,14;RE=26;REF_strand=24,33;Strandbias_pval=0.812937;AF=0.45614 GT:DR:DV 0/1:31:26 +chr1 936289 26 AGGGCTCCTGGACGGAGGGGGTCCCCGGTCCCGCCTCCTA N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=936328;STD_quant_start=5.46316;STD_quant_stop=5.2915;Kurtosis_quant_start=0.217921;Kurtosis_quant_stop=0.801437;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-39;STRANDS=+-;STRANDS2=11,15,11,15;RE=26;REF_strand=24,34;Strandbias_pval=1;AF=0.448276 GT:DR:DV 0/1:32:26 +chr1 948662 27 N CCTGGCTGTCCTTGGTCCCCTGGTCCCTTGGCCCTGCA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=948696;STD_quant_start=12.3786;STD_quant_stop=16.4784;Kurtosis_quant_start=-2.07742;Kurtosis_quant_stop=-2.33448;SVTYPE=INS;SUPTYPE=AL;SVLEN=37;STRANDS=+-;STRANDS2=8,19,8,19;RE=27;REF_strand=18,40;Strandbias_pval=1;AF=0.465517 GT:DR:DV 0/1:31:27 +chr1 964642 28 CAGTGGGGATGTGCTGCCGGGAGGGGGGCGCGGGTCCGCAGTGGGGATGTGCTGCCGGGAGGGGGGCGCGGGTCCGCA N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=964717;STD_quant_start=14.3717;STD_quant_stop=16.1442;Kurtosis_quant_start=-1.6698;Kurtosis_quant_stop=-1.78719;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-75;STRANDS=+-;STRANDS2=12,10,12,10;RE=22;REF_strand=26,22;Strandbias_pval=1;AF=0.458333 GT:DR:DV 0/1:26:22 +chr1 976811 29 N CAACCCCGGGAACCGCCTCCCACTCCCCCCACCAACCCCCGGGAACCGCCTCCCACTTCTCCCGCAACCCCGGGAACTGCCTCCCACTCCCTTCTGCAACCCCCGGGAACCGCTCCCACTCCCCGA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=976916;STD_quant_start=53.1169;STD_quant_stop=72.0569;Kurtosis_quant_start=-0.975517;Kurtosis_quant_stop=-0.734689;SVTYPE=INS;SUPTYPE=AL,SR;SVLEN=87;STRANDS=+-;STRANDS2=9,10,9,10;RE=15;REF_strand=36,36;Strandbias_pval=1;AF=0.208333 GT:DR:DV 0/0:57:15 +chr1 977334 30 N CGCTCCCCACTCCCCCGCAACTTCGGGAACCGCCTCCCCACTCCCCCACCAACCCCTGAACCA . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=977334;STD_quant_start=59.633;STD_quant_stop=54.4259;Kurtosis_quant_start=-1.27956;Kurtosis_quant_stop=-1.57241;SVTYPE=INS;SUPTYPE=AL;SVLEN=131;STRANDS=+-;STRANDS2=11,13,11,13;RE=22;REF_strand=38,36;Strandbias_pval=0.814668;AF=0.297297 GT:DR:DV 0/0:52:22 +chr1 977541 31 N CCCCGGAACCGCTCCCACCGCGCGCAACCCCTGAACCGCCTCCCACTCCCCACCAACCCTGGAACCGCCTCCACTCCCCTCTTACCGTT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=977584;STD_quant_start=48.8615;STD_quant_stop=40.2961;Kurtosis_quant_start=-0.497493;Kurtosis_quant_stop=-0.185665;SVTYPE=INS;SUPTYPE=AL;SVLEN=67;STRANDS=+-;STRANDS2=3,6,3,6;RE=9;REF_strand=36,38;Strandbias_pval=0.490453;AF=0.121622 GT:DR:DV 0/0:65:9 +chr1 977848 32 N ACCAACCGGGGAGCCGCCTCCCCTCCCCCCACCCGCCCCGAGCCGCCTGCCCCCGCCACCAACCCCGGGAACCACCTCCCACTCCCCGCCCAACCCCGGGAACCGCCCCTCCCCTCCCCACG . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=977882;STD_quant_start=61.4687;STD_quant_stop=66.9395;Kurtosis_quant_start=-1.68422;Kurtosis_quant_stop=-1.71073;SVTYPE=INS;SUPTYPE=AL;SVLEN=86;STRANDS=+-;STRANDS2=9,10,9,10;RE=19;REF_strand=34,38;Strandbias_pval=1;AF=0.263889 GT:DR:DV 0/0:53:19 +chr1 988831 33 N AGTTCTGGAGTTGATTGTTTCTCAGAGGTTCAGGGTTGAGTGTTC . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=988831;STD_quant_start=6.13314;STD_quant_stop=6.34277;Kurtosis_quant_start=-1.27849;Kurtosis_quant_stop=-1.31557;SVTYPE=INS;SUPTYPE=AL;SVLEN=46;STRANDS=+-;STRANDS2=11,16,11,16;RE=27;REF_strand=22,36;Strandbias_pval=0.815445;AF=0.465517 GT:DR:DV 0/1:31:27 +chr1 996353 34 N GCACCTACATCTGGGGCCACAGGATGCAGGGTGGGGAGGGCAAGGCCTCTGCGGA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=996353;STD_quant_start=25.8341;STD_quant_stop=26.3869;Kurtosis_quant_start=-1.7722;Kurtosis_quant_stop=-1.00787;SVTYPE=INS;SUPTYPE=AL;SVLEN=64;STRANDS=+-;STRANDS2=12,18,12,18;RE=30;REF_strand=30,52;Strandbias_pval=0.826462;AF=0.365854 GT:DR:DV 0/1:52:30 +chr1 998765 35 N GGGGAGGGCGCTGAGCCGAGGGGGAGGGCTGAGCGGGAG . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=998770;STD_quant_start=11.4935;STD_quant_stop=9.94485;Kurtosis_quant_start=-1.80857;Kurtosis_quant_stop=-1.45908;SVTYPE=INS;SUPTYPE=AL;SVLEN=34;STRANDS=+-;STRANDS2=4,10,4,10;RE=14;REF_strand=26,46;Strandbias_pval=0.762111;AF=0.194444 GT:DR:DV 0/0:58:14 +chr1 1030890 36 TGTGTGTGTGTGCAGTGCATGGTGCTGTGAGATCAGCG N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1030928;STD_quant_start=15.6993;STD_quant_stop=15.5285;Kurtosis_quant_start=0.12083;Kurtosis_quant_stop=-0.088691;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-38;STRANDS=+-;STRANDS2=18,13,18,13;RE=31;REF_strand=39,26;Strandbias_pval=1;AF=0.476923 GT:DR:DV 0/1:34:31 +chr1 1041778 37 GGCCAGTGCCAGGGTCGAGGTGGGCGGCTCCCCCGGGGGAGGGCTG N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1041824;STD_quant_start=15.7567;STD_quant_stop=16.687;Kurtosis_quant_start=-0.850302;Kurtosis_quant_stop=-1.83988;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-46;STRANDS=+-;STRANDS2=11,12,11,12;RE=23;REF_strand=23,25;Strandbias_pval=1;AF=0.479167 GT:DR:DV 0/1:25:23 +chr1 1068748 38 N AAGGCCACGCGGGCTGTGCAGATGCAGGTGCGGCGGGGCGGGCCACGCGGGCTGTGAAGGTGCAGGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1068809;STD_quant_start=28.0891;STD_quant_stop=12.2066;Kurtosis_quant_start=-2.18788;Kurtosis_quant_stop=-1.80804;SVTYPE=INS;SUPTYPE=AL;SVLEN=75;STRANDS=+-;STRANDS2=19,14,19,14;RE=33;REF_strand=42,28;Strandbias_pval=0.832838;AF=0.471429 GT:DR:DV 0/1:37:33 +chr1 1076283 39 GCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGCTGGGAGGCTGAGGCTATGGGGACTCCGTCGGGGGAGGCTGAGTCTATGGGGACTCCGTGGGGGGAGGCTGAGGCTATGGGGACTCCGTGCCGGGAGGCTGAGGCTATGGGGACTCCGTGGGGCAGGCTGAGGCTATGGTGACTCCGTGCAGGGCTGTGAGGCTACGGGGACTCCGTGGGGGGTGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGA N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1076735;STD_quant_start=48.1819;STD_quant_stop=70.5606;Kurtosis_quant_start=0.175533;Kurtosis_quant_stop=-1.43403;SVTYPE=DEL;SUPTYPE=AL,SR;SVLEN=-452;STRANDS=+-;STRANDS2=7,11,7,11;RE=18;REF_strand=18,23;Strandbias_pval=0.780972;AF=0.439024 GT:DR:DV 0/1:23:18 +chr1 1076341 40 GCTGGGAGGCTGAGGCTATGGGGACTCCGTCGGGGGAGGCTGAGTCTATGGGGACTCCGTGGGGGGAGGCTGAGGCTATGGGGACTCCGTGCCGGGAGGCTGAGGCTATGGGGACTCCGTGGGGCAGGCTGAGGCTATGGTGACTCCGTGCAGGGCTGTGAGGCTACGGGGACTCCGTGGGGGGTGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGCCGGGAGGCTGAGGCTACGGGGACTCCGTGCGGGGAGGCTGAGTCTACGGGGACTCCGTGAGGGGTGGCTGAGTCTATGGGGACTCCGTGCGGGGAGGCTGAGTCTATGGGGACTCCGTGCGGGGTGGCTGAGGCTATGGGGACTCCGTGCCGGGAGGCTGAGGCTATGGGGACTCCGTGCCGGGAGGCTGAGTCTATGGGGACTCCGTGCCGGGAGGCTGAGTCTATGGGGACTCTGTGCCGGGAGGCTGAGGCTACGGGGACTCCGTGCCGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTATGGGGACTCCGTGGGGGGAGGCTGAGGCTATGGGGACTCCGTGGGGGGAGGCTGAGTCTATGGGGACTCCGTGCCGAGAGGCTGAGTCTATGGGGACTCCGTGCCGGGAGGCTGAGTCTATGGGGACTCCGTGCCGGGAGGCTGAGTCTATGGGGACTCCGTTGGGGGAGGCTGAGGCTATGGGGACTCCGTTGGGGGAGGCTGAGGCTATGGGGACTCCGTTGGGGGAGGCTGAGGCTATGGGGACTCCGTGCGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGG N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1077781;STD_quant_start=341.572;STD_quant_stop=478.388;Kurtosis_quant_start=-1.1971;Kurtosis_quant_stop=-0.402974;SVTYPE=DEL;SUPTYPE=AL,SR;SVLEN=-1440;STRANDS=+-;STRANDS2=3,2,3,2;RE=5;REF_strand=19,23;Strandbias_pval=0.653637;AF=0.119048 GT:DR:DV 0/0:37:5 +chr1 1080919 41 N CTGTCCTTCTCACTTCCTGCCTCGGTCTCTGTCTCCTTCCCTCCGCCCTACCTCGGTCCTATCATCCTTCCTCGCCTACCTCAGGTCCCTGTCTCCTTCCCTCCATACACACTCGGTCCCTGTCTCTCTTCCCTCCGCCTGGTCCCTGTCTCCTTCCCTCCTTCCCCCCACCTCCGGTCCTGTCTCCTTCCCTCCCTTCCGCCTCAGTCTGTCTCACTTCCCTCCGCCCACCTCAGTCCCCTGTCTCCTTCCCTCCCACACTTCCTCTGGTCCTGTCTCCTTCCCTCAAGCCCCCTCAGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1080919;STD_quant_start=32.45;STD_quant_stop=47.856;Kurtosis_quant_start=-1.30067;Kurtosis_quant_stop=-1.55852;SVTYPE=INS;SUPTYPE=AL;SVLEN=289;STRANDS=+-;STRANDS2=12,9,12,9;RE=21;REF_strand=26,20;Strandbias_pval=1;AF=0.456522 GT:DR:DV 0/1:25:21 +chr1 1139106 42 GGGTCAGAAGGTGGGGGTGTCAACGTCGAACCGGGGGACCTGGGTCCTGGGGAGTTTCCTGGGGTCAGAAGGTAGGGGTGTCAATGTCGAACCGGGGGACCTGGGTCCTGGGGAGCTTCCT N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1139227;STD_quant_start=15.678;STD_quant_stop=18.4038;Kurtosis_quant_start=-2.1908;Kurtosis_quant_stop=-2.23094;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-121;STRANDS=+-;STRANDS2=15,6,15,6;RE=21;REF_strand=41,31;Strandbias_pval=0.312886;AF=0.291667 GT:DR:DV 0/0:51:21 +chr1 1140200 43 AGGTGGGGGTGTCAACGTCGAACCGGGGGGCCTGGGTCCTGGGGAGCTTCCTGGGGTC N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1140259;STD_quant_start=17.0822;STD_quant_stop=14.6151;Kurtosis_quant_start=0.610656;Kurtosis_quant_stop=0.583042;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-59;STRANDS=+-;STRANDS2=14,6,14,6;RE=20;REF_strand=44,36;Strandbias_pval=0.312112;AF=0.25 GT:DR:DV 0/0:60:20 +chr1 1140410 44 N CGTCCGAACCGGGGGGACCTGGGTCCTGGGAGCTTCCTGGGTTCAGAAGGTGGGGGTGTCAGCATCGAACCGGGGGACCTGAGTCCTGGGAGCTTCCTGGGGTCAGAAGGTGGGGGTGTCAGCATCGAACCGGGGGACCTGGGTCCTGGGGGCTTCCTGGGGTCAGAAGGTGGGGGTGTCAACATCGAACCGGGGGGCCCTGGGAGTCCTGGGAGCTTCTGGGGTCAGAAGGTGGGAGTGTCCAGCATCGAACCGGGGGGACCTGGGTCCTGGGGAGCTTCCTGGGGTCAGAAGGTAGGGGTGTCAGCATCGAACCGGGGGACCTGGGTCATGGGGGCTTCCTGGGGTCAGAAGGTGGGGGTGTCAACGTCGAACCGGGGGGCCTGGGTCCTGGGAGCTTCCTGGGGTCAGAAGGTAGGGGTGTCAACGTCAGACAGGGGACCTGGGTCCTGGGAGCTTCCTGGGGTCAGAAGGTGGGGGTGTCAACGCGTCGAACCGGGGGGACCTGGGTCCTGGGGAGCTTCCTGGGGTCAGAAGGTGGGGGTGTCCAG . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1140410;STD_quant_start=232.849;STD_quant_stop=281.385;Kurtosis_quant_start=1.04118;Kurtosis_quant_stop=0.939709;SVTYPE=INS;SUPTYPE=AL;SVLEN=536;STRANDS=+-;STRANDS2=4,7,4,7;RE=11;REF_strand=42,40;Strandbias_pval=0.522879;AF=0.134146 GT:DR:DV 0/0:71:11 +chr1 1141388 45 N TCATCCTCTGTCCACAACCCCATCCTTACCTCTATCCCCCACCTTACATCTCATTCCTCTATCCCT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1141394;STD_quant_start=10.6724;STD_quant_stop=14.1704;Kurtosis_quant_start=-1.44054;Kurtosis_quant_stop=-1.65538;SVTYPE=INS;SUPTYPE=AL;SVLEN=55;STRANDS=+-;STRANDS2=14,7,14,7;RE=21;REF_strand=44,40;Strandbias_pval=0.327428;AF=0.25 GT:DR:DV 0/0:63:21 +chr1 1168031 46 CGGGGCCAGCAGACGGGTGAGGGCGGAGGGCCGA N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1168063;STD_quant_start=14.2864;STD_quant_stop=13.7514;Kurtosis_quant_start=-1.22974;Kurtosis_quant_stop=-1.1581;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;STRANDS2=7,8,7,8;RE=15;REF_strand=38,32;Strandbias_pval=0.776548;AF=0.214286 GT:DR:DV 0/0:55:15 +chr1 1212606 47 N CAGCCCTCCTCCCAGCCCCTGGCTCCCTCTGCCCCCTCA . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1212613;STD_quant_start=10.1207;STD_quant_stop=8.7014;Kurtosis_quant_start=-0.032446;Kurtosis_quant_stop=-1.09448;SVTYPE=INS;SUPTYPE=AL;SVLEN=32;STRANDS=+-;STRANDS2=2,5,2,5;RE=7;REF_strand=30,28;Strandbias_pval=0.42665;AF=0.12069 GT:DR:DV 0/0:51:7 +chr1 1226331 48 CCCTCAACCCTGTACGGTCAGGAGGAAACATGGCACCTCCCCTCTGGGGGCTCTTTCCAGAAAC N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1226395;STD_quant_start=5.74456;STD_quant_stop=5.1672;Kurtosis_quant_start=-1.50778;Kurtosis_quant_stop=-1.62886;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-64;STRANDS=+-;STRANDS2=8,7,8,7;RE=15;REF_strand=34,39;Strandbias_pval=0.77815;AF=0.205479 GT:DR:DV 0/0:58:15 +chr1 1227293 49 GCGGGAAGGCGAGCTCGTGGCCAGGCCCTGCGGGAAGGCGAGCTCGTGGCCAGGCCCGGCGGGAAGGCGAGCTCGTGGCCAGGCCCGGCGGGAAGGCGAGCTCGTGGCCAGGCCCGGCGGGAAGGCGAGCTCGTGGCCAGGCCCTGCGGGAAGGCGAGCTCGTGGCCAGGCCCT N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1227467;STD_quant_start=2.14476;STD_quant_stop=1.34164;Kurtosis_quant_start=0.1517;Kurtosis_quant_stop=0.426462;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-174;STRANDS=+-;STRANDS2=8,8,8,8;RE=16;REF_strand=32,42;Strandbias_pval=0.782406;AF=0.216216 GT:DR:DV 0/0:58:16 +chr1 1240679 50 N CCGCCCCCATTCACCCCGGCCGTGGTCCCTACCGCAGCCCCA . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1240683;STD_quant_start=7.86398;STD_quant_stop=8.59008;Kurtosis_quant_start=-1.97851;Kurtosis_quant_stop=-1.85397;SVTYPE=INS;SUPTYPE=AL;SVLEN=40;STRANDS=+-;STRANDS2=16,22,16,22;RE=38;REF_strand=34,48;Strandbias_pval=1;AF=0.463415 GT:DR:DV 0/1:44:38 +chr1 1245159 51 N CTCTGCCCTCCTCCCACCTTCCCCCTCCTCCCCCCACTCCCTCTCCCCTCTTCCCCCGACTCCCTTCCCCTACTCATCTCCTCCTCACCCACTCCTCTCCCCCTCCTCTCCCACTCCTCCCCCTCCTCCCCCCCACTCCTCCCCCCACTGCACTCTCCCCTCTTCCCCCACTCCTCCCCACTCCTCTCCCCTCCTTCTCACCTCCTCTCCCCTCCTCCTCCTCCTGTCCCTCCTCCCCCTCTTCCCCCTCCTCCCCATATACCCTCCTCCTCCTCTCCCTCTTCCTCCCACTCCCCCCACTCCTCCCCACTCCTCTCCCCTCTTGCCCCTCCTCCCTACCACTCCTTCCTCCTCTCCTCTCTTCCCCCCACTCCCTCCCCCACTCCTCTCCTCCTCCACCTCCTCTCCCCTCCTCCCCCACTCCTCTCT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1245159;STD_quant_start=3.43996;STD_quant_stop=14.8436;Kurtosis_quant_start=4.9032;Kurtosis_quant_stop=3.59937;SVTYPE=INS;SUPTYPE=AL;SVLEN=439;STRANDS=+-;STRANDS2=12,13,12,13;RE=25;REF_strand=28,36;Strandbias_pval=0.813884;AF=0.390625 GT:DR:DV 0/1:39:25 +chr1 1248060 52 GATCTCCAACTCTGACCTACAGGCAGGAAAGTGGGCAGCCCTGGGAGGCTGGACTGAGGGAGGCTGGACTTCCCACTCAGGCCTACACGCAGGAAAATGGGCAGCCCTGGGAGGCTGGACCGAGGGAGGCTGGGCCTCCCACTCCACCCTACAGGCCAGGACACGGGCAGCCCTGGGAGGCTAGACCGAGGGAGGCTGGGCCTCCCATCTACCCTACAGGCCGGGACACAGGCAGCCCTGGGAGGCTGTACCGAGGGA N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1248319;STD_quant_start=52.6064;STD_quant_stop=21.0815;Kurtosis_quant_start=3.96322;Kurtosis_quant_stop=3.62028;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-259;STRANDS=+-;STRANDS2=2,5,2,5;RE=7;REF_strand=23,28;Strandbias_pval=0.686983;AF=0.137255 GT:DR:DV 0/0:44:7 +chr1 1249348 53 N CGCTCACACCCTACAGGCCGGGACACGGGCAGCCCTGGGAGGCTGGACCGGGGCTGGGCCTCCCCTCCGCCCTACAGGCCGGGACACGGGCAGCCCTGGGGCTGGACCAGGGGAGGCGCCAGGCCTCCCACTCGCCTACAGGCCGGGACACGGGCAGCCCTGGGAGGCTGGACCCCGAGGGAGGCTGGAGCCTC . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1249395;STD_quant_start=37.7478;STD_quant_stop=65.9212;Kurtosis_quant_start=-1.75534;Kurtosis_quant_stop=-1.77327;SVTYPE=INS;SUPTYPE=AL;SVLEN=124;STRANDS=+-;STRANDS2=8,10,8,10;RE=18;REF_strand=26,28;Strandbias_pval=1;AF=0.333333 GT:DR:DV 0/1:36:18 +chr1 1249588 54 N TGGGATCGAGAGCTGGCTCCCACCGCCTCCCAGGCCTGGACACTGCAGCCCTGGGAGGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1249625;STD_quant_start=47.571;STD_quant_stop=45.3122;Kurtosis_quant_start=-1.76881;Kurtosis_quant_stop=-0.375917;SVTYPE=INS;SUPTYPE=AL;SVLEN=61;STRANDS=+-;STRANDS2=6,6,6,6;RE=12;REF_strand=26,28;Strandbias_pval=1;AF=0.222222 GT:DR:DV 0/0:42:12 +chr1 1284183 55 N TGAGGGGGTGGGGTGGGGGTTGAGTGAGGGGGTGGGGGGGTTGGGTGAGGGGGGTGGGGGGTTGGGTGAGGGGGTGGGGGGCTGGTGAGGGGGTGGGGTTGGGTGAGGGGGTGGGCTCGGGGGGGGTTGAGTGAGGGGGTGGGGTGGGGGGTTGGGTGAGGGGGGTGGGGTGGGGGTTGAGGAGGGGGTGGGGTGTTATGAGGGGTTGGGGGTTGGGTGAGGGGGGGTGGGGGTTGCGAGGGGGTGGGGGGTGGGGGGGTTGAGTGAGGGGTT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1284201;STD_quant_start=9.78895;STD_quant_stop=64.8333;Kurtosis_quant_start=-1.28536;Kurtosis_quant_stop=-1.87274;SVTYPE=INS;SUPTYPE=AL;SVLEN=282;STRANDS=+-;STRANDS2=18,16,18,16;RE=34;REF_strand=49,32;Strandbias_pval=0.535485;AF=0.419753 GT:DR:DV 0/1:47:34 +chr1 1288944 56 N CGTGTCCCTGCTCCGGGCCCCGTGTCTCTGTTCACTGGCCCCCGTGTCTCTGCTCCTCGTCCCGTGTCCCTTGCTCCGCCCTGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTTCTGCTCCGTCCTGTGTCTCTTGCTCCGGCCCCCGCGGTCTCTGCT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1289049;STD_quant_start=87.6482;STD_quant_stop=60.9024;Kurtosis_quant_start=-0.741898;Kurtosis_quant_stop=-0.516487;SVTYPE=INS;SUPTYPE=AL;SVLEN=64;STRANDS=+-;STRANDS2=6,5,6,5;RE=11;REF_strand=28,34;Strandbias_pval=0.74488;AF=0.177419 GT:DR:DV 0/0:51:11 +chr1 1289357 57 N TGCTCCGTCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCTCCCCGTGTCTCTGCCCCGTCCCGTGTCTACTCCGTCCCGGTCTCTGCTCCGTCCCCCGTGTCTACTCCGTCCCCCGTGTCTCTGCTCCGTCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCGTCCCCGTGTCTCTGCTCCGTCCGTGTCTCTGCTCAATCCCCCGTGTCTCTGCCCCGTCCCGATGTCTCTGCTCCGTCGATGTCTCTATGAGCTCTCCCGTGTCTGCTCCGTCCGTGTCTCTGCTCCGTCCGATGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1289476;STD_quant_start=159.932;STD_quant_stop=97.4515;Kurtosis_quant_start=-1.84527;Kurtosis_quant_stop=-1.54002;SVTYPE=INS;SUPTYPE=AL;SVLEN=162;STRANDS=+-;STRANDS2=8,9,8,9;RE=16;REF_strand=28,32;Strandbias_pval=1;AF=0.266667 GT:DR:DV 0/0:44:16 +chr1 1289780 58 N CCCCGTGTCTCTGCTCCGTCCGTGTCTACTCCGTCCCGATGTCTCTGCCACGTCCCCGTGTCTCTGCCCCGTCCCCGTGTCTCTGCCCCGTC . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1289823;STD_quant_start=26.7955;STD_quant_stop=30.8675;Kurtosis_quant_start=-0.210759;Kurtosis_quant_stop=-0.202563;SVTYPE=INS;SUPTYPE=AL;SVLEN=91;STRANDS=+-;STRANDS2=8,7,8,7;RE=15;REF_strand=28,30;Strandbias_pval=0.778387;AF=0.258621 GT:DR:DV 0/0:43:15 +chr1 1290106 59 N GTGTCTCTGTCTGGCCCCCCGTGTCTCTGCTCCAGCCCCGTGCCCTGCTCCTCATT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1290106;STD_quant_start=38.9923;STD_quant_stop=30.4023;Kurtosis_quant_start=0.491131;Kurtosis_quant_stop=-0.169138;SVTYPE=INS;SUPTYPE=AL;SVLEN=103;STRANDS=+-;STRANDS2=8,9,8,9;RE=16;REF_strand=30,34;Strandbias_pval=1;AF=0.25 GT:DR:DV 0/0:48:16 +chr1 1324173 60 N GGGCTCAGGGGCTGGGGGCTGCTGGGCTGAGGCTGGGGAGACTGGA . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1324173;STD_quant_start=5.02933;STD_quant_stop=13.0384;Kurtosis_quant_start=-1.45664;Kurtosis_quant_stop=-1.54076;SVTYPE=INS;SUPTYPE=AL;SVLEN=66;STRANDS=+-;STRANDS2=19,16,19,16;RE=35;REF_strand=38,36;Strandbias_pval=0.838779;AF=0.472973 GT:DR:DV 0/1:39:35 +chr1 1350109 61 GGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCTGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCTGGAGCGACGGGGGGAGTGAG N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1351186;STD_quant_start=20.5878;STD_quant_stop=20.3663;Kurtosis_quant_start=-0.011268;Kurtosis_quant_stop=-1.64329;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-1077;STRANDS=+-;STRANDS2=15,13,15,13;RE=28;REF_strand=32,29;Strandbias_pval=1;AF=0.459016 GT:DR:DV 0/1:33:28 +chr1 1366913 62 TGAATTGGTGAGTTGGTGTGAATTGAATTGTGTGAATGAGTGGATTGGTGAGTGAATTGGTGAGTTGAATTGGTGTGTGTAGTGGATGAGTGTGGATGAATGTGAATTGGCGAGTATGGA N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1367033;STD_quant_start=14.0961;STD_quant_stop=39.5335;Kurtosis_quant_start=0.90213;Kurtosis_quant_stop=3.80352;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-120;STRANDS=+-;STRANDS2=7,3,7,3;RE=10;REF_strand=36,30;Strandbias_pval=0.499306;AF=0.151515 GT:DR:DV 0/0:56:10 +chr1 1382683 63 N CAACAATCCAGTAACAATCCAGAGGTCACCACCCTTCCCAACAATCCAGTAATCCAGAGGTTACCACCCTTCCCAACAATCCACTAACAATCCAGAGGCCACCACCCTTCCCAGCAATCGGCAAGGACCCAGAGGCCACCACCCCTTCCCAACAATCCAGTAACAATCCAGAGGGTCACCACCCCTTCCCAAAATCAGTAACCAGGGAGTCCACCACCCCTTCCCAACAATCCAGTAACAATCCAGAGGCCACCACCCCTTCCCACAACAATCCAGTAACAATCCAGAGGTACCACCCTTCCCAACAATCCAGTAACAATCGACCACCACCCTTCCCAACAATCCAGTAACAATCCAGAGGACACCACCCTTCCCAGCAATCCACTAGCAATCCAGAGGCCACCACCCCTTCCCAACAATCTGGCTTAGCGACCAGAGAGCCACCACCCCTTCCCAACAATCAGTAACAATCCAGGAGTCACCACCGCTT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1382683;STD_quant_start=27.5276;STD_quant_stop=144.602;Kurtosis_quant_start=-2.03439;Kurtosis_quant_stop=-2.16418;SVTYPE=INS;SUPTYPE=AL;SVLEN=494;STRANDS=+-;STRANDS2=11,15,11,15;RE=26;REF_strand=24,34;Strandbias_pval=1;AF=0.448276 GT:DR:DV 0/1:32:26 +chr1 1427516 64 N TCCCAAGTCTCGGCCTCCCTCTCCACCCCTCCCCTTTCCCCTGCATCACCCCGCCCAGCCCCCACCCCTCCATCACCCTGCTCCCGCCCCCTCCCCTCCATCCTGCCCCCCTCCCCCTCCATCACCCTGCCCAGCCCCCTCCCCTCCATCACTCCCAAGCCCTGCCCCCTTCCATCACCCTGCCCTGCCCCCACCCCATCACCCTGCCCTGCCCCCTTCCCCTCCATCATCCCGCCCGCTCCCCTCTCCACCCCTCCCTCTCCCCTGCATCACTCCCTGCCCTGCCCCTTTCCCCCTCCATCACCCCAGCCTCTGCCCCTCCCCTCCCCTCCATCACCCTGCCCTGCCCTCCTCTCTCCATCACTCCCTGTCTCTGCCCCCACCCCTCCATCATTCTGCCCTGCTCCTCTCCACCTCCCCCTTCCCCTGCATCACCCAGCCTTCTAAGTTCCTTCCTCCATCAATTCTGCCTCTGCCCCTCCCCTCCATCACTCCTGGCACTCTGCCCTCCCTCCATCACTCCTGCTCCTGCCCCCCACCCCTCCATCATCTTCACACTTTCCACCCCTCCCTTCCCTTCCCCTGCATCATCTGCACTCCTGCCTCTTCCCCCTCCATCACCCTGCCCAGCCCCCTCCCCTCCACTGCTTGCGCCCTCCTCCATCACACCCCGGCCCTGCCCCCAGCTCCG . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1427892;STD_quant_start=48.4345;STD_quant_stop=77.4325;Kurtosis_quant_start=-1.98608;Kurtosis_quant_stop=-1.067;SVTYPE=INS;SUPTYPE=AL;SVLEN=439;STRANDS=+-;STRANDS2=12,9,12,9;RE=21;REF_strand=38,32;Strandbias_pval=1;AF=0.3 GT:DR:DV 0/0:49:21 +chr1 1428085 65 N AGGAGGGAGGGGGAGGAGGGGAGGAAGAAGAAGGAGGAAGAGGAAGGAGGAAAAGAGGAGGAGGAAAGAGAGAGGAAGAAAGGAGGGGAGGAGAAAGAGGAGGGGACAGGAGGGAAGGAGGAGAGAAAGAGGAAAAGAAAGGAGGGAGGGAAGGAGAGAGGAGAGGAAGAGAGAGACAGGGAAGGGAAAGAAAAACAGGGAGGGGAAGGAGGAGGAAGAGGAGGGAAGGAAGAAGAGGAGGAGAGGGAGGGAAGAGAGGAGGGAAAGAGGGAGGAGGAAGAGGGGGCAGGGGAGGAAGAAGAGAACA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1428085;STD_quant_start=33.6102;STD_quant_stop=42.9676;Kurtosis_quant_start=4.12818;Kurtosis_quant_stop=-1.69999;SVTYPE=INS;SUPTYPE=AL;SVLEN=222;STRANDS=+-;STRANDS2=16,13,16,13;RE=29;REF_strand=42,34;Strandbias_pval=1;AF=0.381579 GT:DR:DV 0/1:47:29 +chr1 1442871 66 N TTTCTATGGTAATGGTGATAAACCAAGTCAA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1442871;STD_quant_start=13.6345;STD_quant_stop=12.775;Kurtosis_quant_start=-1.56517;Kurtosis_quant_stop=-1.35954;SVTYPE=INS;SUPTYPE=AL;SVLEN=31;STRANDS=+-;STRANDS2=9,10,9,10;RE=19;REF_strand=32,38;Strandbias_pval=1;AF=0.271429 GT:DR:DV 0/0:51:19 +chr1 1443674 67 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1443708;STD_quant_start=0;STD_quant_stop=1.04881;Kurtosis_quant_start=-0.5;Kurtosis_quant_stop=-1.79438;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-34;STRANDS=+-;STRANDS2=6,14,6,14;RE=20;REF_strand=34,42;Strandbias_pval=0.310419;AF=0.263158 GT:DR:DV 0/0:56:20 +chr1 1469099 68 TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA N . 
STRANDBIAS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1469134;STD_quant_start=10.2652;STD_quant_stop=9.18559;Kurtosis_quant_start=3.94733;Kurtosis_quant_stop=2.94515;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-35;STRANDS=+-;STRANDS2=0,8,0,8;RE=8;REF_strand=42,38;Strandbias_pval=0.00589579;AF=0.1 GT:DR:DV 0/0:72:8 +chr1 1477855 69 N CACCACGCCCGGCTAATGTTGTATTTTTAGTAGAGACGGGTTTCTCCCATGGTCAGGCTGGTCTCTAACTCCCGACCTCAGGTGATCCACCCGCCTCGGCCTCTCAACCAGTTGGGATTACAGGCATGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1477883;STD_quant_start=11.94;STD_quant_stop=21.8689;Kurtosis_quant_start=-1.766;Kurtosis_quant_stop=-1.90683;SVTYPE=INS;SUPTYPE=AL;SVLEN=131;STRANDS=+-;STRANDS2=17,15,17,15;RE=32;REF_strand=36,38;Strandbias_pval=0.832673;AF=0.432432 GT:DR:DV 0/1:42:32 +chr1 1497123 70 N CCTCGGCCTGGGCACGAACGGTCCCATCGAGAGCAGA . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1497123;STD_quant_start=3.84708;STD_quant_stop=4.3589;Kurtosis_quant_start=-1.05282;Kurtosis_quant_stop=-1.08734;SVTYPE=INS;SUPTYPE=AL;SVLEN=40;STRANDS=+-;STRANDS2=7,8,7,8;RE=15;REF_strand=34,36;Strandbias_pval=1;AF=0.214286 GT:DR:DV 0/0:55:15 +chr1 1554173 71 CTAAGGGGTCCCCACGAAGCTGAGCACGAGGCGGATCCGGAC N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1554213;STD_quant_start=9.73653;STD_quant_stop=10.0846;Kurtosis_quant_start=0.855249;Kurtosis_quant_stop=1.94672;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-40;STRANDS=+-;STRANDS2=6,7,6,7;RE=13;REF_strand=34,28;Strandbias_pval=0.760968;AF=0.209677 GT:DR:DV 0/0:49:13 +chr1 1595833 72 GAGCAGAACAGGGAGAGACAGAGAGAGAGAGACAGAGAGAGGCAGACAGAGACAGAGAGAGAGACAGACAC N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1595902;STD_quant_start=33.7313;STD_quant_stop=33.2971;Kurtosis_quant_start=2.73708;Kurtosis_quant_stop=2.38236;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-69;STRANDS=+-;STRANDS2=5,5,5,5;RE=10;REF_strand=36,42;Strandbias_pval=1;AF=0.128205 GT:DR:DV 0/0:68:10 +chr1 1595853 73 N ACAGAGAGACAGAGAGAGAAACAGAGAGACAGAGACAGAGAGGCAGACAGAGAGAGACAGACAGAGAGCAGAACAGGGAGAGACAAAAGAGACAGAGAGAGAGAGACAC . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1595863;STD_quant_start=39.2785;STD_quant_stop=49.7011;Kurtosis_quant_start=-0.536228;Kurtosis_quant_stop=-0.678589;SVTYPE=INS;SUPTYPE=AL;SVLEN=77;STRANDS=+-;STRANDS2=3,2,3,2;RE=5;REF_strand=36,42;Strandbias_pval=0.661994;AF=0.0641026 GT:DR:DV 0/0:73:5 +chr1 1605690 74 N GGCTGGGCTGGTCAGGTGTAGGCTGGGCTGGTCAGGCGTGGAGTGGGCTGGTCAGGCGTGGGGTGGGGTGGGCTGGTCAGGTGTGGGCTGGGCCTGGTCAGGTGTGAGGTGGGGTGGTGGGGGTGAGGGGGTTGTCTGGTCAGGTGTGGAGTGGGCTGGTCAGGTGTGGGCTGGGCTGGTCCAGACAGGGTCGGCTGGTCAGGTGTGGGCTGGGCTGGGCTGGTCAGGTGTGGGGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1605690;STD_quant_start=28.6112;STD_quant_stop=43.4385;Kurtosis_quant_start=-0.191101;Kurtosis_quant_stop=-1.21501;SVTYPE=INS;SUPTYPE=AL;SVLEN=226;STRANDS=+-;STRANDS2=7,7,7,7;RE=14;REF_strand=38,40;Strandbias_pval=1;AF=0.179487 GT:DR:DV 0/0:64:14 +chr1 1666975 75 CACGCCTGTAATCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATCACTTCAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCAAACCAGAGAAATCCAGCTCTGGGTGACAGAGCAAGACTCTGTTTCGGGAAAAATAAAATACATAGGCAGGGCGCGGTGGCT N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1667142;STD_quant_start=0;STD_quant_stop=0;Kurtosis_quant_start=11.8809;Kurtosis_quant_stop=8.99409;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-167;STRANDS=+-;STRANDS2=18,14,18,14;RE=32;REF_strand=38,28;Strandbias_pval=1;AF=0.484848 GT:DR:DV 0/1:34:32 +chr1 1681989 76 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA N . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1682020;STD_quant_start=0;STD_quant_stop=1.30384;Kurtosis_quant_start=6.9449;Kurtosis_quant_stop=-0.962407;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-31;STRANDS=+-;STRANDS2=10,8,10,8;RE=18;REF_strand=34,26;Strandbias_pval=1;AF=0.3 GT:DR:DV 0/0:42:18 +chr1 1717605 77 GCTTTCAGCTAGAGTTTGCTCTCTCTGGTTTTCGGTCTGTGACACACGCAT N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1717656;STD_quant_start=8.22935;STD_quant_stop=8.90381;Kurtosis_quant_start=-1.60307;Kurtosis_quant_stop=-1.75287;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-51;STRANDS=+-;STRANDS2=18,19,18,19;RE=37;REF_strand=40,39;Strandbias_pval=1;AF=0.468354 GT:DR:DV 0/1:42:37 +chr1 1749606 78 N GTCCATGCATATTTTTCTGTGTGATGTGTCTGTGTGTGTGTCTCAGTGGT . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1749616;STD_quant_start=6.64118;STD_quant_stop=6.18572;Kurtosis_quant_start=-1.82765;Kurtosis_quant_stop=-1.82115;SVTYPE=INS;SUPTYPE=AL;SVLEN=48;STRANDS=+-;STRANDS2=19,19,19,19;RE=38;REF_strand=42,38;Strandbias_pval=0.84535;AF=0.475 GT:DR:DV 0/1:42:38 +chr1 1766411 79 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1766446;STD_quant_start=2.64575;STD_quant_stop=6.72681;Kurtosis_quant_start=1;Kurtosis_quant_stop=2.21022;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-35;STRANDS=+-;STRANDS2=2,6,2,6;RE=8;REF_strand=42,40;Strandbias_pval=0.267342;AF=0.097561 GT:DR:DV 0/0:74:8 +chr1 1845825 80 ACACACACACACACACACACACACACACACAC N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1845857;STD_quant_start=4.42396;STD_quant_stop=5.59336;Kurtosis_quant_start=0.08546;Kurtosis_quant_stop=1.51911;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;STRANDS2=4,3,4,3;RE=7;REF_strand=26,26;Strandbias_pval=1;AF=0.134615 GT:DR:DV 0/0:45:7 +chr1 1924230 81 N CCCCCAGCCTGCAGCCCACCCCCCCATCTCACCGCCTAGCCCCCATCTCACCAGCTGCCCCCTCCCCGACACACGCCCACCCCCTTATCTCACCAACCA . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1924231;STD_quant_start=0.948683;STD_quant_stop=2.72029;Kurtosis_quant_start=0.969632;Kurtosis_quant_stop=-0.273855;SVTYPE=INS;SUPTYPE=AL;SVLEN=96;STRANDS=+-;STRANDS2=7,7,7,7;RE=14;REF_strand=36,28;Strandbias_pval=0.770084;AF=0.21875 GT:DR:DV 0/0:50:14 +chr1 1929385 82 N AGGGGACAGGTCTGGGGGGGAGGCAGGAGAGAGGGTGAGGGGGAGGCAGGAGTGGGGGAGGGAGGGGAGAGGGTAGGGAGGGAGGAGAGGGTAGGGGGAGGGAGGGAGAGAGGAGGAGGGGAGAGGGTGGGAGGGAGAGAGGAGGAGAAGGGAGGGGACATGGGGAGGGGAGAGGAAAGAGGAGGGAGGGAGAGGGGAGGGAGGGAGCGGGTGAGGGGAGGGAAAGGAGGGAAATGGTATGGGAGGGGAGGGAGGGGAGAGGGTGAGGGGGAGGGAGCAGAGGGAAAGGGTGGGGGAGGGAAGGAAGGGAGAGGGTGGGGGAGGGTAGGGAGGGAGGGAGAGAGAGGGTAGGGGGAGGGGGAGAGAGGGTGAGGAGGGGGAGGGTAGGGGAGGGAAGGAGGGGAGACGGTGAGGGAGGGAGGAGAGGGTAGGGGGGAGGGAGGAAGAGGAGGGGTAGGGAGGGAGGGAGAGGAGAGGGAGGAGGGGAGGAGGGGGAGAGAGGGGTAGGGAGGGAGGGGAGGGAGGGAAGAGGGTAGGAGGGAGGGAGAGGAGAGGGAGGGAGGGAGGGGAGGAGGGAGGGTGGGAGGAGGGAGAGGGTTAGGGGAGGGAGGGAGAGGGAGGGGGAGAGGGTAGGGAGGAGAGGAGGAGAGGGTAGAGGAGGGAGGAGGGGAGAGGGGAGGGGAGGGAGGGAGAAGAGGAGGGAGAGGGTAGGGAGGGAGGGAGAGGAGAGGGGGAGAGGGAGGAGGAGGAGGAGAGGGTAGGGAGGAGGGGAGGAGGGGAGGGGTAGGGAGGGAGGGAGAGGAGGGAGGGAGGGAGGGGGAGGAGGGGGAGAAAGTTAGGGAGGGAGGGAGAGGAGAGGGGGAGGGAGGGAGGGGGAGGAGGAGAGGGGTAGGGAGGGAGGAAGGGAGGGAGGGAGGAGGGCAGGAGGGGAAATTGGGAGGGAGGGGCAGGAGGGAGAGGGTAGGGAGGGAGGGCAGGAGGGAGAGGGTAGGGAGGGAGGGAGGAAGGGAGGGAGGTAGGGAGGAGGAGGAGAGGGTAGGGAGGGAGGAGGAGGGGA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1929385;STD_quant_start=0;STD_quant_stop=12.4023;Kurtosis_quant_start=0.385661;Kurtosis_quant_stop=-0.139538;SVTYPE=INS;SUPTYPE=AL;SVLEN=1062;STRANDS=+-;STRANDS2=14,9,14,9;RE=23;REF_strand=46,34;Strandbias_pval=0.814909;AF=0.2875 GT:DR:DV 0/0:57:23 +chr1 1934289 83 N TACACAGGTGTACATTAGATTATTAGGTTGTGAAT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1934289;STD_quant_start=26.898;STD_quant_stop=24.4172;Kurtosis_quant_start=-1.47698;Kurtosis_quant_stop=-1.25573;SVTYPE=INS;SUPTYPE=AL;SVLEN=98;STRANDS=+-;STRANDS2=10,4,10,4;RE=14;REF_strand=34,20;Strandbias_pval=0.755487;AF=0.259259 GT:DR:DV 0/0:40:14 +chr1 1949003 84 N CTTCCCTTCCCCTTCCTTCCTTCTCTCCCTCTCCCTCCTCCTCTTCCCTCCTTTCCTTCCTTCCTTTCCCTTTCCTCCTTCCTCTCCCTCCCCTCCTTTCCCCTTTTCATTCCCTCTTCCCT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1949044;STD_quant_start=33.9013;STD_quant_stop=7.46324;Kurtosis_quant_start=-2.16185;Kurtosis_quant_stop=-0.295908;SVTYPE=INS;SUPTYPE=AL;SVLEN=111;STRANDS=+-;STRANDS2=10,9,10,9;RE=19;REF_strand=30,48;Strandbias_pval=0.304283;AF=0.24359 GT:DR:DV 0/0:59:19 +chr1 1968925 85 CCCTCCTGGGGGCTCCGGTCCTGCCCAGCAGCCCCAGGTGAGACAGCGCCTGGCGGCCCCTCCCTAGCT N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1968994;STD_quant_start=2.28035;STD_quant_stop=2.25832;Kurtosis_quant_start=0.046742;Kurtosis_quant_stop=0.304863;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-69;STRANDS=+-;STRANDS2=7,10,7,10;RE=17;REF_strand=30,42;Strandbias_pval=1;AF=0.236111 GT:DR:DV 0/0:55:17 +chr1 1979021 86 AGGCTGCACAGAACACGTGTGTCGTGCTGAGCTGGGCGTGGGAAGGCGTCATGTGACGAGGCTGCACAGAACATGCGTGTGGTACTGAGCTGGGCGTGGGAAGGTGTCACGTGACAAGGCTGCACAGAACATGTGTGTGGTACTGAGCTGGGCGTGGGAAGGCATCATGTGACA N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1979168;STD_quant_start=12.3369;STD_quant_stop=9.86577;Kurtosis_quant_start=3.47666;Kurtosis_quant_stop=3.76987;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-147;STRANDS=+-;STRANDS2=16,15,16,15;RE=31;REF_strand=34,35;Strandbias_pval=1;AF=0.449275 GT:DR:DV 0/1:38:31 +chr1 1980059 87 CTCTTACCGCGTGGGGAGGACGGGTGAACGAGAGTGTATCTAAGCCACCGGCACAGATCGCAGTGGGCGCCCTCTTACCGCGTGGGGAGGACGGGTGAACGAGAGACTGTATCTAAGCCACCGGCACAGATCGCAGTGGGCGCCCTCTTACCGCGTGGGGAGGACGGGTGAACGAGAGACTGTATCTAAGCCACCGGCACAGATCGCAGTGGGCGCCCT N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1980291;STD_quant_start=9.01234;STD_quant_stop=11.0454;Kurtosis_quant_start=-1.20742;Kurtosis_quant_stop=-0.902165;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-232;STRANDS=+-;STRANDS2=18,18,18,18;RE=36;REF_strand=35,37;Strandbias_pval=1;AF=0.5 GT:DR:DV 0/1:36:36 +chr1 1981556 88 N CACGCAGGACACACAGCCGCGACGCACACCGGCACGCAGGACACCCAGCCACGGTCACACGCGGGGCACGCAGGACACCCAGCCGCGGTCACATGC . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1981581;STD_quant_start=21.5465;STD_quant_stop=28.327;Kurtosis_quant_start=-1.84901;Kurtosis_quant_stop=-1.66641;SVTYPE=INS;SUPTYPE=AL;SVLEN=34;STRANDS=+-;STRANDS2=10,14,10,14;RE=24;REF_strand=34,40;Strandbias_pval=0.814965;AF=0.324324 GT:DR:DV 0/1:50:24 +chr1 1982045 89 N CGGGGACACGCAGGACACCCAGGACACCCAGCCGCGGACAGACACGGGGGCACACAGGACACCCAGCTCGTGGACAGACA . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1982046;STD_quant_start=5.75698;STD_quant_stop=3.31663;Kurtosis_quant_start=-0.365062;Kurtosis_quant_stop=-0.55588;SVTYPE=INS;SUPTYPE=AL;SVLEN=79;STRANDS=+-;STRANDS2=15,14,15,14;RE=29;REF_strand=36,42;Strandbias_pval=0.666552;AF=0.371795 GT:DR:DV 0/1:49:29 +chr1 1982220 90 N AGATAGACACGGGACACGGACACCCCAGCCGTGACAGACACGGTGACAACACAGACACCCAGCCATGGACAGACACGGGCCACGAGGACACCCAGCCACGGACAGGGACATCGATGGCTTTATGACACTCCAGCCGGTAA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1982220;STD_quant_start=30.0322;STD_quant_stop=21.4957;Kurtosis_quant_start=-1.80704;Kurtosis_quant_stop=-1.88498;SVTYPE=INS;SUPTYPE=AL;SVLEN=206;STRANDS=+-;STRANDS2=12,19,12,19;RE=31;REF_strand=34,46;Strandbias_pval=0.830787;AF=0.3875 GT:DR:DV 0/1:49:31 +chr1 1993705 91 N GGGCACAGTGGCTCATGCCTGTAATCCCAGCAACATGGGAGCCTGAGGTGGGAGGCTCTCTTGACAGGAGTTTGAGACCAGCCTGGGCAACATAGCAGACCCCCCACCCCGCCATTTCTAGGAAAAAAAAAAAAAAAGTGGCC . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1993712;STD_quant_start=0;STD_quant_stop=2.51396;Kurtosis_quant_start=11.9398;Kurtosis_quant_stop=0.982105;SVTYPE=INS;SUPTYPE=AL;SVLEN=141;STRANDS=+-;STRANDS2=23,27,23,27;RE=50;REF_strand=48,58;Strandbias_pval=1;AF=0.471698 GT:DR:DV 0/1:56:50 +chr1 2019222 92 N GGGGCGGGGGAGGAGAGGGGGGAGGGAGGGGGACCGGGTAGGGTGGGGGGGGGAGGGGAACGGGGAGGGGGCAGGCAGGCGCGGGGTGGGGGGAGGGGAGGGGGAGGGGAGAAGACGGGCAGCGGGAGGGGCGGGGGGAGGGGATGGGGGCGGGGGAGGAGGGCGGCGGGGGAGGGGATGGGGGCGGGGGAGGGGATGGGCGGGGGGAGGGGGA . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2019227;STD_quant_start=3.28633;STD_quant_stop=4.04969;Kurtosis_quant_start=-0.713007;Kurtosis_quant_stop=0.942016;SVTYPE=INS;SUPTYPE=AL;SVLEN=211;STRANDS=+-;STRANDS2=23,17,23,17;RE=40;REF_strand=46,34;Strandbias_pval=1;AF=0.5 GT:DR:DV 0/1:40:40 +chr1 2106812 93 N CCCTCTGGTGGGCGTAGGACCTGTCACCGTGTCACCAGGCCAGGTAACTCTCAGCAGG . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2106813;STD_quant_start=19.0342;STD_quant_stop=19.9875;Kurtosis_quant_start=-1.45192;Kurtosis_quant_stop=-1.30053;SVTYPE=INS;SUPTYPE=AL;SVLEN=55;STRANDS=+-;STRANDS2=3,12,3,12;RE=15;REF_strand=40,54;Strandbias_pval=0.153747;AF=0.159574 GT:DR:DV 0/0:79:15 +chr1 2110063 94 CCCAACCAAGAGGATCCCAGAGGTGAGACACAGAACGGCCAGGGCTGAATCCGGGGCCCTCCCTGGGGGCAGCCAAGGACCTAAAACCAATGGG N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2110160;STD_quant_start=19.0866;STD_quant_stop=18.9882;Kurtosis_quant_start=-1.18991;Kurtosis_quant_stop=-1.28399;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-97;STRANDS=+-;STRANDS2=20,21,20,21;RE=41;REF_strand=38,45;Strandbias_pval=0.84876;AF=0.493976 GT:DR:DV 0/1:42:41 +chr1 2121520 95 N GGTCATGAGGTGGTAGTTAAGTTATGGTAGTTAG . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2121520;STD_quant_start=0;STD_quant_stop=0.8044;Kurtosis_quant_start=2.24316;Kurtosis_quant_stop=2.47427;SVTYPE=INS;SUPTYPE=AL;SVLEN=33;STRANDS=+-;STRANDS2=21,15,21,15;RE=35;REF_strand=60,50;Strandbias_pval=0.704801;AF=0.318182 GT:DR:DV 0/1:75:35 +chr1 2122244 96 N GTTAGGGTCACGGCGGTGGTTAGGTCGTGGTGGGAGTTAGGGTCACGGTGGTAGTTAGGGTCATGGTGGTAGTTAGGATCATGGCTGTAGTTAGCGTCATGGTGGTAGTTAGGGTCACGGCTATAGTTGGGGTCATGGTGGTAGTTAGGGTCATGGTGGTAGTTATTTAGGGTCACGGCTGTAGTTAGCGTCATGGTGGTGGTTAGGTCATGGTGGTAGTTAGGGGTCACGGCTGTAGTTAGGGTCATGGTGGTGGTTAGGTCACTTGCTGTAGTTAGGGTCATGGTGGTAGTTAGGTCATGGTGGTAGTTAGCGTCATGGTGGTGGTTAGGTCATGGTAGTTAGGGTCACTGCCA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2122256;STD_quant_start=72.1976;STD_quant_stop=164.724;Kurtosis_quant_start=0.262966;Kurtosis_quant_stop=-1.09184;SVTYPE=INS;SUPTYPE=AL;SVLEN=340;STRANDS=+-;STRANDS2=0,6,0,6;RE=6;REF_strand=56,48;Strandbias_pval=0.0120583;AF=0.0576923 GT:DR:DV 0/0:98:6 +chr1 2123322 97 N TAATTGGGATCATGACCATGTGATTGGGGTCATGGTGTTAGTTAAGGTCATGACTGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2123322;STD_quant_start=21.4499;STD_quant_stop=25.9846;Kurtosis_quant_start=-0.532609;Kurtosis_quant_stop=-1.17357;SVTYPE=INS;SUPTYPE=AL;SVLEN=88;STRANDS=+-;STRANDS2=9,11,9,11;RE=20;REF_strand=54,52;Strandbias_pval=0.80797;AF=0.188679 GT:DR:DV 0/0:86:20 +chr1 2123768 98 N GGCTGTGGTTAGGGTCATGGTGGTAGTTAGGATCATGGCTGTAGTTAGGTCATGGTGGTAGGTCTGGTCACGGCTAGTTGGGGTCATGGTGGTAGTTAGATCATGGCTGTAGTTAGGGTCAT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2123768;STD_quant_start=41.4871;STD_quant_stop=39.3296;Kurtosis_quant_start=-1.74964;Kurtosis_quant_stop=-0.19162;SVTYPE=INS;SUPTYPE=AL;SVLEN=112;STRANDS=+-;STRANDS2=11,11,11,11;RE=22;REF_strand=52,50;Strandbias_pval=1;AF=0.215686 GT:DR:DV 0/0:80:22 +chr1 2124290 100 N GGGTCATGGTGGTAGTTAGGATCATGGCTGTAGTTGGGGTCATGGTGGTAGTTAGGGTCACGGCTATAGTTAGGGTCATGGTGGTAGTTATTGGTCTGTGATAGTTAGCATCATGGTGGTAGTTAGGGTCATGGTGGTAGTTAGGGTCATGGTGGTAGTTAGGGTCATGGTGGTAGTTGGGGTCATAGCTGTAGTTAGGGTCATAGTGGTAGTTGGGGTCACGGCTATAGTTG . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2124353;STD_quant_start=62.996;STD_quant_stop=56.9131;Kurtosis_quant_start=-1.3131;Kurtosis_quant_stop=-0.191076;SVTYPE=INS;SUPTYPE=AL;SVLEN=110;STRANDS=+-;STRANDS2=6,7,6,7;RE=13;REF_strand=55,51;Strandbias_pval=0.773897;AF=0.122642 GT:DR:DV 0/0:93:13 +chr1 2124333 99 N AAGGGTCATGGTGGTAATTAGGATCATGTAGCTGTAGTTAGGGTCATGGTGGTAGTTAGGGTCTGGCTATAGTTGGGGTCATGGTGGTAGTTAGGGTCACAGCGATAGTTAGCATCATGGTGGTAGTTAGGGTCATGGTGGTAGATTGGGGTCATGGTGGTAGTTAGGGTCATGGTGGTAGTTAGGGTCATAGCTGTAGTTAGGGTCTGTGGTGGTAGTTGGGGTCCGCGGCTATAGTTGGGGTCCATGGTGGTAGTTAAGGTCACGGCTGTGATTAGCGTCATGGTGGTACGTT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2124347;STD_quant_start=28.9361;STD_quant_stop=52.7655;Kurtosis_quant_start=0.874818;Kurtosis_quant_stop=0.127135;SVTYPE=INS;SUPTYPE=AL;SVLEN=295;STRANDS=+-;STRANDS2=10,9,10,9;RE=19;REF_strand=54,52;Strandbias_pval=1;AF=0.179245 GT:DR:DV 0/0:87:19 +chr1 2142340 101 CTTTCAATCCAGGGTCCACACATCCAGCAGCCGAAGCGCCCTCCTTTCAATCCAGGGTCCAGGCATCTAGCAGCCGAAGCGCCT N . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2142424;STD_quant_start=9.44235;STD_quant_stop=8.28442;Kurtosis_quant_start=7.28739;Kurtosis_quant_stop=4.86915;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-84;STRANDS=+-;STRANDS2=23,15,23,15;RE=38;REF_strand=48,32;Strandbias_pval=1;AF=0.475 GT:DR:DV 0/1:42:38 +chr1 2280758 102 N GCCTCGGGAGAGTGACAGGCGGCGGCGGCGACACCAGAGAGCGGACGAGAGGACAGGCGGCGGCGGCGATCTTTCAGAGAGCGGGATTTTCCCGAGAGGGACAGAGAAGGCGGCGGAGATTGTCTTCAGAGAGAGGAT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2280758;STD_quant_start=32.8507;STD_quant_stop=82.7345;Kurtosis_quant_start=-1.7272;Kurtosis_quant_stop=-1.34469;SVTYPE=INS;SUPTYPE=AL;SVLEN=205;STRANDS=+-;STRANDS2=2,4,2,4;RE=6;REF_strand=32,54;Strandbias_pval=1;AF=0.0697674 GT:DR:DV 0/0:80:6 +chr1 2280945 103 N CGGTGCGGAGAGATCTTCAGAGAGAGGACGCCTGAGAAGACAGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2281014;STD_quant_start=60.2633;STD_quant_stop=47.2864;Kurtosis_quant_start=-0.84573;Kurtosis_quant_stop=-0.496898;SVTYPE=INS;SUPTYPE=AL;SVLEN=51;STRANDS=+-;STRANDS2=1,5,1,5;RE=6;REF_strand=34,54;Strandbias_pval=0.40609;AF=0.0681818 GT:DR:DV 0/0:82:6 +chr1 2281986 104 GAGAGGACGCCCGAGAAGACAGGCGGTGGCGGAGATCTTCAG N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2282028;STD_quant_start=21.4103;STD_quant_stop=21.0879;Kurtosis_quant_start=-1.23914;Kurtosis_quant_stop=-0.042349;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-42;STRANDS=+-;STRANDS2=9,11,9,11;RE=20;REF_strand=35,54;Strandbias_pval=0.801434;AF=0.224719 GT:DR:DV 0/0:69:20 diff --git a/tests/end_to_end/test_convert.py b/tests/end_to_end/test_convert.py index 95b45309..e0b29e8e 100644 --- a/tests/end_to_end/test_convert.py +++ b/tests/end_to_end/test_convert.py @@ -108,6 +108,20 @@ def test_vcf(self): print(record, record.data) assert record.data['CLNSIG'] == 'Pathogenic' + def test_sniffle(self): + results = self.run_main(get_data('sniffles.vcf'), SUPPORTED_TOOL.VCF, False) + print(results.keys()) + record = results['vcf-35777'][0] + print(record, record.data) + assert record.data['event_type'] == 'translocation' + + def test_cuteSV(self): + results = self.run_main(get_data('cuteSV.vcf'), SUPPORTED_TOOL.VCF, False) + print(results.keys()) + record = results['vcf-cuteSV.BND.0'][0] + print(record, record.data) + assert record.data['event_type'] == 'inverted translocation' + def test_breakseq2(self): self.run_main(get_data('breakseq.vcf'), SUPPORTED_TOOL.BREAKSEQ, False) From 78a7677c9f5d6578c7173df79a5c1bba961be697 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 11 Jan 2022 01:04:51 -0800 Subject: [PATCH 070/137] added docker support for lr dependencies and edited documentation --- .github/CONTRIBUTING.md | 9 +++++++-- Dockerfile | 14 ++++++++++++++ docs/configuration/general.md | 2 +- docs/tutorials/mini.md | 2 +- 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index d38244d4..e895f9f4 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -43,8 +43,13 @@ markdown_refdocs mavis -o docs/package --link mkdocs build ``` -The contents of the user manual can then be viewed by opening the build-docs/index.html -in any available web browser (i.e. 
google-chrome, firefox, etc.) +The contents of the user manual can then be viewed by opening the build-docs/index.html in any available web browser +(i.e. google-chrome, firefox, etc.). Future development to build the Markdown files into HTML and start a development +server to browse the documentation can be done using: + +```bash +mkdocs serve +``` ## Deploy to PyPi diff --git a/Dockerfile b/Dockerfile index b62ea761..2d0dae78 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,6 +17,13 @@ RUN git clone https://github.com/lh3/bwa.git && \ cd .. && \ mv bwa/bwa /usr/local/bin +# install minimap2 +RUN git clone https://github.com/lh3/minimap2.git && \ + cd minimap2 && \ + git checkout v2.24 && \ + make && \ + cd .. && \ + mv minimap2/minimap2.1 /usr/local/bin # install blat dependencies RUN apt-get install -y libcurl4 @@ -26,6 +33,13 @@ RUN wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat && \ chmod a+x blat && \ mv blat /usr/local/bin +# install wtdbg2 +RUN git clone https://github.com/ruanjue/wtdbg2.git && \ + cd wtdbg2 && \ + make && \ + cd .. && \ + mv wtdbg2/wtdbg2 /usr/local/bin + COPY setup.py setup.py COPY setup.cfg setup.cfg COPY MANIFEST.in MANIFEST.in diff --git a/docs/configuration/general.md b/docs/configuration/general.md index e22f5339..176aef2e 100644 --- a/docs/configuration/general.md +++ b/docs/configuration/general.md @@ -8,7 +8,7 @@ The pipeline can be run in steps or it can be configured using a JSON configuration file and setup in a single step. Scripts will be generated to run all steps following clustering. -The config schema is found in the mavis package under `mavis/schemas/config.json` +The config schema is found in the mavis package under `src/mavis/schemas/config.json` Top level settings follow the pattern `
.`. The convert and library sections are nested objects. diff --git a/docs/tutorials/mini.md b/docs/tutorials/mini.md index 27b5f51f..d4475985 100644 --- a/docs/tutorials/mini.md +++ b/docs/tutorials/mini.md @@ -3,7 +3,7 @@ This tutorial is based on the data included in the tests folder of MAVIS. The data files are very small and this tutorial is really only intended for testing a MAVIS install. The data here is simulated and -results are not representitive of the typical events you would see +results are not representative of the typical events you would see reported from MAVIS. For a more complete tutorial with actual fusion gene examples, please see the [full tutorial](../../tutorials/full/). From 0ab2e8f9c55e5104041a9d15fc9596a49ac4fe30 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 11 Jan 2022 14:25:33 -0800 Subject: [PATCH 071/137] revert contributing message --- .github/CONTRIBUTING.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index e895f9f4..a7f71e24 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -43,14 +43,6 @@ markdown_refdocs mavis -o docs/package --link mkdocs build ``` -The contents of the user manual can then be viewed by opening the build-docs/index.html in any available web browser -(i.e. google-chrome, firefox, etc.). 
Future development to build the Markdown files into HTML and start a development -server to browse the documentation can be done using: - -```bash -mkdocs serve -``` - ## Deploy to PyPi Install deployment dependencies From 14d4a911457f5fd181065ae0c5a921621c8d8b09 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 11 Jan 2022 15:30:35 -0800 Subject: [PATCH 072/137] revert contribution.md file --- .github/CONTRIBUTING.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index a7f71e24..d38244d4 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -43,6 +43,9 @@ markdown_refdocs mavis -o docs/package --link mkdocs build ``` +The contents of the user manual can then be viewed by opening the build-docs/index.html +in any available web browser (i.e. google-chrome, firefox, etc.) + ## Deploy to PyPi Install deployment dependencies From 7e07e2c3e99f4e6816ac8c7765d49c2d5757d9ee Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Thu, 20 Jan 2022 14:35:34 -0800 Subject: [PATCH 073/137] added handling of uncertain calls by sniffle --- src/mavis/tools/vcf.py | 212 ++++++++++++++++++++--------------- tests/data/sniffles.vcf | 242 +++++++++++++++++++--------------------- 2 files changed, 237 insertions(+), 217 deletions(-) diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index eea0fadf..ae1410e5 100644 --- a/src/mavis/tools/vcf.py +++ b/src/mavis/tools/vcf.py @@ -2,6 +2,7 @@ import re from dataclasses import dataclass from typing import Dict, List, Optional, Tuple +from copy import deepcopy import pandas as pd @@ -16,19 +17,19 @@ from .constants import SUPPORTED_TOOL PANDAS_DEFAULT_NA_VALUES = [ - '-1.#IND', - '1.#QNAN', - '1.#IND', - '-1.#QNAN', - '#N/A', - 'N/A', - 'NA', - '#NA', - 'NULL', - 'NaN', - '-NaN', - 'nan', - '-nan', + "-1.#IND", + "1.#QNAN", + "1.#IND", + "-1.#QNAN", + "#N/A", + "N/A", + "NA", + "#NA", + "NULL", + "NaN", + "-NaN", + "nan", + "-nan", ] @@ -53,7 +54,7 @@ class 
VcfRecordType: @property def stop(self) -> Optional[int]: - return self.info.get('END', self.pos) + return self.info.get("END", self.pos) def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: @@ -75,51 +76,51 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: | ru]p] | LL | """ # ru[p[ - match = re.match(r'^(?P\w)(?P\w*)\[(?P[^:]+):(?P\d+)\[$', alt) + match = re.match(r"^(?P\w)(?P\w*)\[(?P[^:]+):(?P\d+)\[$", alt) if match: return ( - match.group('chr'), - int(match.group('pos')), + match.group("chr"), + int(match.group("pos")), ORIENT.LEFT, ORIENT.RIGHT, - match.group('ref'), - match.group('useq'), + match.group("ref"), + match.group("useq"), ) # [p[ur - match = re.match(r'^\[(?P[^:]+):(?P\d+)\[(?P\w*)(?P\w)$', alt) + match = re.match(r"^\[(?P[^:]+):(?P\d+)\[(?P\w*)(?P\w)$", alt) if match: return ( - match.group('chr'), - int(match.group('pos')), + match.group("chr"), + int(match.group("pos")), ORIENT.RIGHT, ORIENT.RIGHT, - match.group('ref'), - match.group('useq'), + match.group("ref"), + match.group("useq"), ) # ]p]ur - match = re.match(r'^\](?P[^:]+):(?P\d+)\](?P\w*)(?P\w)$', alt) + match = re.match(r"^\](?P[^:]+):(?P\d+)\](?P\w*)(?P\w)$", alt) if match: return ( - match.group('chr'), - int(match.group('pos')), + match.group("chr"), + int(match.group("pos")), ORIENT.RIGHT, ORIENT.LEFT, - match.group('ref'), - match.group('useq'), + match.group("ref"), + match.group("useq"), ) # ru]p] - match = re.match(r'^(?P\w)(?P\w*)\](?P[^:]+):(?P\d+)\]$', alt) + match = re.match(r"^(?P\w)(?P\w*)\](?P[^:]+):(?P\d+)\]$", alt) if match: return ( - match.group('chr'), - int(match.group('pos')), + match.group("chr"), + int(match.group("pos")), ORIENT.LEFT, ORIENT.LEFT, - match.group('ref'), - match.group('useq'), + match.group("ref"), + match.group("useq"), ) else: - raise NotImplementedError('alt specification in unexpected format', alt) + raise NotImplementedError("alt specification in unexpected format", alt) def convert_record(record, 
record_mapping={}, log=DEVNULL) -> List[Dict]: @@ -143,7 +144,7 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: try: value = record.info[key] except UnicodeDecodeError as err: - log('Ignoring invalid INFO field {} with error: {}'.format(key, err)) + log("Ignoring invalid INFO field {} with error: {}".format(key, err)) else: try: value = value[0] if len(value) == 1 else value @@ -152,27 +153,27 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: info[key] = value std_row = {} - if record.id and record.id != 'N': # to account for NovoBreak N in the ID field - std_row['id'] = record.id + if record.id and record.id != "N": # to account for NovoBreak N in the ID field + std_row["id"] = record.id - if info.get('SVTYPE') == 'BND': + if info.get("SVTYPE") == "BND": chr2, end, orient1, orient2, ref, alt = parse_bnd_alt(alt) std_row[COLUMNS.break1_orientation] = orient1 std_row[COLUMNS.break2_orientation] = orient2 std_row[COLUMNS.untemplated_seq] = alt if record.ref != ref: raise AssertionError( - 'Expected the ref specification in the vcf record to match the sequence ' - 'in the alt string: {} vs {}'.format(record.ref, ref) + "Expected the ref specification in the vcf record to match the sequence " + "in the alt string: {} vs {}".format(record.ref, ref) ) else: - chr2 = info.get('CHR2', record.chrom) + chr2 = info.get("CHR2", record.chrom) end = record.stop if ( alt and record.ref - and re.match(r'^[A-Z]+$', alt) - and re.match(r'^[A-Z]+', record.ref) + and re.match(r"^[A-Z]+$", alt) + and re.match(r"^[A-Z]+", record.ref) ): std_row[COLUMNS.untemplated_seq] = alt[1:] size = len(alt) - len(record.ref) @@ -180,9 +181,11 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: std_row[COLUMNS.event_type] = SVTYPE.INS elif size < 0: std_row[COLUMNS.event_type] = SVTYPE.DEL - std_row.update({COLUMNS.break1_chromosome: record.chrom, COLUMNS.break2_chromosome: chr2}) + std_row.update( + 
{COLUMNS.break1_chromosome: record.chrom, COLUMNS.break2_chromosome: chr2} + ) if info.get( - 'PRECISE', False + "PRECISE", False ): # DELLY CI only apply when split reads were not used to refine the breakpoint which is then flagged std_row.update( { @@ -196,65 +199,100 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: std_row.update( { COLUMNS.break1_position_start: max( - 1, record.pos + info.get('CIPOS', (0, 0))[0] + 1, record.pos + info.get("CIPOS", (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + + info.get("CIPOS", (0, 0))[1], + COLUMNS.break2_position_start: max( + 1, end + info.get("CIEND", (0, 0))[0] ), - COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max(1, end + info.get('CIEND', (0, 0))[0]), - COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1], + COLUMNS.break2_position_end: end + info.get("CIEND", (0, 0))[1], } ) - if 'SVTYPE' in info: - std_row[COLUMNS.event_type] = info['SVTYPE'] + std_row2 = {} + + if "SVTYPE" in info: + if info["SVTYPE"] in dir(SVTYPE): + std_row[COLUMNS.event_type] = info["SVTYPE"] + elif "/" in info["SVTYPE"]: + std_row2 = deepcopy(std_row) + std_row[COLUMNS.event_type] = info["SVTYPE"].split("/")[0] + std_row2[COLUMNS.event_type] = info["SVTYPE"].split("/")[1] try: - orient1, orient2 = info['CT'].split('to') - connection_type = {'3': ORIENT.LEFT, '5': ORIENT.RIGHT, 'N': ORIENT.NS} + orient1, orient2 = info["CT"].split("to") + connection_type = {"3": ORIENT.LEFT, "5": ORIENT.RIGHT, "N": ORIENT.NS} std_row[COLUMNS.break1_orientation] = connection_type[orient1] std_row[COLUMNS.break2_orientation] = connection_type[orient2] + if bool(std_row2): + std_row2[COLUMNS.break1_orientation] = connection_type[orient1] + std_row2[COLUMNS.break2_orientation] = connection_type[orient2] except KeyError: pass - std_row.update( - {k: v for k, v in info.items() if k not in {'CHR2', 'SVTYPE', 'CIPOS', 'CIEND', 'CT'}} - ) - 
records.append(std_row) + if bool(std_row2): + std_row2.update( + { + k: v + for k, v in info.items() + if k not in {"CHR2", "SVTYPE", "CIPOS", "CIEND", "CT"} + } + ) + std_row.update( + { + k: v + for k, v in info.items() + if k not in {"CHR2", "SVTYPE", "CIPOS", "CIEND", "CT"} + } + ) + records.append(std_row) + records.append(std_row2) + else: + std_row.update( + { + k: v + for k, v in info.items() + if k not in {"CHR2", "SVTYPE", "CIPOS", "CIEND", "CT"} + } + ) + records.append(std_row) return records def convert_pandas_rows_to_variants(df): def parse_info(info_field): info = {} - for pair in info_field.split(';'): - if '=' in pair: - key, value = pair.split('=', 1) + for pair in info_field.split(";"): + if "=" in pair: + key, value = pair.split("=", 1) info[key] = value else: info[pair] = True # convert info types for key in info: - if key in {'CIPOS', 'CIEND'}: - ci_start, ci_end = info[key].split(',') + if key in {"CIPOS", "CIEND"}: + ci_start, ci_end = info[key].split(",") info[key] = (int(ci_start), int(ci_end)) - elif key == 'END': + elif key == "END": info[key] = int(info[key]) return info - df['info'] = df['INFO'].apply(parse_info) - df['alts'] = df['ALT'].apply(lambda a: a.split(',')) + df["info"] = df["INFO"].apply(parse_info) + df["alts"] = df["ALT"].apply(lambda a: a.split(",")) rows = [] for _, row in df.iterrows(): rows.append( VcfRecordType( - id=row['ID'], - pos=row['POS'], - info=VcfInfoType(row['info']), - chrom=row['CHROM'], - ref=row['REF'], - alts=row['alts'], + id=row["ID"], + pos=row["POS"], + info=VcfInfoType(row["info"]), + chrom=row["CHROM"], + ref=row["REF"], + alts=row["alts"], ) ) return rows @@ -266,33 +304,33 @@ def pandas_vcf(input_file) -> Tuple[List[str], pd.DataFrame]: """ # read the comment/header information header_lines = [] - with open(input_file, 'r') as fh: - line = '##' - while line.startswith('##'): + with open(input_file, "r") as fh: + line = "##" + while line.startswith("##"): header_lines.append(line) line = 
fh.readline().strip() header_lines = header_lines[1:] # read the data df = pd.read_csv( input_file, - sep='\t', + sep="\t", skiprows=len(header_lines), dtype={ - 'CHROM': str, - 'POS': int, - 'ID': str, - 'INFO': str, - 'FORMAT': str, - 'REF': str, - 'ALT': str, + "CHROM": str, + "POS": int, + "ID": str, + "INFO": str, + "FORMAT": str, + "REF": str, + "ALT": str, }, - na_values=PANDAS_DEFAULT_NA_VALUES + ['.'], + na_values=PANDAS_DEFAULT_NA_VALUES + ["."], ) - df = df.rename(columns={df.columns[0]: df.columns[0].replace('#', '')}) - required_columns = ['CHROM', 'INFO', 'POS', 'REF', 'ALT', 'ID'] + df = df.rename(columns={df.columns[0]: df.columns[0].replace("#", "")}) + required_columns = ["CHROM", "INFO", "POS", "REF", "ALT", "ID"] for col in required_columns: if col not in df.columns: - raise KeyError(f'Missing required column: {col}') + raise KeyError(f"Missing required column: {col}") # convert the format fields using the header return header_lines, df diff --git a/tests/data/sniffles.vcf b/tests/data/sniffles.vcf index 700df87a..dd631018 100644 --- a/tests/data/sniffles.vcf +++ b/tests/data/sniffles.vcf @@ -1,7 +1,7 @@ ##fileformat=VCFv4.1 ##FILTER= ##source=Sniffles -##fileDate=20210607:16:05 PMef_minus +##fileDate=20201105 ##contig= ##contig= ##contig= @@ -197,159 +197,141 @@ ##contig= ##contig= ##contig= -##contig= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= -##FILTER= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= +##INFO= ##INFO= +##INFO= ##INFO= ##INFO= +##INFO= ##INFO= -##INFO= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= -##INFO= +##INFO= ##INFO= ##INFO= ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= -##contig= ##bcftools_viewVersion=1.11+htslib-1.11 -##bcftools_viewCommand=view --regions chr1 F24721_merged_sorted.bam_5_read_sorted.vcf.gz; Date=Tue Jan 4 22:45:21 2022 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT /projects/jfan_prj/jfan_prj/Nanopore_Testing/2021_nanopore_sv_testing/scratch/depth_testing/POG/COLO829/minimap2_bam/F24721_merged_sorted.bam 
-chr1 10006 35777 N ]chr3:198172735]N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr3;END=198172735;STD_quant_start=32.4628;STD_quant_stop=44.8237;Kurtosis_quant_start=2.29519;Kurtosis_quant_stop=-0.995353;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=--;STRANDS2=0,6,6,0;RE=6;REF_strand=72,102;Strandbias_pval=0.0824618;AF=0.0344828 GT:DR:DV 0/0:168:6 -chr1 10030 36832 N ]chr17:41490827]N . STRANDBIAS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr17;END=41490827;STD_quant_start=48.6107;STD_quant_stop=4.67516;Kurtosis_quant_start=0.545103;Kurtosis_quant_stop=1.53121;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=--;STRANDS2=0,7,7,0;RE=7;REF_strand=24,9;Strandbias_pval=0.000613617;AF=0.212121 GT:DR:DV 0/0:26:7 -chr1 10312 35780 N ]chrX:449436]N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chrX;END=449436;STD_quant_start=117.156;STD_quant_stop=68.302;Kurtosis_quant_start=-1.29786;Kurtosis_quant_stop=-0.029231;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=-+;STRANDS2=0,6,0,6;RE=6;REF_strand=45,128;Strandbias_pval=0.33926;AF=0.0346821 GT:DR:DV 0/0:167:6 -chr1 10466 35781 N N[chrX:156030800[ . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chrX;END=156030800;STD_quant_start=81.1924;STD_quant_stop=134.17;Kurtosis_quant_start=1.34083;Kurtosis_quant_stop=1.99911;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=++;STRANDS2=5,0,0,5;RE=5;REF_strand=7,8;Strandbias_pval=0.0546956;AF=0.333333 GT:DR:DV 0/1:10:5 -chr1 10467 35779 N N[chr3:10002[ . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr3;END=10002;STD_quant_start=106.244;STD_quant_stop=161.729;Kurtosis_quant_start=0.552508;Kurtosis_quant_stop=2.99076;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=+-;STRANDS2=6,0,6,0;RE=6;REF_strand=4,5;Strandbias_pval=0.043956;AF=0.666667 GT:DR:DV 0/1:3:6 -chr1 10467 35782 N N[chr3:198174376[ . 
STRANDBIAS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr3;END=198174376;STD_quant_start=17.5865;STD_quant_stop=297.518;Kurtosis_quant_start=0.324147;Kurtosis_quant_stop=0.886959;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=++;STRANDS2=7,0,0,7;RE=7;REF_strand=57,112;Strandbias_pval=0.000675389;AF=0.0414201 GT:DR:DV 0/0:162:7 -chr1 10468 35778 N N[chr17:41490879[ . STRANDBIAS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr17;END=41490879;STD_quant_start=41.208;STD_quant_stop=1.92354;Kurtosis_quant_start=3.02235;Kurtosis_quant_stop=0.961601;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=++;STRANDS2=11,0,0,11;RE=11;REF_strand=4,6;Strandbias_pval=0.00386997;AF=1 GT:DR:DV 1/1:0:11 -chr1 35143 35783 N N[chr20:60001[ . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr20;END=60001;STD_quant_start=0;STD_quant_stop=0.632456;Kurtosis_quant_start=nan;Kurtosis_quant_stop=-0.5;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=+-;STRANDS2=3,2,3,2;RE=5;REF_strand=0,3;Strandbias_pval=0.196429;AF=1 GT:DR:DV 1/1:0:5 -chr1 136637 0 N GTGTCGGCTGACCCTCTGTCCGCGTGGAGGCCGGTGGGGTGTGGAGGC . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=136638;STD_quant_start=20.4524;STD_quant_stop=21.8541;Kurtosis_quant_start=-1.28738;Kurtosis_quant_stop=-1.27278;SVTYPE=INS;SUPTYPE=AL;SVLEN=46;STRANDS=+-;STRANDS2=10,8,10,8;RE=18;REF_strand=20,20;Strandbias_pval=0.780391;AF=0.45 GT:DR:DV 0/1:22:18 -chr1 136956 1 N TGACCTCTCTCAGTGTGGGAGGGGGCCGGTGTGAGGCAAGGGGCTCACGCGCGGCCTCTGTCCGCGTGGGAGGGGCCGGTGTGAGACAAGGGGCTCAGGCTGACCTCTCAGCGTGGGAGGGGCCGGTGTGAGGCAAAGGGCTCGGGCTGACCTCTCTCAGCGTGGGAGGGCCAGTGTGAGGCAGGGCTCACATGACCTCTCAGCATGGGAGGGGCCGGTGTGAGACAAGGGCTCGGG . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=136989;STD_quant_start=36.1248;STD_quant_stop=37.4166;Kurtosis_quant_start=0.21249;Kurtosis_quant_stop=1.96205;SVTYPE=INS;SUPTYPE=AL;SVLEN=195;STRANDS=+-;STRANDS2=6,8,6,8;RE=14;REF_strand=18,20;Strandbias_pval=1;AF=0.368421 GT:DR:DV 0/1:24:14 -chr1 180694 35784 N ]chrX:449444]N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chrX;END=449444;STD_quant_start=296.777;STD_quant_stop=10.3923;Kurtosis_quant_start=0.018679;Kurtosis_quant_stop=0;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=-+;STRANDS2=0,6,0,6;RE=6;REF_strand=26,30;Strandbias_pval=0.0354297;AF=0.107143 GT:DR:DV 0/0:50:6 -chr1 181262 2 N CCGGCAGGCGCAGAGAGGCGCGGGCCGGGGTCGGGCGCAGGCGCAGAGAGCGCGGCCGGCGCAGAGGCGCAGAGAGGGCGCAGCA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=181262;STD_quant_start=28.5441;STD_quant_stop=24.1125;Kurtosis_quant_start=2.54835;Kurtosis_quant_stop=1.686;SVTYPE=INS;SUPTYPE=AL;SVLEN=93;STRANDS=+-;STRANDS2=19,15,19,15;RE=34;REF_strand=56,46;Strandbias_pval=1;AF=0.333333 GT:DR:DV 0/1:68:34 -chr1 257667 35785 N ]chr5:181462060]N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr5;END=181462060;STD_quant_start=0.894427;STD_quant_stop=0.447214;Kurtosis_quant_start=2;Kurtosis_quant_stop=2;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=--;STRANDS2=0,5,5,0;RE=5;REF_strand=19,18;Strandbias_pval=0.0532252;AF=0.135135 GT:DR:DV 0/0:32:5 -chr1 350806 3 N ACTCACTGAAGGTGGAGGGAAAATGGTGACCTAAGTC . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=350807;STD_quant_start=1.22475;STD_quant_stop=2.82843;Kurtosis_quant_start=3;Kurtosis_quant_stop=-0.65625;SVTYPE=INS;SUPTYPE=AL;SVLEN=37;STRANDS=+-;STRANDS2=4,2,4,2;RE=6;REF_strand=10,4;Strandbias_pval=1;AF=0.428571 GT:DR:DV 0/1:8:6 -chr1 368936 4 CCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACACGGGTGCCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACGTGGGTGCCATCTCAGCAGCTCACGGTGTAGAAACTGCGACACTCCCATGGGTGCCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACACGGGTGCCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACACGGGTGCCATCTCAGCAGCTCACGGTGTAGAAACTGCGACACTCCCATGGGTGCCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACACGGGTGCCATCTCAGCAGCTCACGGTGTGGAAACTGCGACACTCACACGGGT N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=369307;STD_quant_start=139.183;STD_quant_stop=119.892;Kurtosis_quant_start=-0.086052;Kurtosis_quant_stop=-0.157727;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-371;STRANDS=+-;STRANDS2=7,5,7,5;RE=12;REF_strand=16,20;Strandbias_pval=0.511217;AF=0.333333 GT:DR:DV 0/1:24:12 -chr1 372679 5 CTTAGGGTCCATTCTGATCTGTATATATGTATAATATATATTATATATGGACCTCAGGGTCCATTCTGATCTGCATATATGTATAATATATATTATATATGGTCCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACATGAGTTCCTGCTGGCATATCTGTCTATAACCGACCA N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=372858;STD_quant_start=74.4439;STD_quant_stop=102.876;Kurtosis_quant_start=-0.974097;Kurtosis_quant_stop=1.36116;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-179;STRANDS=+-;STRANDS2=6,4,6,4;RE=10;REF_strand=14,12;Strandbias_pval=1;AF=0.384615 GT:DR:DV 0/1:16:10 -chr1 374100 6 N CCCCCTCTCCTTTCTCCTCTCCATCCCCCCTCTCCATCTCCTCTCCTTTCTCCTCTCTCGCCCCCTCTCCTTTCTCCCTCTCTATCCCCCTCTCCTTTCTCCCTCTCTCCCCCTCTCCTTTCTCCTCTCCATCCCCTCTCCATCCCCCTCTCCATCTCCTCTCCTTTCTCCTCTCTAGCCCCTCTCCTTTCTCTCTCCTCCCCCTCTCCTTTCTCCCTC . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=374100;STD_quant_start=57.1456;STD_quant_stop=79.9085;Kurtosis_quant_start=0.684282;Kurtosis_quant_stop=-0.413029;SVTYPE=INS;SUPTYPE=AL;SVLEN=227;STRANDS=+-;STRANDS2=4,4,4,4;RE=8;REF_strand=14,10;Strandbias_pval=0.703493;AF=0.333333 GT:DR:DV 0/1:16:8 -chr1 606600 7 GGTCAGAGCTGTCCTGGGTCAGAGCTGCCCAT N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=606632;STD_quant_start=2.98329;STD_quant_stop=2.54951;Kurtosis_quant_start=2.61341;Kurtosis_quant_stop=3.87685;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;STRANDS2=7,4,7,4;RE=11;REF_strand=28,28;Strandbias_pval=0.516721;AF=0.196429 GT:DR:DV 0/0:45:11 -chr1 609583 8 GTGGCCAGCAGGCGGCGCTGCAGGAGAGGAGATGCCCAGGCCTGGCGGCACACGCGGGTTC N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=609647;STD_quant_start=21.6956;STD_quant_stop=18.4174;Kurtosis_quant_start=-0.340189;Kurtosis_quant_stop=0.435423;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-64;STRANDS=+-;STRANDS2=9,4,9,4;RE=13;REF_strand=30,34;Strandbias_pval=0.223523;AF=0.203125 GT:DR:DV 0/0:51:13 -chr1 611309 9 TGTGGGTGTGACAGGGTGTGTTCTGTGTGAGAACATGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCTGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGATGGGGTGTGTGCTGTGTGAGAACGTGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCTGTGTGTGATGTTGTGTTCTTGGTGTGAGTTCATGGGTGTGACGGGGTGTGCTGTGTGAGAACGTGTGTGTAGTGTTCACATGTCCTCTGTGCGTGAGTCCCCGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGACGGGGTGTGTGCTGTGTGAGAACGTGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCCGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGACGGGGTGTGTGCTGTGTGAGAACGTGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCTGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGACGGGGTGTGTGCTGTGTGAGAACGTGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCCGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGACGGGGTGTGTGCTGTGTGAGAACGTGTGTGTAGTGTCCACATGTCCTCTGTGCGTGAGTCCCCGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCA N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=612033;STD_quant_start=78.5303;STD_quant_stop=59.8415;Kurtosis_quant_start=-0.363;Kurtosis_quant_stop=0.0992;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-724;STRANDS=+-;STRANDS2=7,6,7,6;RE=13;REF_strand=29,33;Strandbias_pval=0.763359;AF=0.209677 GT:DR:DV 0/0:49:13 -chr1 744867 10 N TATATATATATATATATATATATATATATATA . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=744867;STD_quant_start=1.34164;STD_quant_stop=4.07431;Kurtosis_quant_start=2;Kurtosis_quant_stop=-0.953985;SVTYPE=INS;SUPTYPE=AL;SVLEN=35;STRANDS=+-;STRANDS2=4,1,4,1;RE=5;REF_strand=24,16;Strandbias_pval=0.635332;AF=0.125 GT:DR:DV 0/0:35:5 -chr1 814584 11 N AAAAAAAGATGTGAAACCTATTTTCAGAATTAACATTTCCTTCCTAAATATCTAACACAACACTGAAGGAGAAAGTCCAGTCAATTTTATGTAGTT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=814585;STD_quant_start=17.8792;STD_quant_stop=15.8719;Kurtosis_quant_start=-1.85801;Kurtosis_quant_stop=-1.90018;SVTYPE=INS;SUPTYPE=AL;SVLEN=96;STRANDS=+-;STRANDS2=13,11,13,11;RE=24;REF_strand=40,44;Strandbias_pval=0.64659;AF=0.285714 GT:DR:DV 0/0:60:24 -chr1 820880 12 N TCTACACTACCTGCCTGGCCAGCAGATCCACCCTGTCTACACTACCTGCCTGGGCAGTAGTTCCACGCAATCTCCCTTACCTGCCTCTCCAGCAGACCCGCCCTATCTATACTACTTGCCTGTCCAGCAGATCCACTTCCCATTCACACGACCTGCCTGTCCAGCAGATCCACCCTGTCTACACTACCTTCCTGCTTGTCCAGCAGGTCCACCCTGTCTATACTACCTGCCTGGCCAGTAGATCCACACACTA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=820881;STD_quant_start=6.70075;STD_quant_stop=12.2963;Kurtosis_quant_start=5.29053;Kurtosis_quant_stop=5.17296;SVTYPE=INS;SUPTYPE=AL;SVLEN=245;STRANDS=+-;STRANDS2=11,9,11,9;RE=20;REF_strand=54,40;Strandbias_pval=1;AF=0.212766 GT:DR:DV 0/0:74:20 -chr1 820906 13 TCCACCCTGTCTACACTACCTGCTTGTCCAGCAGG N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=820941;STD_quant_start=2.56905;STD_quant_stop=2.70185;Kurtosis_quant_start=-1.38237;Kurtosis_quant_stop=-1.08812;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-35;STRANDS=+-;STRANDS2=11,9,11,9;RE=20;REF_strand=54,40;Strandbias_pval=1;AF=0.212766 GT:DR:DV 0/0:74:20 -chr1 822428 14 CCTGGCCAGCAGATCCACCCTGTCTATACTACCTG N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=822463;STD_quant_start=2.98329;STD_quant_stop=2.91548;Kurtosis_quant_start=-1.2983;Kurtosis_quant_stop=-1.31531;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-35;STRANDS=+-;STRANDS2=11,9,11,9;RE=20;REF_strand=54,42;Strandbias_pval=1;AF=0.208333 GT:DR:DV 0/0:76:20 -chr1 839479 15 ACACACACCTGGACAAACACACCTGGACACACACACCTAG N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=839519;STD_quant_start=11.7558;STD_quant_stop=10.8904;Kurtosis_quant_start=-1.43253;Kurtosis_quant_stop=-1.20587;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-40;STRANDS=+-;STRANDS2=8,7,8,7;RE=15;REF_strand=42,42;Strandbias_pval=1;AF=0.178571 GT:DR:DV 0/0:69:15 -chr1 853534 16 GCCGTGTGGTAAACTGATGAACCCCGACCCTGATGAACGTGAGATG N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=853581;STD_quant_start=21.0143;STD_quant_stop=21.4033;Kurtosis_quant_start=-1.50523;Kurtosis_quant_stop=-1.48919;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-47;STRANDS=+-;STRANDS2=7,8,7,8;RE=15;REF_strand=30,36;Strandbias_pval=1;AF=0.227273 GT:DR:DV 0/0:51:15 -chr1 866801 17 N CGCTCCTGGCCGTCTCCGAGCCCTCCACATGTCTCCTGCCTCATCCCTGACGTCCTCCCAGGCCCTCGTGGTCACTCCCCCTGCACTC . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=866839;STD_quant_start=45.3707;STD_quant_stop=37.0162;Kurtosis_quant_start=-2.21189;Kurtosis_quant_stop=-1.85872;SVTYPE=INS;SUPTYPE=AL;SVLEN=49;STRANDS=+-;STRANDS2=6,12,6,12;RE=18;REF_strand=32,44;Strandbias_pval=0.598063;AF=0.236842 GT:DR:DV 0/0:58:18 -chr1 872837 18 N GGGGAGGTTTCATTTGCTCCACCTGCAGCGAGTAAGTAGCCCATCTCAGGTTTGACTCCTGACTTAATTCCTAACAGGGGAAGCCAAGGTCCTGTGACCCTCCCGGGGGAGGGGTTTCATTTGTTCTACCTGCAGTGAGGTCTGTTAGCCCATCTCAGGTTTGACTCCTGACTCTAATTCTAACAGGAAGCTGTCCTGTAACTCTGGGGAGGGGGGGGTTTCATTTGCTCCACCTGCAGCGAGGTTAGCCCTCCATCTCAGGTTTGACTCCTGACTTAATTCCTAACAGGGGAAGCTGCTGTCCTGTGACTCTGGGAGAAGGGGTTTCATTTGCTCCACCTGCAGTGAGGTCTGCTAGCCCATCTCAGGTTTGACTCTGACTTAATTCCTAAACAGGGGAAGCTGCTGTCCTGTAACTT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=872837;STD_quant_start=87.716;STD_quant_stop=108.899;Kurtosis_quant_start=-1.21956;Kurtosis_quant_stop=-1.1229;SVTYPE=INS;SUPTYPE=AL;SVLEN=416;STRANDS=+-;STRANDS2=7,7,7,7;RE=14;REF_strand=28,30;Strandbias_pval=1;AF=0.241379 GT:DR:DV 0/0:44:14 -chr1 876112 19 N CCCCATACTCCTCCCCCATACTCCCCCATACCCCCCCACACTCCCCCCATACTCCTCCCCCATACTCCCCCATACTCCCCCACACTCCCCCATACTCCTCCCCCATACTCCCCTATACTCCCCACACTCCCCCCAAACTCCCCCCATACTCCTCCCCCATACTCCCCATACTCCCCCACACTCCCCCACACTCCCCCATACTCCCCCACACTGTTCCCCCCATACCTCCCCCATACTCCCCCACACTCCCCCACACTCCCCCACGCTCCTCCCCCACACCCTCCCACACTCCCCCACACTCCCCTACTGCCTTCCCCCACACTCCCCCACACTCCTCCCCATACTCCCCCACACTCCCTCATACTCCCCATACTACCCCAACCTCCCCCATACTCCCCCATACTCCCCACACACTCCCCCCACACTCCCCCCAAACTCCCCCATACTCCTCCCCCAGTACTCCCCCATACTCCCACACTCCCACACTCCCCCACACTCCCCCCATACTCCCCCACACTCCCCCACACTCACTCCACACTCCCCATACTCCCCAAATCTCCCCCATACTCCCACATTCCCCCACACTCCCCACACTCCCCCATACTCCCCCACACTCCCCACACTCACCCACACCCCCCCATACTCCCCAACCTCCCCCATACTCCCCACATTCCCCCATACTCCCCCATACTCCTCCCCCATACTCCCCCCATACTCCCCCACACTCCCCACACTCCCCCATACTCCCCCACACTCCCCATACTCCCCCTGCATCCTCCCCATACTCCCCACATTCCCCCATACTCCCCATACTCCCCACACTCCCCCACACTCCCCCATACTCCCCCTCACACTCCCCCCATACTCCCCAACCTCCCAAACTCCCCCACATTCCCCCATACTCCCCATACTCCCCCAAACTCCCCATACTCCTCCCCTCAATACTCCCCATACTCCCCCATACTGCCCAACCTCCCCATACCCCCCACACTCCCCCCATACTCCCCCCACACCCCCCCCATACTCCCCCACACTCCCCTGCAACTCCCCTTATACTCCTCCCCCATACTCCCCATACTCCCCCCACACTCCCCAAACTCCCCATACTCCTCCCCATACTCCCCATACTCCCCCACACTCCCCCATACTCCTCCCCCATACTCCCCATACCCCCACACTCCCCCATACTCTCCCCATACTCCCCATACTCCCCACACTCCCCCAAACTCCCCCATACTCCTCCCCCATACTCCCCATACTCCCCCACACTCCCCCACACTCCCCCATACTCCCCACACTCCCCCATACTCCCCCAACCTCCCCATACTCCCCCACATTCCCCTATTACTCCCCATACTCCCCAAACTCCCCACATTCCCCCATACTCCCCCATACTCCCCAAACTCCCCCATACTCCTCCCCCACACTCCCCATACTCCCCCATACTCGCCCAACCTCCCCATACTCCCCCACTCCCCCATACTCCCCCACAGTCCCCCACACTCCCCCACACACTCCCCAACCTCCCCCATACTCCCCATACTCGCCCACACTCGCCCACACCCCCCCATACTCCCCACACTCCCCCACACTCCCCCACACCCCCCATACTCCCCCATACTCCCCATACTCCCCCACACCCCCACACT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=876112;STD_quant_start=160.964;STD_quant_stop=281.694;Kurtosis_quant_start=-1.48637;Kurtosis_quant_stop=-0.804806;SVTYPE=INS;SUPTYPE=AL,SR;SVLEN=1649;STRANDS=+-;STRANDS2=7,6,7,6;RE=13;REF_strand=48,38;Strandbias_pval=1;AF=0.151163 GT:DR:DV 0/0:73:13 -chr1 876433 35786 N N[chr4:189980733[ . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr4;END=189980733;STD_quant_start=373.501;STD_quant_stop=193.312;Kurtosis_quant_start=-0.601023;Kurtosis_quant_stop=-0.696578;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=++;STRANDS2=4,3,3,4;RE=7;REF_strand=15,15;Strandbias_pval=1;AF=0.233333 GT:DR:DV 0/0:23:7 -chr1 878423 35787 N ]chr3:198124405]N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr3;END=198124405;STD_quant_start=0.632456;STD_quant_stop=0.316228;Kurtosis_quant_start=4.00716;Kurtosis_quant_stop=2.14525;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=-+;STRANDS2=12,5,12,5;RE=17;REF_strand=34,32;Strandbias_pval=0.182341;AF=0.257576 GT:DR:DV 0/0:49:17 -chr1 878423 36833 N ]chr3:198124405]N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr3;END=198124405;STD_quant_start=2.72029;STD_quant_stop=0.632456;Kurtosis_quant_start=1.9394;Kurtosis_quant_stop=6.69527;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=-+;STRANDS2=3,8,3,8;RE=11;REF_strand=34,32;Strandbias_pval=0.19555;AF=0.166667 GT:DR:DV 0/0:55:11 -chr1 882645 20 ATATATTAGCTATTCTAGACTTTATGCATTTATGTAAAGTTTTCTTTGTTGCACTTTAAGTTCTGTGATACATGGGCAGAGCATGC N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=882732;STD_quant_start=2.70801;STD_quant_stop=2.51661;Kurtosis_quant_start=0.409091;Kurtosis_quant_stop=3.73961;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-87;STRANDS=+-;STRANDS2=8,1,8,1;RE=9;REF_strand=77,90;Strandbias_pval=0.0153702;AF=0.0538922 GT:DR:DV 0/0:158:9 -chr1 883246 35788 N N[chr20:29351529[ . 
STRANDBIAS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr20;END=29351529;STD_quant_start=6.0208;STD_quant_stop=8.59506;Kurtosis_quant_start=4.94502;Kurtosis_quant_stop=2.53006;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=++;STRANDS2=8,0,0,8;RE=8;REF_strand=31,35;Strandbias_pval=0.00564375;AF=0.121212 GT:DR:DV 0/0:58:8 -chr1 883246 35789 N N[chr20:29789177[ . STRANDBIAS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr20;END=29789177;STD_quant_start=1.73205;STD_quant_stop=2.05481;Kurtosis_quant_start=4.74074;Kurtosis_quant_stop=5.09003;SVTYPE=BND;SUPTYPE=SR;SVLEN=1;STRANDS=++;STRANDS2=9,0,0,9;RE=9;REF_strand=30,34;Strandbias_pval=0.00272312;AF=0.140625 GT:DR:DV 0/0:55:9 -chr1 886250 21 N TGTGCTGGCCCTTTGGCAGAGCAGGTGTGGGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=886250;STD_quant_start=14.922;STD_quant_stop=15.3351;Kurtosis_quant_start=-0.359429;Kurtosis_quant_stop=-0.424765;SVTYPE=INS;SUPTYPE=AL;SVLEN=32;STRANDS=+-;STRANDS2=4,2,4,2;RE=6;REF_strand=20,38;Strandbias_pval=0.186216;AF=0.103448 GT:DR:DV 0/0:52:6 -chr1 893790 22 AAAAAAAAAAAAATATATATATATATATATATATAT N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=893826;STD_quant_start=0.738549;STD_quant_stop=0;Kurtosis_quant_start=-1.74362;Kurtosis_quant_stop=-1.42857;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-36;STRANDS=+-;STRANDS2=13,10,13,10;RE=23;REF_strand=28,24;Strandbias_pval=1;AF=0.442308 GT:DR:DV 0/1:29:23 -chr1 907836 23 N CTGCCCGGTCCTTCTGACCAGCCGAGAGAGTA . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=907836;STD_quant_start=11.7346;STD_quant_stop=12.1491;Kurtosis_quant_start=-0.460251;Kurtosis_quant_stop=-0.470373;SVTYPE=INS;SUPTYPE=AL;SVLEN=32;STRANDS=+-;STRANDS2=6,6,6,6;RE=12;REF_strand=34,32;Strandbias_pval=1;AF=0.181818 GT:DR:DV 0/0:54:12 -chr1 909140 24 TTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGTGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGTGGTCGGGGTCAGGCCCCCGGGCGCATCTTTGCTGGTATATGCGGTGGTCGGGGTCAGGCCCCCCGGGCGCACT N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=909494;STD_quant_start=64.8764;STD_quant_stop=64.2294;Kurtosis_quant_start=-2.03504;Kurtosis_quant_stop=-1.70278;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-354;STRANDS=+-;STRANDS2=20,14,20,14;RE=34;REF_strand=39,33;Strandbias_pval=0.680851;AF=0.472222 GT:DR:DV 0/1:38:34 -chr1 934067 25 GGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAG N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=934880;STD_quant_start=19.0606;STD_quant_stop=20.0499;Kurtosis_quant_start=1.46688;Kurtosis_quant_stop=-1.88984;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-813;STRANDS=+-;STRANDS2=12,14,12,14;RE=26;REF_strand=24,33;Strandbias_pval=0.812937;AF=0.45614 GT:DR:DV 0/1:31:26 -chr1 936289 26 AGGGCTCCTGGACGGAGGGGGTCCCCGGTCCCGCCTCCTA N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=936328;STD_quant_start=5.46316;STD_quant_stop=5.2915;Kurtosis_quant_start=0.217921;Kurtosis_quant_stop=0.801437;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-39;STRANDS=+-;STRANDS2=11,15,11,15;RE=26;REF_strand=24,34;Strandbias_pval=1;AF=0.448276 GT:DR:DV 0/1:32:26 -chr1 948662 27 N CCTGGCTGTCCTTGGTCCCCTGGTCCCTTGGCCCTGCA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=948696;STD_quant_start=12.3786;STD_quant_stop=16.4784;Kurtosis_quant_start=-2.07742;Kurtosis_quant_stop=-2.33448;SVTYPE=INS;SUPTYPE=AL;SVLEN=37;STRANDS=+-;STRANDS2=8,19,8,19;RE=27;REF_strand=18,40;Strandbias_pval=1;AF=0.465517 GT:DR:DV 0/1:31:27 -chr1 964642 28 CAGTGGGGATGTGCTGCCGGGAGGGGGGCGCGGGTCCGCAGTGGGGATGTGCTGCCGGGAGGGGGGCGCGGGTCCGCA N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=964717;STD_quant_start=14.3717;STD_quant_stop=16.1442;Kurtosis_quant_start=-1.6698;Kurtosis_quant_stop=-1.78719;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-75;STRANDS=+-;STRANDS2=12,10,12,10;RE=22;REF_strand=26,22;Strandbias_pval=1;AF=0.458333 GT:DR:DV 0/1:26:22 -chr1 976811 29 N CAACCCCGGGAACCGCCTCCCACTCCCCCCACCAACCCCCGGGAACCGCCTCCCACTTCTCCCGCAACCCCGGGAACTGCCTCCCACTCCCTTCTGCAACCCCCGGGAACCGCTCCCACTCCCCGA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=976916;STD_quant_start=53.1169;STD_quant_stop=72.0569;Kurtosis_quant_start=-0.975517;Kurtosis_quant_stop=-0.734689;SVTYPE=INS;SUPTYPE=AL,SR;SVLEN=87;STRANDS=+-;STRANDS2=9,10,9,10;RE=15;REF_strand=36,36;Strandbias_pval=1;AF=0.208333 GT:DR:DV 0/0:57:15 -chr1 977334 30 N CGCTCCCCACTCCCCCGCAACTTCGGGAACCGCCTCCCCACTCCCCCACCAACCCCTGAACCA . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=977334;STD_quant_start=59.633;STD_quant_stop=54.4259;Kurtosis_quant_start=-1.27956;Kurtosis_quant_stop=-1.57241;SVTYPE=INS;SUPTYPE=AL;SVLEN=131;STRANDS=+-;STRANDS2=11,13,11,13;RE=22;REF_strand=38,36;Strandbias_pval=0.814668;AF=0.297297 GT:DR:DV 0/0:52:22 -chr1 977541 31 N CCCCGGAACCGCTCCCACCGCGCGCAACCCCTGAACCGCCTCCCACTCCCCACCAACCCTGGAACCGCCTCCACTCCCCTCTTACCGTT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=977584;STD_quant_start=48.8615;STD_quant_stop=40.2961;Kurtosis_quant_start=-0.497493;Kurtosis_quant_stop=-0.185665;SVTYPE=INS;SUPTYPE=AL;SVLEN=67;STRANDS=+-;STRANDS2=3,6,3,6;RE=9;REF_strand=36,38;Strandbias_pval=0.490453;AF=0.121622 GT:DR:DV 0/0:65:9 -chr1 977848 32 N ACCAACCGGGGAGCCGCCTCCCCTCCCCCCACCCGCCCCGAGCCGCCTGCCCCCGCCACCAACCCCGGGAACCACCTCCCACTCCCCGCCCAACCCCGGGAACCGCCCCTCCCCTCCCCACG . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=977882;STD_quant_start=61.4687;STD_quant_stop=66.9395;Kurtosis_quant_start=-1.68422;Kurtosis_quant_stop=-1.71073;SVTYPE=INS;SUPTYPE=AL;SVLEN=86;STRANDS=+-;STRANDS2=9,10,9,10;RE=19;REF_strand=34,38;Strandbias_pval=1;AF=0.263889 GT:DR:DV 0/0:53:19 -chr1 988831 33 N AGTTCTGGAGTTGATTGTTTCTCAGAGGTTCAGGGTTGAGTGTTC . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=988831;STD_quant_start=6.13314;STD_quant_stop=6.34277;Kurtosis_quant_start=-1.27849;Kurtosis_quant_stop=-1.31557;SVTYPE=INS;SUPTYPE=AL;SVLEN=46;STRANDS=+-;STRANDS2=11,16,11,16;RE=27;REF_strand=22,36;Strandbias_pval=0.815445;AF=0.465517 GT:DR:DV 0/1:31:27 -chr1 996353 34 N GCACCTACATCTGGGGCCACAGGATGCAGGGTGGGGAGGGCAAGGCCTCTGCGGA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=996353;STD_quant_start=25.8341;STD_quant_stop=26.3869;Kurtosis_quant_start=-1.7722;Kurtosis_quant_stop=-1.00787;SVTYPE=INS;SUPTYPE=AL;SVLEN=64;STRANDS=+-;STRANDS2=12,18,12,18;RE=30;REF_strand=30,52;Strandbias_pval=0.826462;AF=0.365854 GT:DR:DV 0/1:52:30 -chr1 998765 35 N GGGGAGGGCGCTGAGCCGAGGGGGAGGGCTGAGCGGGAG . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=998770;STD_quant_start=11.4935;STD_quant_stop=9.94485;Kurtosis_quant_start=-1.80857;Kurtosis_quant_stop=-1.45908;SVTYPE=INS;SUPTYPE=AL;SVLEN=34;STRANDS=+-;STRANDS2=4,10,4,10;RE=14;REF_strand=26,46;Strandbias_pval=0.762111;AF=0.194444 GT:DR:DV 0/0:58:14 -chr1 1030890 36 TGTGTGTGTGTGCAGTGCATGGTGCTGTGAGATCAGCG N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1030928;STD_quant_start=15.6993;STD_quant_stop=15.5285;Kurtosis_quant_start=0.12083;Kurtosis_quant_stop=-0.088691;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-38;STRANDS=+-;STRANDS2=18,13,18,13;RE=31;REF_strand=39,26;Strandbias_pval=1;AF=0.476923 GT:DR:DV 0/1:34:31 -chr1 1041778 37 GGCCAGTGCCAGGGTCGAGGTGGGCGGCTCCCCCGGGGGAGGGCTG N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1041824;STD_quant_start=15.7567;STD_quant_stop=16.687;Kurtosis_quant_start=-0.850302;Kurtosis_quant_stop=-1.83988;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-46;STRANDS=+-;STRANDS2=11,12,11,12;RE=23;REF_strand=23,25;Strandbias_pval=1;AF=0.479167 GT:DR:DV 0/1:25:23 -chr1 1068748 38 N AAGGCCACGCGGGCTGTGCAGATGCAGGTGCGGCGGGGCGGGCCACGCGGGCTGTGAAGGTGCAGGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1068809;STD_quant_start=28.0891;STD_quant_stop=12.2066;Kurtosis_quant_start=-2.18788;Kurtosis_quant_stop=-1.80804;SVTYPE=INS;SUPTYPE=AL;SVLEN=75;STRANDS=+-;STRANDS2=19,14,19,14;RE=33;REF_strand=42,28;Strandbias_pval=0.832838;AF=0.471429 GT:DR:DV 0/1:37:33 -chr1 1076283 39 GCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGCTGGGAGGCTGAGGCTATGGGGACTCCGTCGGGGGAGGCTGAGTCTATGGGGACTCCGTGGGGGGAGGCTGAGGCTATGGGGACTCCGTGCCGGGAGGCTGAGGCTATGGGGACTCCGTGGGGCAGGCTGAGGCTATGGTGACTCCGTGCAGGGCTGTGAGGCTACGGGGACTCCGTGGGGGGTGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGA N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1076735;STD_quant_start=48.1819;STD_quant_stop=70.5606;Kurtosis_quant_start=0.175533;Kurtosis_quant_stop=-1.43403;SVTYPE=DEL;SUPTYPE=AL,SR;SVLEN=-452;STRANDS=+-;STRANDS2=7,11,7,11;RE=18;REF_strand=18,23;Strandbias_pval=0.780972;AF=0.439024 GT:DR:DV 0/1:23:18 -chr1 1076341 40 GCTGGGAGGCTGAGGCTATGGGGACTCCGTCGGGGGAGGCTGAGTCTATGGGGACTCCGTGGGGGGAGGCTGAGGCTATGGGGACTCCGTGCCGGGAGGCTGAGGCTATGGGGACTCCGTGGGGCAGGCTGAGGCTATGGTGACTCCGTGCAGGGCTGTGAGGCTACGGGGACTCCGTGGGGGGTGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGCCGGGAGGCTGAGGCTACGGGGACTCCGTGCGGGGAGGCTGAGTCTACGGGGACTCCGTGAGGGGTGGCTGAGTCTATGGGGACTCCGTGCGGGGAGGCTGAGTCTATGGGGACTCCGTGCGGGGTGGCTGAGGCTATGGGGACTCCGTGCCGGGAGGCTGAGGCTATGGGGACTCCGTGCCGGGAGGCTGAGTCTATGGGGACTCCGTGCCGGGAGGCTGAGTCTATGGGGACTCTGTGCCGGGAGGCTGAGGCTACGGGGACTCCGTGCCGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTATGGGGACTCCGTGGGGGGAGGCTGAGGCTATGGGGACTCCGTGGGGGGAGGCTGAGTCTATGGGGACTCCGTGCCGAGAGGCTGAGTCTATGGGGACTCCGTGCCGGGAGGCTGAGTCTATGGGGACTCCGTGCCGGGAGGCTGAGTCTATGGGGACTCCGTTGGGGGAGGCTGAGGCTATGGGGACTCCGTTGGGGGAGGCTGAGGCTATGGGGACTCCGTTGGGGGAGGCTGAGGCTATGGGGACTCCGTGCGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGG N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1077781;STD_quant_start=341.572;STD_quant_stop=478.388;Kurtosis_quant_start=-1.1971;Kurtosis_quant_stop=-0.402974;SVTYPE=DEL;SUPTYPE=AL,SR;SVLEN=-1440;STRANDS=+-;STRANDS2=3,2,3,2;RE=5;REF_strand=19,23;Strandbias_pval=0.653637;AF=0.119048 GT:DR:DV 0/0:37:5 -chr1 1080919 41 N CTGTCCTTCTCACTTCCTGCCTCGGTCTCTGTCTCCTTCCCTCCGCCCTACCTCGGTCCTATCATCCTTCCTCGCCTACCTCAGGTCCCTGTCTCCTTCCCTCCATACACACTCGGTCCCTGTCTCTCTTCCCTCCGCCTGGTCCCTGTCTCCTTCCCTCCTTCCCCCCACCTCCGGTCCTGTCTCCTTCCCTCCCTTCCGCCTCAGTCTGTCTCACTTCCCTCCGCCCACCTCAGTCCCCTGTCTCCTTCCCTCCCACACTTCCTCTGGTCCTGTCTCCTTCCCTCAAGCCCCCTCAGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1080919;STD_quant_start=32.45;STD_quant_stop=47.856;Kurtosis_quant_start=-1.30067;Kurtosis_quant_stop=-1.55852;SVTYPE=INS;SUPTYPE=AL;SVLEN=289;STRANDS=+-;STRANDS2=12,9,12,9;RE=21;REF_strand=26,20;Strandbias_pval=1;AF=0.456522 GT:DR:DV 0/1:25:21 -chr1 1139106 42 GGGTCAGAAGGTGGGGGTGTCAACGTCGAACCGGGGGACCTGGGTCCTGGGGAGTTTCCTGGGGTCAGAAGGTAGGGGTGTCAATGTCGAACCGGGGGACCTGGGTCCTGGGGAGCTTCCT N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1139227;STD_quant_start=15.678;STD_quant_stop=18.4038;Kurtosis_quant_start=-2.1908;Kurtosis_quant_stop=-2.23094;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-121;STRANDS=+-;STRANDS2=15,6,15,6;RE=21;REF_strand=41,31;Strandbias_pval=0.312886;AF=0.291667 GT:DR:DV 0/0:51:21 -chr1 1140200 43 AGGTGGGGGTGTCAACGTCGAACCGGGGGGCCTGGGTCCTGGGGAGCTTCCTGGGGTC N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1140259;STD_quant_start=17.0822;STD_quant_stop=14.6151;Kurtosis_quant_start=0.610656;Kurtosis_quant_stop=0.583042;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-59;STRANDS=+-;STRANDS2=14,6,14,6;RE=20;REF_strand=44,36;Strandbias_pval=0.312112;AF=0.25 GT:DR:DV 0/0:60:20 -chr1 1140410 44 N CGTCCGAACCGGGGGGACCTGGGTCCTGGGAGCTTCCTGGGTTCAGAAGGTGGGGGTGTCAGCATCGAACCGGGGGACCTGAGTCCTGGGAGCTTCCTGGGGTCAGAAGGTGGGGGTGTCAGCATCGAACCGGGGGACCTGGGTCCTGGGGGCTTCCTGGGGTCAGAAGGTGGGGGTGTCAACATCGAACCGGGGGGCCCTGGGAGTCCTGGGAGCTTCTGGGGTCAGAAGGTGGGAGTGTCCAGCATCGAACCGGGGGGACCTGGGTCCTGGGGAGCTTCCTGGGGTCAGAAGGTAGGGGTGTCAGCATCGAACCGGGGGACCTGGGTCATGGGGGCTTCCTGGGGTCAGAAGGTGGGGGTGTCAACGTCGAACCGGGGGGCCTGGGTCCTGGGAGCTTCCTGGGGTCAGAAGGTAGGGGTGTCAACGTCAGACAGGGGACCTGGGTCCTGGGAGCTTCCTGGGGTCAGAAGGTGGGGGTGTCAACGCGTCGAACCGGGGGGACCTGGGTCCTGGGGAGCTTCCTGGGGTCAGAAGGTGGGGGTGTCCAG . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1140410;STD_quant_start=232.849;STD_quant_stop=281.385;Kurtosis_quant_start=1.04118;Kurtosis_quant_stop=0.939709;SVTYPE=INS;SUPTYPE=AL;SVLEN=536;STRANDS=+-;STRANDS2=4,7,4,7;RE=11;REF_strand=42,40;Strandbias_pval=0.522879;AF=0.134146 GT:DR:DV 0/0:71:11 -chr1 1141388 45 N TCATCCTCTGTCCACAACCCCATCCTTACCTCTATCCCCCACCTTACATCTCATTCCTCTATCCCT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1141394;STD_quant_start=10.6724;STD_quant_stop=14.1704;Kurtosis_quant_start=-1.44054;Kurtosis_quant_stop=-1.65538;SVTYPE=INS;SUPTYPE=AL;SVLEN=55;STRANDS=+-;STRANDS2=14,7,14,7;RE=21;REF_strand=44,40;Strandbias_pval=0.327428;AF=0.25 GT:DR:DV 0/0:63:21 -chr1 1168031 46 CGGGGCCAGCAGACGGGTGAGGGCGGAGGGCCGA N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1168063;STD_quant_start=14.2864;STD_quant_stop=13.7514;Kurtosis_quant_start=-1.22974;Kurtosis_quant_stop=-1.1581;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;STRANDS2=7,8,7,8;RE=15;REF_strand=38,32;Strandbias_pval=0.776548;AF=0.214286 GT:DR:DV 0/0:55:15 -chr1 1212606 47 N CAGCCCTCCTCCCAGCCCCTGGCTCCCTCTGCCCCCTCA . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1212613;STD_quant_start=10.1207;STD_quant_stop=8.7014;Kurtosis_quant_start=-0.032446;Kurtosis_quant_stop=-1.09448;SVTYPE=INS;SUPTYPE=AL;SVLEN=32;STRANDS=+-;STRANDS2=2,5,2,5;RE=7;REF_strand=30,28;Strandbias_pval=0.42665;AF=0.12069 GT:DR:DV 0/0:51:7 -chr1 1226331 48 CCCTCAACCCTGTACGGTCAGGAGGAAACATGGCACCTCCCCTCTGGGGGCTCTTTCCAGAAAC N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1226395;STD_quant_start=5.74456;STD_quant_stop=5.1672;Kurtosis_quant_start=-1.50778;Kurtosis_quant_stop=-1.62886;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-64;STRANDS=+-;STRANDS2=8,7,8,7;RE=15;REF_strand=34,39;Strandbias_pval=0.77815;AF=0.205479 GT:DR:DV 0/0:58:15 -chr1 1227293 49 GCGGGAAGGCGAGCTCGTGGCCAGGCCCTGCGGGAAGGCGAGCTCGTGGCCAGGCCCGGCGGGAAGGCGAGCTCGTGGCCAGGCCCGGCGGGAAGGCGAGCTCGTGGCCAGGCCCGGCGGGAAGGCGAGCTCGTGGCCAGGCCCTGCGGGAAGGCGAGCTCGTGGCCAGGCCCT N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1227467;STD_quant_start=2.14476;STD_quant_stop=1.34164;Kurtosis_quant_start=0.1517;Kurtosis_quant_stop=0.426462;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-174;STRANDS=+-;STRANDS2=8,8,8,8;RE=16;REF_strand=32,42;Strandbias_pval=0.782406;AF=0.216216 GT:DR:DV 0/0:58:16 -chr1 1240679 50 N CCGCCCCCATTCACCCCGGCCGTGGTCCCTACCGCAGCCCCA . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1240683;STD_quant_start=7.86398;STD_quant_stop=8.59008;Kurtosis_quant_start=-1.97851;Kurtosis_quant_stop=-1.85397;SVTYPE=INS;SUPTYPE=AL;SVLEN=40;STRANDS=+-;STRANDS2=16,22,16,22;RE=38;REF_strand=34,48;Strandbias_pval=1;AF=0.463415 GT:DR:DV 0/1:44:38 -chr1 1245159 51 N CTCTGCCCTCCTCCCACCTTCCCCCTCCTCCCCCCACTCCCTCTCCCCTCTTCCCCCGACTCCCTTCCCCTACTCATCTCCTCCTCACCCACTCCTCTCCCCCTCCTCTCCCACTCCTCCCCCTCCTCCCCCCCACTCCTCCCCCCACTGCACTCTCCCCTCTTCCCCCACTCCTCCCCACTCCTCTCCCCTCCTTCTCACCTCCTCTCCCCTCCTCCTCCTCCTGTCCCTCCTCCCCCTCTTCCCCCTCCTCCCCATATACCCTCCTCCTCCTCTCCCTCTTCCTCCCACTCCCCCCACTCCTCCCCACTCCTCTCCCCTCTTGCCCCTCCTCCCTACCACTCCTTCCTCCTCTCCTCTCTTCCCCCCACTCCCTCCCCCACTCCTCTCCTCCTCCACCTCCTCTCCCCTCCTCCCCCACTCCTCTCT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1245159;STD_quant_start=3.43996;STD_quant_stop=14.8436;Kurtosis_quant_start=4.9032;Kurtosis_quant_stop=3.59937;SVTYPE=INS;SUPTYPE=AL;SVLEN=439;STRANDS=+-;STRANDS2=12,13,12,13;RE=25;REF_strand=28,36;Strandbias_pval=0.813884;AF=0.390625 GT:DR:DV 0/1:39:25 -chr1 1248060 52 GATCTCCAACTCTGACCTACAGGCAGGAAAGTGGGCAGCCCTGGGAGGCTGGACTGAGGGAGGCTGGACTTCCCACTCAGGCCTACACGCAGGAAAATGGGCAGCCCTGGGAGGCTGGACCGAGGGAGGCTGGGCCTCCCACTCCACCCTACAGGCCAGGACACGGGCAGCCCTGGGAGGCTAGACCGAGGGAGGCTGGGCCTCCCATCTACCCTACAGGCCGGGACACAGGCAGCCCTGGGAGGCTGTACCGAGGGA N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1248319;STD_quant_start=52.6064;STD_quant_stop=21.0815;Kurtosis_quant_start=3.96322;Kurtosis_quant_stop=3.62028;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-259;STRANDS=+-;STRANDS2=2,5,2,5;RE=7;REF_strand=23,28;Strandbias_pval=0.686983;AF=0.137255 GT:DR:DV 0/0:44:7 -chr1 1249348 53 N CGCTCACACCCTACAGGCCGGGACACGGGCAGCCCTGGGAGGCTGGACCGGGGCTGGGCCTCCCCTCCGCCCTACAGGCCGGGACACGGGCAGCCCTGGGGCTGGACCAGGGGAGGCGCCAGGCCTCCCACTCGCCTACAGGCCGGGACACGGGCAGCCCTGGGAGGCTGGACCCCGAGGGAGGCTGGAGCCTC . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1249395;STD_quant_start=37.7478;STD_quant_stop=65.9212;Kurtosis_quant_start=-1.75534;Kurtosis_quant_stop=-1.77327;SVTYPE=INS;SUPTYPE=AL;SVLEN=124;STRANDS=+-;STRANDS2=8,10,8,10;RE=18;REF_strand=26,28;Strandbias_pval=1;AF=0.333333 GT:DR:DV 0/1:36:18 -chr1 1249588 54 N TGGGATCGAGAGCTGGCTCCCACCGCCTCCCAGGCCTGGACACTGCAGCCCTGGGAGGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1249625;STD_quant_start=47.571;STD_quant_stop=45.3122;Kurtosis_quant_start=-1.76881;Kurtosis_quant_stop=-0.375917;SVTYPE=INS;SUPTYPE=AL;SVLEN=61;STRANDS=+-;STRANDS2=6,6,6,6;RE=12;REF_strand=26,28;Strandbias_pval=1;AF=0.222222 GT:DR:DV 0/0:42:12 -chr1 1284183 55 N TGAGGGGGTGGGGTGGGGGTTGAGTGAGGGGGTGGGGGGGTTGGGTGAGGGGGGTGGGGGGTTGGGTGAGGGGGTGGGGGGCTGGTGAGGGGGTGGGGTTGGGTGAGGGGGTGGGCTCGGGGGGGGTTGAGTGAGGGGGTGGGGTGGGGGGTTGGGTGAGGGGGGTGGGGTGGGGGTTGAGGAGGGGGTGGGGTGTTATGAGGGGTTGGGGGTTGGGTGAGGGGGGGTGGGGGTTGCGAGGGGGTGGGGGGTGGGGGGGTTGAGTGAGGGGTT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1284201;STD_quant_start=9.78895;STD_quant_stop=64.8333;Kurtosis_quant_start=-1.28536;Kurtosis_quant_stop=-1.87274;SVTYPE=INS;SUPTYPE=AL;SVLEN=282;STRANDS=+-;STRANDS2=18,16,18,16;RE=34;REF_strand=49,32;Strandbias_pval=0.535485;AF=0.419753 GT:DR:DV 0/1:47:34 -chr1 1288944 56 N CGTGTCCCTGCTCCGGGCCCCGTGTCTCTGTTCACTGGCCCCCGTGTCTCTGCTCCTCGTCCCGTGTCCCTTGCTCCGCCCTGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTTCTGCTCCGTCCTGTGTCTCTTGCTCCGGCCCCCGCGGTCTCTGCT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1289049;STD_quant_start=87.6482;STD_quant_stop=60.9024;Kurtosis_quant_start=-0.741898;Kurtosis_quant_stop=-0.516487;SVTYPE=INS;SUPTYPE=AL;SVLEN=64;STRANDS=+-;STRANDS2=6,5,6,5;RE=11;REF_strand=28,34;Strandbias_pval=0.74488;AF=0.177419 GT:DR:DV 0/0:51:11 -chr1 1289357 57 N TGCTCCGTCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCTCCCCGTGTCTCTGCCCCGTCCCGTGTCTACTCCGTCCCGGTCTCTGCTCCGTCCCCCGTGTCTACTCCGTCCCCCGTGTCTCTGCTCCGTCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCGTCCCCGTGTCTCTGCTCCGTCCGTGTCTCTGCTCAATCCCCCGTGTCTCTGCCCCGTCCCGATGTCTCTGCTCCGTCGATGTCTCTATGAGCTCTCCCGTGTCTGCTCCGTCCGTGTCTCTGCTCCGTCCGATGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1289476;STD_quant_start=159.932;STD_quant_stop=97.4515;Kurtosis_quant_start=-1.84527;Kurtosis_quant_stop=-1.54002;SVTYPE=INS;SUPTYPE=AL;SVLEN=162;STRANDS=+-;STRANDS2=8,9,8,9;RE=16;REF_strand=28,32;Strandbias_pval=1;AF=0.266667 GT:DR:DV 0/0:44:16 -chr1 1289780 58 N CCCCGTGTCTCTGCTCCGTCCGTGTCTACTCCGTCCCGATGTCTCTGCCACGTCCCCGTGTCTCTGCCCCGTCCCCGTGTCTCTGCCCCGTC . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1289823;STD_quant_start=26.7955;STD_quant_stop=30.8675;Kurtosis_quant_start=-0.210759;Kurtosis_quant_stop=-0.202563;SVTYPE=INS;SUPTYPE=AL;SVLEN=91;STRANDS=+-;STRANDS2=8,7,8,7;RE=15;REF_strand=28,30;Strandbias_pval=0.778387;AF=0.258621 GT:DR:DV 0/0:43:15 -chr1 1290106 59 N GTGTCTCTGTCTGGCCCCCCGTGTCTCTGCTCCAGCCCCGTGCCCTGCTCCTCATT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1290106;STD_quant_start=38.9923;STD_quant_stop=30.4023;Kurtosis_quant_start=0.491131;Kurtosis_quant_stop=-0.169138;SVTYPE=INS;SUPTYPE=AL;SVLEN=103;STRANDS=+-;STRANDS2=8,9,8,9;RE=16;REF_strand=30,34;Strandbias_pval=1;AF=0.25 GT:DR:DV 0/0:48:16 -chr1 1324173 60 N GGGCTCAGGGGCTGGGGGCTGCTGGGCTGAGGCTGGGGAGACTGGA . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1324173;STD_quant_start=5.02933;STD_quant_stop=13.0384;Kurtosis_quant_start=-1.45664;Kurtosis_quant_stop=-1.54076;SVTYPE=INS;SUPTYPE=AL;SVLEN=66;STRANDS=+-;STRANDS2=19,16,19,16;RE=35;REF_strand=38,36;Strandbias_pval=0.838779;AF=0.472973 GT:DR:DV 0/1:39:35 -chr1 1350109 61 GGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCTGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCTGGAGCGACGGGGGGAGTGAG N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1351186;STD_quant_start=20.5878;STD_quant_stop=20.3663;Kurtosis_quant_start=-0.011268;Kurtosis_quant_stop=-1.64329;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-1077;STRANDS=+-;STRANDS2=15,13,15,13;RE=28;REF_strand=32,29;Strandbias_pval=1;AF=0.459016 GT:DR:DV 0/1:33:28 -chr1 1366913 62 TGAATTGGTGAGTTGGTGTGAATTGAATTGTGTGAATGAGTGGATTGGTGAGTGAATTGGTGAGTTGAATTGGTGTGTGTAGTGGATGAGTGTGGATGAATGTGAATTGGCGAGTATGGA N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1367033;STD_quant_start=14.0961;STD_quant_stop=39.5335;Kurtosis_quant_start=0.90213;Kurtosis_quant_stop=3.80352;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-120;STRANDS=+-;STRANDS2=7,3,7,3;RE=10;REF_strand=36,30;Strandbias_pval=0.499306;AF=0.151515 GT:DR:DV 0/0:56:10 -chr1 1382683 63 N CAACAATCCAGTAACAATCCAGAGGTCACCACCCTTCCCAACAATCCAGTAATCCAGAGGTTACCACCCTTCCCAACAATCCACTAACAATCCAGAGGCCACCACCCTTCCCAGCAATCGGCAAGGACCCAGAGGCCACCACCCCTTCCCAACAATCCAGTAACAATCCAGAGGGTCACCACCCCTTCCCAAAATCAGTAACCAGGGAGTCCACCACCCCTTCCCAACAATCCAGTAACAATCCAGAGGCCACCACCCCTTCCCACAACAATCCAGTAACAATCCAGAGGTACCACCCTTCCCAACAATCCAGTAACAATCGACCACCACCCTTCCCAACAATCCAGTAACAATCCAGAGGACACCACCCTTCCCAGCAATCCACTAGCAATCCAGAGGCCACCACCCCTTCCCAACAATCTGGCTTAGCGACCAGAGAGCCACCACCCCTTCCCAACAATCAGTAACAATCCAGGAGTCACCACCGCTT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1382683;STD_quant_start=27.5276;STD_quant_stop=144.602;Kurtosis_quant_start=-2.03439;Kurtosis_quant_stop=-2.16418;SVTYPE=INS;SUPTYPE=AL;SVLEN=494;STRANDS=+-;STRANDS2=11,15,11,15;RE=26;REF_strand=24,34;Strandbias_pval=1;AF=0.448276 GT:DR:DV 0/1:32:26 -chr1 1427516 64 N TCCCAAGTCTCGGCCTCCCTCTCCACCCCTCCCCTTTCCCCTGCATCACCCCGCCCAGCCCCCACCCCTCCATCACCCTGCTCCCGCCCCCTCCCCTCCATCCTGCCCCCCTCCCCCTCCATCACCCTGCCCAGCCCCCTCCCCTCCATCACTCCCAAGCCCTGCCCCCTTCCATCACCCTGCCCTGCCCCCACCCCATCACCCTGCCCTGCCCCCTTCCCCTCCATCATCCCGCCCGCTCCCCTCTCCACCCCTCCCTCTCCCCTGCATCACTCCCTGCCCTGCCCCTTTCCCCCTCCATCACCCCAGCCTCTGCCCCTCCCCTCCCCTCCATCACCCTGCCCTGCCCTCCTCTCTCCATCACTCCCTGTCTCTGCCCCCACCCCTCCATCATTCTGCCCTGCTCCTCTCCACCTCCCCCTTCCCCTGCATCACCCAGCCTTCTAAGTTCCTTCCTCCATCAATTCTGCCTCTGCCCCTCCCCTCCATCACTCCTGGCACTCTGCCCTCCCTCCATCACTCCTGCTCCTGCCCCCCACCCCTCCATCATCTTCACACTTTCCACCCCTCCCTTCCCTTCCCCTGCATCATCTGCACTCCTGCCTCTTCCCCCTCCATCACCCTGCCCAGCCCCCTCCCCTCCACTGCTTGCGCCCTCCTCCATCACACCCCGGCCCTGCCCCCAGCTCCG . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1427892;STD_quant_start=48.4345;STD_quant_stop=77.4325;Kurtosis_quant_start=-1.98608;Kurtosis_quant_stop=-1.067;SVTYPE=INS;SUPTYPE=AL;SVLEN=439;STRANDS=+-;STRANDS2=12,9,12,9;RE=21;REF_strand=38,32;Strandbias_pval=1;AF=0.3 GT:DR:DV 0/0:49:21 -chr1 1428085 65 N AGGAGGGAGGGGGAGGAGGGGAGGAAGAAGAAGGAGGAAGAGGAAGGAGGAAAAGAGGAGGAGGAAAGAGAGAGGAAGAAAGGAGGGGAGGAGAAAGAGGAGGGGACAGGAGGGAAGGAGGAGAGAAAGAGGAAAAGAAAGGAGGGAGGGAAGGAGAGAGGAGAGGAAGAGAGAGACAGGGAAGGGAAAGAAAAACAGGGAGGGGAAGGAGGAGGAAGAGGAGGGAAGGAAGAAGAGGAGGAGAGGGAGGGAAGAGAGGAGGGAAAGAGGGAGGAGGAAGAGGGGGCAGGGGAGGAAGAAGAGAACA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1428085;STD_quant_start=33.6102;STD_quant_stop=42.9676;Kurtosis_quant_start=4.12818;Kurtosis_quant_stop=-1.69999;SVTYPE=INS;SUPTYPE=AL;SVLEN=222;STRANDS=+-;STRANDS2=16,13,16,13;RE=29;REF_strand=42,34;Strandbias_pval=1;AF=0.381579 GT:DR:DV 0/1:47:29 -chr1 1442871 66 N TTTCTATGGTAATGGTGATAAACCAAGTCAA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1442871;STD_quant_start=13.6345;STD_quant_stop=12.775;Kurtosis_quant_start=-1.56517;Kurtosis_quant_stop=-1.35954;SVTYPE=INS;SUPTYPE=AL;SVLEN=31;STRANDS=+-;STRANDS2=9,10,9,10;RE=19;REF_strand=32,38;Strandbias_pval=1;AF=0.271429 GT:DR:DV 0/0:51:19 -chr1 1443674 67 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1443708;STD_quant_start=0;STD_quant_stop=1.04881;Kurtosis_quant_start=-0.5;Kurtosis_quant_stop=-1.79438;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-34;STRANDS=+-;STRANDS2=6,14,6,14;RE=20;REF_strand=34,42;Strandbias_pval=0.310419;AF=0.263158 GT:DR:DV 0/0:56:20 -chr1 1469099 68 TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA N . 
STRANDBIAS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1469134;STD_quant_start=10.2652;STD_quant_stop=9.18559;Kurtosis_quant_start=3.94733;Kurtosis_quant_stop=2.94515;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-35;STRANDS=+-;STRANDS2=0,8,0,8;RE=8;REF_strand=42,38;Strandbias_pval=0.00589579;AF=0.1 GT:DR:DV 0/0:72:8 -chr1 1477855 69 N CACCACGCCCGGCTAATGTTGTATTTTTAGTAGAGACGGGTTTCTCCCATGGTCAGGCTGGTCTCTAACTCCCGACCTCAGGTGATCCACCCGCCTCGGCCTCTCAACCAGTTGGGATTACAGGCATGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1477883;STD_quant_start=11.94;STD_quant_stop=21.8689;Kurtosis_quant_start=-1.766;Kurtosis_quant_stop=-1.90683;SVTYPE=INS;SUPTYPE=AL;SVLEN=131;STRANDS=+-;STRANDS2=17,15,17,15;RE=32;REF_strand=36,38;Strandbias_pval=0.832673;AF=0.432432 GT:DR:DV 0/1:42:32 -chr1 1497123 70 N CCTCGGCCTGGGCACGAACGGTCCCATCGAGAGCAGA . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1497123;STD_quant_start=3.84708;STD_quant_stop=4.3589;Kurtosis_quant_start=-1.05282;Kurtosis_quant_stop=-1.08734;SVTYPE=INS;SUPTYPE=AL;SVLEN=40;STRANDS=+-;STRANDS2=7,8,7,8;RE=15;REF_strand=34,36;Strandbias_pval=1;AF=0.214286 GT:DR:DV 0/0:55:15 -chr1 1554173 71 CTAAGGGGTCCCCACGAAGCTGAGCACGAGGCGGATCCGGAC N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1554213;STD_quant_start=9.73653;STD_quant_stop=10.0846;Kurtosis_quant_start=0.855249;Kurtosis_quant_stop=1.94672;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-40;STRANDS=+-;STRANDS2=6,7,6,7;RE=13;REF_strand=34,28;Strandbias_pval=0.760968;AF=0.209677 GT:DR:DV 0/0:49:13 -chr1 1595833 72 GAGCAGAACAGGGAGAGACAGAGAGAGAGAGACAGAGAGAGGCAGACAGAGACAGAGAGAGAGACAGACAC N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1595902;STD_quant_start=33.7313;STD_quant_stop=33.2971;Kurtosis_quant_start=2.73708;Kurtosis_quant_stop=2.38236;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-69;STRANDS=+-;STRANDS2=5,5,5,5;RE=10;REF_strand=36,42;Strandbias_pval=1;AF=0.128205 GT:DR:DV 0/0:68:10 -chr1 1595853 73 N ACAGAGAGACAGAGAGAGAAACAGAGAGACAGAGACAGAGAGGCAGACAGAGAGAGACAGACAGAGAGCAGAACAGGGAGAGACAAAAGAGACAGAGAGAGAGAGACAC . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1595863;STD_quant_start=39.2785;STD_quant_stop=49.7011;Kurtosis_quant_start=-0.536228;Kurtosis_quant_stop=-0.678589;SVTYPE=INS;SUPTYPE=AL;SVLEN=77;STRANDS=+-;STRANDS2=3,2,3,2;RE=5;REF_strand=36,42;Strandbias_pval=0.661994;AF=0.0641026 GT:DR:DV 0/0:73:5 -chr1 1605690 74 N GGCTGGGCTGGTCAGGTGTAGGCTGGGCTGGTCAGGCGTGGAGTGGGCTGGTCAGGCGTGGGGTGGGGTGGGCTGGTCAGGTGTGGGCTGGGCCTGGTCAGGTGTGAGGTGGGGTGGTGGGGGTGAGGGGGTTGTCTGGTCAGGTGTGGAGTGGGCTGGTCAGGTGTGGGCTGGGCTGGTCCAGACAGGGTCGGCTGGTCAGGTGTGGGCTGGGCTGGGCTGGTCAGGTGTGGGGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1605690;STD_quant_start=28.6112;STD_quant_stop=43.4385;Kurtosis_quant_start=-0.191101;Kurtosis_quant_stop=-1.21501;SVTYPE=INS;SUPTYPE=AL;SVLEN=226;STRANDS=+-;STRANDS2=7,7,7,7;RE=14;REF_strand=38,40;Strandbias_pval=1;AF=0.179487 GT:DR:DV 0/0:64:14 -chr1 1666975 75 CACGCCTGTAATCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATCACTTCAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCAAACCAGAGAAATCCAGCTCTGGGTGACAGAGCAAGACTCTGTTTCGGGAAAAATAAAATACATAGGCAGGGCGCGGTGGCT N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1667142;STD_quant_start=0;STD_quant_stop=0;Kurtosis_quant_start=11.8809;Kurtosis_quant_stop=8.99409;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-167;STRANDS=+-;STRANDS2=18,14,18,14;RE=32;REF_strand=38,28;Strandbias_pval=1;AF=0.484848 GT:DR:DV 0/1:34:32 -chr1 1681989 76 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA N . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1682020;STD_quant_start=0;STD_quant_stop=1.30384;Kurtosis_quant_start=6.9449;Kurtosis_quant_stop=-0.962407;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-31;STRANDS=+-;STRANDS2=10,8,10,8;RE=18;REF_strand=34,26;Strandbias_pval=1;AF=0.3 GT:DR:DV 0/0:42:18 -chr1 1717605 77 GCTTTCAGCTAGAGTTTGCTCTCTCTGGTTTTCGGTCTGTGACACACGCAT N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1717656;STD_quant_start=8.22935;STD_quant_stop=8.90381;Kurtosis_quant_start=-1.60307;Kurtosis_quant_stop=-1.75287;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-51;STRANDS=+-;STRANDS2=18,19,18,19;RE=37;REF_strand=40,39;Strandbias_pval=1;AF=0.468354 GT:DR:DV 0/1:42:37 -chr1 1749606 78 N GTCCATGCATATTTTTCTGTGTGATGTGTCTGTGTGTGTGTCTCAGTGGT . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1749616;STD_quant_start=6.64118;STD_quant_stop=6.18572;Kurtosis_quant_start=-1.82765;Kurtosis_quant_stop=-1.82115;SVTYPE=INS;SUPTYPE=AL;SVLEN=48;STRANDS=+-;STRANDS2=19,19,19,19;RE=38;REF_strand=42,38;Strandbias_pval=0.84535;AF=0.475 GT:DR:DV 0/1:42:38 -chr1 1766411 79 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1766446;STD_quant_start=2.64575;STD_quant_stop=6.72681;Kurtosis_quant_start=1;Kurtosis_quant_stop=2.21022;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-35;STRANDS=+-;STRANDS2=2,6,2,6;RE=8;REF_strand=42,40;Strandbias_pval=0.267342;AF=0.097561 GT:DR:DV 0/0:74:8 -chr1 1845825 80 ACACACACACACACACACACACACACACACAC N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1845857;STD_quant_start=4.42396;STD_quant_stop=5.59336;Kurtosis_quant_start=0.08546;Kurtosis_quant_stop=1.51911;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;STRANDS2=4,3,4,3;RE=7;REF_strand=26,26;Strandbias_pval=1;AF=0.134615 GT:DR:DV 0/0:45:7 -chr1 1924230 81 N CCCCCAGCCTGCAGCCCACCCCCCCATCTCACCGCCTAGCCCCCATCTCACCAGCTGCCCCCTCCCCGACACACGCCCACCCCCTTATCTCACCAACCA . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1924231;STD_quant_start=0.948683;STD_quant_stop=2.72029;Kurtosis_quant_start=0.969632;Kurtosis_quant_stop=-0.273855;SVTYPE=INS;SUPTYPE=AL;SVLEN=96;STRANDS=+-;STRANDS2=7,7,7,7;RE=14;REF_strand=36,28;Strandbias_pval=0.770084;AF=0.21875 GT:DR:DV 0/0:50:14 -chr1 1929385 82 N AGGGGACAGGTCTGGGGGGGAGGCAGGAGAGAGGGTGAGGGGGAGGCAGGAGTGGGGGAGGGAGGGGAGAGGGTAGGGAGGGAGGAGAGGGTAGGGGGAGGGAGGGAGAGAGGAGGAGGGGAGAGGGTGGGAGGGAGAGAGGAGGAGAAGGGAGGGGACATGGGGAGGGGAGAGGAAAGAGGAGGGAGGGAGAGGGGAGGGAGGGAGCGGGTGAGGGGAGGGAAAGGAGGGAAATGGTATGGGAGGGGAGGGAGGGGAGAGGGTGAGGGGGAGGGAGCAGAGGGAAAGGGTGGGGGAGGGAAGGAAGGGAGAGGGTGGGGGAGGGTAGGGAGGGAGGGAGAGAGAGGGTAGGGGGAGGGGGAGAGAGGGTGAGGAGGGGGAGGGTAGGGGAGGGAAGGAGGGGAGACGGTGAGGGAGGGAGGAGAGGGTAGGGGGGAGGGAGGAAGAGGAGGGGTAGGGAGGGAGGGAGAGGAGAGGGAGGAGGGGAGGAGGGGGAGAGAGGGGTAGGGAGGGAGGGGAGGGAGGGAAGAGGGTAGGAGGGAGGGAGAGGAGAGGGAGGGAGGGAGGGGAGGAGGGAGGGTGGGAGGAGGGAGAGGGTTAGGGGAGGGAGGGAGAGGGAGGGGGAGAGGGTAGGGAGGAGAGGAGGAGAGGGTAGAGGAGGGAGGAGGGGAGAGGGGAGGGGAGGGAGGGAGAAGAGGAGGGAGAGGGTAGGGAGGGAGGGAGAGGAGAGGGGGAGAGGGAGGAGGAGGAGGAGAGGGTAGGGAGGAGGGGAGGAGGGGAGGGGTAGGGAGGGAGGGAGAGGAGGGAGGGAGGGAGGGGGAGGAGGGGGAGAAAGTTAGGGAGGGAGGGAGAGGAGAGGGGGAGGGAGGGAGGGGGAGGAGGAGAGGGGTAGGGAGGGAGGAAGGGAGGGAGGGAGGAGGGCAGGAGGGGAAATTGGGAGGGAGGGGCAGGAGGGAGAGGGTAGGGAGGGAGGGCAGGAGGGAGAGGGTAGGGAGGGAGGGAGGAAGGGAGGGAGGTAGGGAGGAGGAGGAGAGGGTAGGGAGGGAGGAGGAGGGGA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1929385;STD_quant_start=0;STD_quant_stop=12.4023;Kurtosis_quant_start=0.385661;Kurtosis_quant_stop=-0.139538;SVTYPE=INS;SUPTYPE=AL;SVLEN=1062;STRANDS=+-;STRANDS2=14,9,14,9;RE=23;REF_strand=46,34;Strandbias_pval=0.814909;AF=0.2875 GT:DR:DV 0/0:57:23 -chr1 1934289 83 N TACACAGGTGTACATTAGATTATTAGGTTGTGAAT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1934289;STD_quant_start=26.898;STD_quant_stop=24.4172;Kurtosis_quant_start=-1.47698;Kurtosis_quant_stop=-1.25573;SVTYPE=INS;SUPTYPE=AL;SVLEN=98;STRANDS=+-;STRANDS2=10,4,10,4;RE=14;REF_strand=34,20;Strandbias_pval=0.755487;AF=0.259259 GT:DR:DV 0/0:40:14 -chr1 1949003 84 N CTTCCCTTCCCCTTCCTTCCTTCTCTCCCTCTCCCTCCTCCTCTTCCCTCCTTTCCTTCCTTCCTTTCCCTTTCCTCCTTCCTCTCCCTCCCCTCCTTTCCCCTTTTCATTCCCTCTTCCCT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1949044;STD_quant_start=33.9013;STD_quant_stop=7.46324;Kurtosis_quant_start=-2.16185;Kurtosis_quant_stop=-0.295908;SVTYPE=INS;SUPTYPE=AL;SVLEN=111;STRANDS=+-;STRANDS2=10,9,10,9;RE=19;REF_strand=30,48;Strandbias_pval=0.304283;AF=0.24359 GT:DR:DV 0/0:59:19 -chr1 1968925 85 CCCTCCTGGGGGCTCCGGTCCTGCCCAGCAGCCCCAGGTGAGACAGCGCCTGGCGGCCCCTCCCTAGCT N . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1968994;STD_quant_start=2.28035;STD_quant_stop=2.25832;Kurtosis_quant_start=0.046742;Kurtosis_quant_stop=0.304863;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-69;STRANDS=+-;STRANDS2=7,10,7,10;RE=17;REF_strand=30,42;Strandbias_pval=1;AF=0.236111 GT:DR:DV 0/0:55:17 -chr1 1979021 86 AGGCTGCACAGAACACGTGTGTCGTGCTGAGCTGGGCGTGGGAAGGCGTCATGTGACGAGGCTGCACAGAACATGCGTGTGGTACTGAGCTGGGCGTGGGAAGGTGTCACGTGACAAGGCTGCACAGAACATGTGTGTGGTACTGAGCTGGGCGTGGGAAGGCATCATGTGACA N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1979168;STD_quant_start=12.3369;STD_quant_stop=9.86577;Kurtosis_quant_start=3.47666;Kurtosis_quant_stop=3.76987;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-147;STRANDS=+-;STRANDS2=16,15,16,15;RE=31;REF_strand=34,35;Strandbias_pval=1;AF=0.449275 GT:DR:DV 0/1:38:31 -chr1 1980059 87 CTCTTACCGCGTGGGGAGGACGGGTGAACGAGAGTGTATCTAAGCCACCGGCACAGATCGCAGTGGGCGCCCTCTTACCGCGTGGGGAGGACGGGTGAACGAGAGACTGTATCTAAGCCACCGGCACAGATCGCAGTGGGCGCCCTCTTACCGCGTGGGGAGGACGGGTGAACGAGAGACTGTATCTAAGCCACCGGCACAGATCGCAGTGGGCGCCCT N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1980291;STD_quant_start=9.01234;STD_quant_stop=11.0454;Kurtosis_quant_start=-1.20742;Kurtosis_quant_stop=-0.902165;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-232;STRANDS=+-;STRANDS2=18,18,18,18;RE=36;REF_strand=35,37;Strandbias_pval=1;AF=0.5 GT:DR:DV 0/1:36:36 -chr1 1981556 88 N CACGCAGGACACACAGCCGCGACGCACACCGGCACGCAGGACACCCAGCCACGGTCACACGCGGGGCACGCAGGACACCCAGCCGCGGTCACATGC . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1981581;STD_quant_start=21.5465;STD_quant_stop=28.327;Kurtosis_quant_start=-1.84901;Kurtosis_quant_stop=-1.66641;SVTYPE=INS;SUPTYPE=AL;SVLEN=34;STRANDS=+-;STRANDS2=10,14,10,14;RE=24;REF_strand=34,40;Strandbias_pval=0.814965;AF=0.324324 GT:DR:DV 0/1:50:24 -chr1 1982045 89 N CGGGGACACGCAGGACACCCAGGACACCCAGCCGCGGACAGACACGGGGGCACACAGGACACCCAGCTCGTGGACAGACA . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1982046;STD_quant_start=5.75698;STD_quant_stop=3.31663;Kurtosis_quant_start=-0.365062;Kurtosis_quant_stop=-0.55588;SVTYPE=INS;SUPTYPE=AL;SVLEN=79;STRANDS=+-;STRANDS2=15,14,15,14;RE=29;REF_strand=36,42;Strandbias_pval=0.666552;AF=0.371795 GT:DR:DV 0/1:49:29 -chr1 1982220 90 N AGATAGACACGGGACACGGACACCCCAGCCGTGACAGACACGGTGACAACACAGACACCCAGCCATGGACAGACACGGGCCACGAGGACACCCAGCCACGGACAGGGACATCGATGGCTTTATGACACTCCAGCCGGTAA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1982220;STD_quant_start=30.0322;STD_quant_stop=21.4957;Kurtosis_quant_start=-1.80704;Kurtosis_quant_stop=-1.88498;SVTYPE=INS;SUPTYPE=AL;SVLEN=206;STRANDS=+-;STRANDS2=12,19,12,19;RE=31;REF_strand=34,46;Strandbias_pval=0.830787;AF=0.3875 GT:DR:DV 0/1:49:31 -chr1 1993705 91 N GGGCACAGTGGCTCATGCCTGTAATCCCAGCAACATGGGAGCCTGAGGTGGGAGGCTCTCTTGACAGGAGTTTGAGACCAGCCTGGGCAACATAGCAGACCCCCCACCCCGCCATTTCTAGGAAAAAAAAAAAAAAAGTGGCC . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=1993712;STD_quant_start=0;STD_quant_stop=2.51396;Kurtosis_quant_start=11.9398;Kurtosis_quant_stop=0.982105;SVTYPE=INS;SUPTYPE=AL;SVLEN=141;STRANDS=+-;STRANDS2=23,27,23,27;RE=50;REF_strand=48,58;Strandbias_pval=1;AF=0.471698 GT:DR:DV 0/1:56:50 -chr1 2019222 92 N GGGGCGGGGGAGGAGAGGGGGGAGGGAGGGGGACCGGGTAGGGTGGGGGGGGGAGGGGAACGGGGAGGGGGCAGGCAGGCGCGGGGTGGGGGGAGGGGAGGGGGAGGGGAGAAGACGGGCAGCGGGAGGGGCGGGGGGAGGGGATGGGGGCGGGGGAGGAGGGCGGCGGGGGAGGGGATGGGGGCGGGGGAGGGGATGGGCGGGGGGAGGGGGA . PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2019227;STD_quant_start=3.28633;STD_quant_stop=4.04969;Kurtosis_quant_start=-0.713007;Kurtosis_quant_stop=0.942016;SVTYPE=INS;SUPTYPE=AL;SVLEN=211;STRANDS=+-;STRANDS2=23,17,23,17;RE=40;REF_strand=46,34;Strandbias_pval=1;AF=0.5 GT:DR:DV 0/1:40:40 -chr1 2106812 93 N CCCTCTGGTGGGCGTAGGACCTGTCACCGTGTCACCAGGCCAGGTAACTCTCAGCAGG . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2106813;STD_quant_start=19.0342;STD_quant_stop=19.9875;Kurtosis_quant_start=-1.45192;Kurtosis_quant_stop=-1.30053;SVTYPE=INS;SUPTYPE=AL;SVLEN=55;STRANDS=+-;STRANDS2=3,12,3,12;RE=15;REF_strand=40,54;Strandbias_pval=0.153747;AF=0.159574 GT:DR:DV 0/0:79:15 -chr1 2110063 94 CCCAACCAAGAGGATCCCAGAGGTGAGACACAGAACGGCCAGGGCTGAATCCGGGGCCCTCCCTGGGGGCAGCCAAGGACCTAAAACCAATGGG N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2110160;STD_quant_start=19.0866;STD_quant_stop=18.9882;Kurtosis_quant_start=-1.18991;Kurtosis_quant_stop=-1.28399;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-97;STRANDS=+-;STRANDS2=20,21,20,21;RE=41;REF_strand=38,45;Strandbias_pval=0.84876;AF=0.493976 GT:DR:DV 0/1:42:41 -chr1 2121520 95 N GGTCATGAGGTGGTAGTTAAGTTATGGTAGTTAG . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2121520;STD_quant_start=0;STD_quant_stop=0.8044;Kurtosis_quant_start=2.24316;Kurtosis_quant_stop=2.47427;SVTYPE=INS;SUPTYPE=AL;SVLEN=33;STRANDS=+-;STRANDS2=21,15,21,15;RE=35;REF_strand=60,50;Strandbias_pval=0.704801;AF=0.318182 GT:DR:DV 0/1:75:35 -chr1 2122244 96 N GTTAGGGTCACGGCGGTGGTTAGGTCGTGGTGGGAGTTAGGGTCACGGTGGTAGTTAGGGTCATGGTGGTAGTTAGGATCATGGCTGTAGTTAGCGTCATGGTGGTAGTTAGGGTCACGGCTATAGTTGGGGTCATGGTGGTAGTTAGGGTCATGGTGGTAGTTATTTAGGGTCACGGCTGTAGTTAGCGTCATGGTGGTGGTTAGGTCATGGTGGTAGTTAGGGGTCACGGCTGTAGTTAGGGTCATGGTGGTGGTTAGGTCACTTGCTGTAGTTAGGGTCATGGTGGTAGTTAGGTCATGGTGGTAGTTAGCGTCATGGTGGTGGTTAGGTCATGGTAGTTAGGGTCACTGCCA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2122256;STD_quant_start=72.1976;STD_quant_stop=164.724;Kurtosis_quant_start=0.262966;Kurtosis_quant_stop=-1.09184;SVTYPE=INS;SUPTYPE=AL;SVLEN=340;STRANDS=+-;STRANDS2=0,6,0,6;RE=6;REF_strand=56,48;Strandbias_pval=0.0120583;AF=0.0576923 GT:DR:DV 0/0:98:6 -chr1 2123322 97 N TAATTGGGATCATGACCATGTGATTGGGGTCATGGTGTTAGTTAAGGTCATGACTGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2123322;STD_quant_start=21.4499;STD_quant_stop=25.9846;Kurtosis_quant_start=-0.532609;Kurtosis_quant_stop=-1.17357;SVTYPE=INS;SUPTYPE=AL;SVLEN=88;STRANDS=+-;STRANDS2=9,11,9,11;RE=20;REF_strand=54,52;Strandbias_pval=0.80797;AF=0.188679 GT:DR:DV 0/0:86:20 -chr1 2123768 98 N GGCTGTGGTTAGGGTCATGGTGGTAGTTAGGATCATGGCTGTAGTTAGGTCATGGTGGTAGGTCTGGTCACGGCTAGTTGGGGTCATGGTGGTAGTTAGATCATGGCTGTAGTTAGGGTCAT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2123768;STD_quant_start=41.4871;STD_quant_stop=39.3296;Kurtosis_quant_start=-1.74964;Kurtosis_quant_stop=-0.19162;SVTYPE=INS;SUPTYPE=AL;SVLEN=112;STRANDS=+-;STRANDS2=11,11,11,11;RE=22;REF_strand=52,50;Strandbias_pval=1;AF=0.215686 GT:DR:DV 0/0:80:22 -chr1 2124290 100 N GGGTCATGGTGGTAGTTAGGATCATGGCTGTAGTTGGGGTCATGGTGGTAGTTAGGGTCACGGCTATAGTTAGGGTCATGGTGGTAGTTATTGGTCTGTGATAGTTAGCATCATGGTGGTAGTTAGGGTCATGGTGGTAGTTAGGGTCATGGTGGTAGTTAGGGTCATGGTGGTAGTTGGGGTCATAGCTGTAGTTAGGGTCATAGTGGTAGTTGGGGTCACGGCTATAGTTG . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2124353;STD_quant_start=62.996;STD_quant_stop=56.9131;Kurtosis_quant_start=-1.3131;Kurtosis_quant_stop=-0.191076;SVTYPE=INS;SUPTYPE=AL;SVLEN=110;STRANDS=+-;STRANDS2=6,7,6,7;RE=13;REF_strand=55,51;Strandbias_pval=0.773897;AF=0.122642 GT:DR:DV 0/0:93:13 -chr1 2124333 99 N AAGGGTCATGGTGGTAATTAGGATCATGTAGCTGTAGTTAGGGTCATGGTGGTAGTTAGGGTCTGGCTATAGTTGGGGTCATGGTGGTAGTTAGGGTCACAGCGATAGTTAGCATCATGGTGGTAGTTAGGGTCATGGTGGTAGATTGGGGTCATGGTGGTAGTTAGGGTCATGGTGGTAGTTAGGGTCATAGCTGTAGTTAGGGTCTGTGGTGGTAGTTGGGGTCCGCGGCTATAGTTGGGGTCCATGGTGGTAGTTAAGGTCACGGCTGTGATTAGCGTCATGGTGGTACGTT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2124347;STD_quant_start=28.9361;STD_quant_stop=52.7655;Kurtosis_quant_start=0.874818;Kurtosis_quant_stop=0.127135;SVTYPE=INS;SUPTYPE=AL;SVLEN=295;STRANDS=+-;STRANDS2=10,9,10,9;RE=19;REF_strand=54,52;Strandbias_pval=1;AF=0.179245 GT:DR:DV 0/0:87:19 -chr1 2142340 101 CTTTCAATCCAGGGTCCACACATCCAGCAGCCGAAGCGCCCTCCTTTCAATCCAGGGTCCAGGCATCTAGCAGCCGAAGCGCCT N . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2142424;STD_quant_start=9.44235;STD_quant_stop=8.28442;Kurtosis_quant_start=7.28739;Kurtosis_quant_stop=4.86915;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-84;STRANDS=+-;STRANDS2=23,15,23,15;RE=38;REF_strand=48,32;Strandbias_pval=1;AF=0.475 GT:DR:DV 0/1:42:38 -chr1 2280758 102 N GCCTCGGGAGAGTGACAGGCGGCGGCGGCGACACCAGAGAGCGGACGAGAGGACAGGCGGCGGCGGCGATCTTTCAGAGAGCGGGATTTTCCCGAGAGGGACAGAGAAGGCGGCGGAGATTGTCTTCAGAGAGAGGAT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2280758;STD_quant_start=32.8507;STD_quant_stop=82.7345;Kurtosis_quant_start=-1.7272;Kurtosis_quant_stop=-1.34469;SVTYPE=INS;SUPTYPE=AL;SVLEN=205;STRANDS=+-;STRANDS2=2,4,2,4;RE=6;REF_strand=32,54;Strandbias_pval=1;AF=0.0697674 GT:DR:DV 0/0:80:6 -chr1 2280945 103 N CGGTGCGGAGAGATCTTCAGAGAGAGGACGCCTGAGAAGACAGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2281014;STD_quant_start=60.2633;STD_quant_stop=47.2864;Kurtosis_quant_start=-0.84573;Kurtosis_quant_stop=-0.496898;SVTYPE=INS;SUPTYPE=AL;SVLEN=51;STRANDS=+-;STRANDS2=1,5,1,5;RE=6;REF_strand=34,54;Strandbias_pval=0.40609;AF=0.0681818 GT:DR:DV 0/0:82:6 -chr1 2281986 104 GAGAGGACGCCCGAGAAGACAGGCGGTGGCGGAGATCTTCAG N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.12;CHR2=chr1;END=2282028;STD_quant_start=21.4103;STD_quant_stop=21.0879;Kurtosis_quant_start=-1.23914;Kurtosis_quant_stop=-0.042349;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-42;STRANDS=+-;STRANDS2=9,11,9,11;RE=20;REF_strand=35,54;Strandbias_pval=0.801434;AF=0.224719 GT:DR:DV 0/0:69:20 +##bcftools_viewCommand=view sniffles.vcf.gz; Date=Mon Jan 17 15:03:26 2022 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT F00209 +chr1 10175 30259 N ]chrX:449442]N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;STD_quant_start=137.151;STD_quant_stop=51.4701;Kurtosis_quant_start=0.180157;Kurtosis_quant_stop=2.98567;SVTYPE=BND;RNAMES=36b45e58-0eba-46a5-9d01-819279aa26ae,66c0071b-0063-42fd-84db-bda11d270a1b,6dd212c3-59c6-48bc-ad46-e83521f1d4a7,8a407a84-39ff-438e-bbb3-939b2de7f56b,9c86fe43-fc57-45d2-a21e-be9c2b2a7ab2,c7599105-7f14-4b32-bcff-2718b1c27e14;SUPTYPE=SR;SVLEN=0;STRANDS=-+;RE=6;REF_strand=0,0;AF=1 GT:DR:DV 1/1:0:6 +chr1 66231 0 AATATATATTATATTATATAATATATAATATAAATATAATA N . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=66272;STD_quant_start=4.70106;STD_quant_stop=2.68328;Kurtosis_quant_start=1.94037;Kurtosis_quant_stop=2.86338;SVTYPE=DEL;RNAMES=18949eae-a81e-4a30-ab2f-714a2a651b3e,303df7a0-ffff-44da-9783-1266c2aa8e43,42f0045c-6ce4-4454-89b4-dcc55a60aa20,4620fc8c-a235-44f0-ab2f-132e3b23b1a3,46cd3de9-56ec-4a5c-af40-c772fda8c97e,4e2cad12-2931-4e21-8219-62ee48f4a71b,7d47d56b-7cea-4c3c-8499-e4e37d87d4c3,8b613887-454e-4e1c-98c2-e6c7ba7f5a82,96142aba-b9ef-4816-b99f-26d55fc7611e,be826b7d-8200-4805-abde-9b98bd68ae52,dbf59bf4-e594-449d-ab9d-822448f148b2,f6ec6a8f-ce44-4312-bf9e-c1d3ab70bad3;SUPTYPE=AL;SVLEN=-41;STRANDS=+-;RE=12;REF_strand=5,4;AF=0.571429 GT:DR:DV 0/1:9:12 +chr1 136971 1_0 N GTGTGGGAGGGGGCCGGTGTGAGGGCAAGGGGCTCACGCTGACCTCTGTCCGCGTGGGAGGGGCAGTGTGGACGGGGGCTCGGGCTGACCTCTCCCAGCGTGGGAGGGGCCAGTGTGAGGCAAGGAGCTCCACACTGACCTCTCAGCATCGGGAGGGGGCCGGTGTGAGACAGGGGCTCGGGCTGACCTCTCTCCCG . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=137157;STD_quant_start=38.086;STD_quant_stop=40.5911;Kurtosis_quant_start=-1.74556;Kurtosis_quant_stop=-0.884647;SVTYPE=INS;RNAMES=232c6c93-a600-4ff3-913b-ea599f590441,2b160313-a262-4a4c-846c-6840b04830f3,2e0372f6-d634-4380-ba6a-a4ab5ad20d2b,2ec2281b-d0fe-48c1-b5b1-f8e1232d806f,3c3847a1-26c1-490e-8b36-d060b856de07,400b2a8f-a57b-4456-880f-4ee3770e9900,492047a3-a057-4612-ad54-6fcabf3edd04,51603ab9-4512-4957-ab67-388025b73ff7,548de1e6-386a-4a05-b589-b9c43db86d94,744e152e-272a-4871-98d7-51d9427da8ce,75c3d587-3f58-432e-99c3-427cb21d4f83,81b6dafe-1d61-45cf-95e7-1acdaa42e707,91ff0042-3aa3-418e-afc5-0ece769cc09f,9c5b3e92-dbf6-4d9f-9844-7cc97539e1c2,aa0055c1-7f35-44ce-a928-03dcbe35097a,b1d2f26e-496a-4f13-85aa-6add0867ac95,e5e48f9e-34f6-4dc3-8f7d-fe1a50492e82,e89599e5-28ba-4eb4-9916-beb01fddebf7,eb609540-ec5a-4cc2-b9c9-d8161df13388,ee1e714f-1506-494a-b627-03732f6d0fb6,f89425df-020b-4daa-96a7-526e8903e0b4,faf824ea-0cf9-4d2e-ba2c-b3e3b26c40f1,fc2651cd-6016-4c60-afdf-f37c1291e57f;SUPTYPE=AL;SVLEN=94;STRANDS=+-;RE=23;REF_strand=12,14;AF=0.469388 GT:DR:DV 0/1:26:23 +chr1 139403 1_1 GCAGGAGCTGGGCCTGGAGAGGCTGCAAAGAA N . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=139435;STD_quant_start=0;STD_quant_stop=0;Kurtosis_quant_start=0.630869;Kurtosis_quant_stop=0.700919;SVTYPE=DEL;RNAMES=19379760-7221-41d8-b5db-64b06e0aaa71,23253008-8cc6-41cc-a562-482426273785,232c6c93-a600-4ff3-913b-ea599f590441,2b160313-a262-4a4c-846c-6840b04830f3,2ec2281b-d0fe-48c1-b5b1-f8e1232d806f,301da2b9-8ef6-48ce-85e3-05416b6cff9f,31427bc1-7882-452e-9d65-eb196ccdcbde,39875412-abe8-4c09-908c-d7358b9d1244,3c3847a1-26c1-490e-8b36-d060b856de07,400b2a8f-a57b-4456-880f-4ee3770e9900,492047a3-a057-4612-ad54-6fcabf3edd04,695b38ca-2d0d-40ee-ade7-94575dfeab24,69e3bbfc-05d9-41ca-97b8-e4100dc134bd,6a895396-2469-45a2-9b53-21325d361d45,744e152e-272a-4871-98d7-51d9427da8ce,8291b073-eb34-4cba-b548-3d7b8fac78f9,91ff0042-3aa3-418e-afc5-0ece769cc09f,9c5b3e92-dbf6-4d9f-9844-7cc97539e1c2,a3c44e09-58a0-4db2-97b2-9d26c8ab1af3,b1d2f26e-496a-4f13-85aa-6add0867ac95,b94fafca-6aa6-4d59-a6a7-e281d3abf987,eb609540-ec5a-4cc2-b9c9-d8161df13388,ee1e714f-1506-494a-b627-03732f6d0fb6,faf824ea-0cf9-4d2e-ba2c-b3e3b26c40f1,fc2651cd-6016-4c60-afdf-f37c1291e57f;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;RE=25;REF_strand=2,6;AF=0.757576 GT:DR:DV 0/1:8:25 +chr1 372612 3 TGATCTGTATATATGTATCATGTAAACATGAGTTCCTGCTGGCATATCTGTCTATAACCGACCACCTTAGGGTCCATTCTGATCTGTATATATGTATAATATATATTATATATGGACCTCAGGGTCCATTCTGATCTGCATATATGTATAATATATATTATATATGGTCCTCAGGGT N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=372770;STD_quant_start=85.6061;STD_quant_stop=72.4707;Kurtosis_quant_start=-1.37847;Kurtosis_quant_stop=0.624284;SVTYPE=DEL;RNAMES=0ea301f0-f842-48c6-99d9-789e5306ca31,51be7816-260b-4aa6-bb11-e7bf4970584b,9e9106b8-8807-402f-9b32-f6ec842256e8,c9381473-3100-4d39-9a7c-17f0952c8329,fa9a877c-5ca5-4c7d-a1ec-7ac05eb89dd0;SUPTYPE=AL;SVLEN=-158;STRANDS=+-;RE=5;REF_strand=3,1;AF=0.555556 GT:DR:DV 0/1:4:5 +chr1 610585 4_0 N TGGGTTCTCTGTGGCCGAACGGGCGCGCGGTGATGCAGGAGATGCCCAGACCTGGCGGCTAAAGGCG . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=610745;STD_quant_start=38.4851;STD_quant_stop=42.1248;Kurtosis_quant_start=-1.68346;Kurtosis_quant_stop=-1.60045;SVTYPE=INS;RNAMES=19f22868-9df9-4dfb-b17f-1f9d4fa49c17,1b5a6250-2329-4d71-8055-5ddd95e62e84,2347a361-390f-4514-bfad-70bb6f85a06e,2769b543-d184-47fb-89d1-668ec575fbeb,2e264729-925c-443a-a171-14f917a224cd,725e215c-6117-42ae-b103-11dd0144b9e4,7aada54b-a736-4131-9d5a-e1196802f17a,83fb6dbf-7f76-460e-824e-c08e23603e92,8535d941-7b29-4387-ad34-baad65338467,90159235-0960-44f7-bc14-f5f4c0f05d03,91de4a73-81b8-4943-b471-8cd9a85075d0,944217ac-85b1-4b0a-9e4b-134a37d2dab4,95778194-63c1-42cb-9c62-29860bebdce9,9909d57d-46b5-4956-a28f-aacc34e97f1c,caa57e28-a0f5-44a0-80e6-9ca10e61de9e,dbf5d80b-fa42-4b4c-944b-ca97ebdf7747,ee509fc4-32f4-4d20-8598-446cd9eff2ea;SUPTYPE=AL;SVLEN=70;STRANDS=+-;RE=17;REF_strand=11,13;AF=0.414634 GT:DR:DV 0/1:24:17 +chr1 611533 4_1 GCTGTGTGAGAACGTGTGTGTAGTGTTCACATGTCCTCTGTGCGTGAGTCCCCGTGTGTGATGTTGTGTTCTCGGTGTGAGTTCATGGGTGTGACG N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=611630;STD_quant_start=13.3454;STD_quant_stop=14.8257;Kurtosis_quant_start=0.822308;Kurtosis_quant_stop=0.534113;SVTYPE=DEL;RNAMES=1b5a6250-2329-4d71-8055-5ddd95e62e84,2e264729-925c-443a-a171-14f917a224cd,32296472-3493-443e-82c0-a123bf1c6203,7aada54b-a736-4131-9d5a-e1196802f17a,83fb6dbf-7f76-460e-824e-c08e23603e92,8535d941-7b29-4387-ad34-baad65338467,91de4a73-81b8-4943-b471-8cd9a85075d0,9909d57d-46b5-4956-a28f-aacc34e97f1c,b4157b20-84b8-49a5-a02c-e64fc3b473da,caa57e28-a0f5-44a0-80e6-9ca10e61de9e,fad6f4a8-5974-4453-b8ec-44dee69ca8ad,fea67814-cd2a-4ca0-887a-0423f749162d;SUPTYPE=AL;SVLEN=-97;STRANDS=+-;RE=12;REF_strand=1,1;AF=0.857143 GT:DR:DV 1/1:2:12 +chr1 744866 6 N TGTATATAAGTATATATATATATATATATATATATATATATATATATATATATATA . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=744916;STD_quant_start=1.34164;STD_quant_stop=6.40312;Kurtosis_quant_start=2;Kurtosis_quant_stop=-0.304343;SVTYPE=INS;RNAMES=31380806-bb1a-44fc-af33-32dafb8b1bd2,60ed3a7f-df66-44b8-9a3d-85ae047cb2d9,6527face-7c09-4095-b4a5-395c81bf0af6,bde72b50-3594-4ce7-9152-524b2041a889,cb820290-c6fb-4b50-9d74-dae21678da93;SUPTYPE=AL;SVLEN=50;STRANDS=+-;RE=5;REF_strand=2,7;AF=0.357143 GT:DR:DV 0/1:9:5 +chr1 820879 7_0 N TTCACTCACCCTGCCTGGCCAGCAGATCCACCCTGTCTACACTCACCTGCCTGGGCAGTAGTTCCACGTGAATCTCCCCTACCTGCCTCTCCAGCAGACCCGCCCCATCTATACTACTTGCCTGTCCAGCAGATCCACTCTATCTGCTACCCACCTGCCTGTCCAGCAGGATCCACCCAAGTCTACCTGCCTCCTGCTTCTTGTCCAGCAGGTCCACCCTGTCTATACTACCTGCCTGGCCAGTAGATCCACACTA . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=821129;STD_quant_start=7.30121;STD_quant_stop=4.91466;Kurtosis_quant_start=3.29404;Kurtosis_quant_stop=-1.30228;SVTYPE=INS;RNAMES=08277f66-4246-4196-a8c6-0e935cefd23b,12a4e196-c480-4f95-add7-01dc344ddc95,173ce38d-faa4-4d49-a590-8f7d91cc381e,30b916a3-789e-4e6b-836f-56b482e17b85,31f1731a-474f-4059-b0af-f35ca10dc0bb,33fb62e8-4b9e-48a5-b3cc-fe874bae84bb,34d557b0-c085-4370-bacb-708eca60fc92,3cf47072-e6cd-4818-b8f9-a563494f585e,5b8f73ed-d36d-4762-8a40-120a7850ad64,5b9c493f-d9b7-4ef0-af6a-da1dbccd4de4,744c7b3c-b4d5-4f77-8eca-3dc88f3ab8a9,840c98a5-9354-4c10-9f1c-bada9822e1a2,889ca98e-fa18-4759-8d64-992080ed5292,9c636e2f-414e-495b-a746-c65d424324ff,9eabda16-e5f3-4f38-8a40-b3e55b54b4ac,ab64bd0c-2bd9-4504-a3c5-d6b5d24de147,bbf9711d-8100-438c-b863-be499e145ac6,bf87f7b4-2063-4be9-a2da-d50684789239,d2431d93-a300-43be-9fe3-53e37023fa04,d2579512-6af0-4071-94bb-e6a335f02a4b,d2a2d42b-9520-42fb-aa83-08120d9c4990,da10e67a-f6bc-43c0-836e-21f5cbd4f89c,da5f3bed-ce32-4afe-977f-dec998afcd20,da95c1a1-06a7-4e80-89ba-8836eb320f83,fd27d273-d4e5-4079-941a-64bd4dd859da,ffa7d785-a137-48b2-bb17-ad92db6d6c42;SUPTYPE=AL;SVLEN=244;STRANDS=+-;RE=26;REF_strand=13,12;AF=0.509804 GT:DR:DV 0/1:25:26 +chr1 822427 7_1 
CCCTGGCCAGCAGATCCACCCTGTCTATACTACCTG N . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=822462;STD_quant_start=7.58837;STD_quant_stop=7.27438;Kurtosis_quant_start=-1.25525;Kurtosis_quant_stop=2.28728;SVTYPE=DEL;RNAMES=08277f66-4246-4196-a8c6-0e935cefd23b,12a4e196-c480-4f95-add7-01dc344ddc95,173ce38d-faa4-4d49-a590-8f7d91cc381e,30b916a3-789e-4e6b-836f-56b482e17b85,31f1731a-474f-4059-b0af-f35ca10dc0bb,33fb62e8-4b9e-48a5-b3cc-fe874bae84bb,34d557b0-c085-4370-bacb-708eca60fc92,3cf47072-e6cd-4818-b8f9-a563494f585e,5b8f73ed-d36d-4762-8a40-120a7850ad64,5b9c493f-d9b7-4ef0-af6a-da1dbccd4de4,6638dfba-bd16-4489-840a-8961dccca88f,744c7b3c-b4d5-4f77-8eca-3dc88f3ab8a9,9c636e2f-414e-495b-a746-c65d424324ff,9eabda16-e5f3-4f38-8a40-b3e55b54b4ac,a39a01e8-9332-42c6-ac8f-d50f08bfa762,ab64bd0c-2bd9-4504-a3c5-d6b5d24de147,bbf9711d-8100-438c-b863-be499e145ac6,bf87f7b4-2063-4be9-a2da-d50684789239,d2579512-6af0-4071-94bb-e6a335f02a4b,da10e67a-f6bc-43c0-836e-21f5cbd4f89c,da5f3bed-ce32-4afe-977f-dec998afcd20,da95c1a1-06a7-4e80-89ba-8836eb320f83,dc757ea7-bb53-4189-862a-762bfcc45215,fd27d273-d4e5-4079-941a-64bd4dd859da;SUPTYPE=AL;SVLEN=-35;STRANDS=+-;RE=24;REF_strand=0,1;AF=0.96 GT:DR:DV 1/1:1:24 +chr1 839472 9 CTAGACACACACACCTGGACAAACACACCTGGACACACACA N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=839513;STD_quant_start=18.0924;STD_quant_stop=18.9209;Kurtosis_quant_start=-0.921195;Kurtosis_quant_stop=-0.652794;SVTYPE=DEL;RNAMES=06082ac4-343f-46cb-8dbb-16044a695d40,173ce38d-faa4-4d49-a590-8f7d91cc381e,290bc1f7-bba6-401e-914e-e3e08dac6ca6,33633824-c73a-43b1-8755-e413b56b11cd,59fe81ea-0f96-4d4f-aec8-6d99e1b74249,935fc28f-4368-4a5d-b152-03e3a54580c9,ac29ef04-2573-42e2-9a2c-22c376c8956b,d2ce726d-3d70-4406-b906-4c83146f54da,db7670c0-06cf-4e31-a4bb-12cbc91ff9ff;SUPTYPE=AL;SVLEN=-41;STRANDS=+-;RE=9;REF_strand=2,11;AF=0.409091 GT:DR:DV 0/1:13:9 +chr1 876055 10_0 N CCCACACTCCCCACACTCCCATACCCCCACACTCCCCCACACTCACCCACACCCCCCCCATACTCCCCAACTCTCCCCATACTCCCCACATTCCCCATACTCCCCCATACTCCCAAACTCCCCCATACTCCCCTATACTCCCCCCATACTCCCCACACTCCCCCATACCCTCCCCCATACTCCTCCCCCATACTCCCATATTCCCCCCATACTCCCCCATACTCCCCCCAAACTCCCCCATACTCCCTCCCCCACACTCCCCCATACTCCCCCCACACTCCCCCACACTCCCTGCAAACTCCCCCATACTCCCCCATACCCCCACACTTTCACACTTCACACTCCCCACACTCCCCCAAACTCCCCCATACTCCTCCCCCATACTCCCCATACCCCCACACTCCCCCACACTCCCCCACACTCCCCATACTCCCTATACTCTTCCCCATACTCCCCCATACTCCCCCACACCCCCCCAAACTCCCCCCATACCTCCTCCCCATACTCCCTCACACCCCCACAATCCCCACACACTCCCCCACACTCCCCCACACTCCCCCATACTCCCCACACTCCCCCACACTCCCCCATACTCCCCCACACTTCCATACCCCCAACCTCCCCATACTCCCCCACATTCCCCATACTCCCCATACTCCCCTAAACTCCCCCATACTCCTCCTCCCCCCACACTCCCCACACTCCCCCCACACTCCCCCAAACTCCCCCATACTCCTCCCCCCATACTCCCCATACTCCCCCACACTCCCTTCATACTCCCCCAACCTCCCCATACTCCCCACATTCCCCATACTCTCATACTCCCCAAAACTCCCCATACTCTCCCCCATACTCCCCATACTCCCCACACTCCCCCACACTCTTATACTCTCTGCACTCCCCATACCCCTAACCTCCCCCCATACTCCCCACATTCCCCCTATACTCCCCCATACTCCCCCAAACTCCCCATACTCCTCCCCTATACTCCCCATACTCCCCCACACTCCCACACTCCCCCCATACTTCCCCACACACTCCCCCATACTCCCCCAACCTCCCCATACTCCCCACATTCCCCCATACTCCCCATACTCCCTAAACTCTCTATACTCTTCCCCATACTCCCCATACTCCCCCACACTCCCCCCACACTCCCCCATACTCCCCCAAACTCCCCCCATACCCTCCTCCCCATACTCCCCCATACTCCCCCACACTCCCCCCAAACTCCCCATACTCCTCCCTCACTCCCCCATACTCCCCAACCTCCCCCATACTCCCCCACATTCCCCCATACTCCCCCATACTCCCCCAAACTCCCCCATACTCCCTCCCCATACTCCCCACACTCCCCCACACTCCCCCAAACTCCCC
ATACCCTCCCCCATACTCCCCACACTCCCCACACTCCCCCTACACTCCCCCATACTCCCCCACACTCCCCACACTCCCCCATACTCCCACACATTCCCCCACACTCCCCAACTCCCATACTCCCCCACATTCCCCAGTTACCCCCCATACTCCCCAAACTCCCCCATACTCCTCCCCCCACACTCCCCCATACTCCCCATACTCGCCCAACCTCCCCCATACTCCCCACACTCCCCCATACTCCCCACAGTCCCCCACACTCCCCCACACTCCCCCCAACCTCCCCCATACCTCCCCATACTCGCCCACACTCGCCCACACCCCCCCCCATACTCCCCCACACTCCCCCATACTCCCCACACCCCCATACTCCCCATACTCCCCCATACT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=877822;STD_quant_start=61.8558;STD_quant_stop=515.837;Kurtosis_quant_start=0.589259;Kurtosis_quant_stop=-0.256238;SVTYPE=INS;RNAMES=24d22080-dc3e-40a2-9ca5-671548228dc4,2ba3fe9c-4032-49da-af3f-d8ec59a55560,53e073b5-1dac-49fb-b5d1-3564047008e6,5f5806db-7c48-4e2a-a314-0a7dcb1ca284,7a8adb88-1a90-4871-ae73-a4211a91e352,b3933a23-f008-40a9-b8fd-16142ec02196,dee43754-3fd9-4b9b-8734-ffac87bb82ae;SUPTYPE=AL,SR;SVLEN=1767;STRANDS=+-;RE=7;REF_strand=5,11;AF=0.304348 GT:DR:DV 0/1:16:7 +chr1 876234 10_1 N CCCACACTCGCCACACTCCCCCACACTCTCACAGCCGCCCACACTCCCCCCACACTCCCCACACTCGCCCACACACTCCCTCACGCTCCCCTATACTCGCCACGCCTCCCCATACGTCACGCTCCCCCACGCCTCCCCCACACTCGCCCACGCTCCCACACTCGCCCACGCCTCCCCACGCTCCCCTGCCACCCCGCCACGCTTCCCCACGCGCCCACACTCCCCACGCTCCCCATACCCGCCCACGCTCCCCCACGCGCCCCACGCTCCCCACGCCTCCCCACACTCTCGCCCACACTCCCCACACTCCCGCCCACACTTCTGACACTCA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=876574;STD_quant_start=111.344;STD_quant_stop=186.734;Kurtosis_quant_start=-0.221175;Kurtosis_quant_stop=2.38407;SVTYPE=INS;RNAMES=0bc44752-5742-4579-adba-8631316c72cb,0dc02ef8-8f74-4162-a5e4-efe931a57738,297b488b-933a-419b-ad51-1ccbc4d4a005,2ba3fe9c-4032-49da-af3f-d8ec59a55560,3b2e373b-12df-4457-9d80-cdf85a4783d2,53e073b5-1dac-49fb-b5d1-3564047008e6,7a8adb88-1a90-4871-ae73-a4211a91e352,88b4b3e8-f5a5-4dc6-b806-09e01d74d202,b3933a23-f008-40a9-b8fd-16142ec02196,dee43754-3fd9-4b9b-8734-ffac87bb82ae,f2061e65-523d-4a76-8217-2ec92f003d78;SUPTYPE=AL;SVLEN=305;STRANDS=+-;RE=6;REF_strand=0,0;AF=1 GT:DR:DV 1/1:0:6 +chr1 876435 30260 N N[chr4:189981026[ . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;STD_quant_start=8.33667;STD_quant_stop=4.22295;Kurtosis_quant_start=-0.21474;Kurtosis_quant_stop=2.25059;SVTYPE=BND;RNAMES=2b803f71-4a7c-40e0-b7fd-7cb14781ccd7,3aad2dc7-a939-4dda-aef3-79752a09e031,41fe3fce-daef-4b91-a138-0a043ec01c1e,a1686d65-694d-419a-87c5-b59aea8ead01,a2d1106c-5908-4a52-bfc4-2d2c9f82f0ab,f7ddba20-5e7d-4bb6-89c5-13113bd7f9e1;SUPTYPE=SR;SVLEN=0;STRANDS=++;RE=6;REF_strand=0,0;AF=1 GT:DR:DV 1/1:0:6 +chr1 878422 12_1 N ]chr3:198124404]N . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;STD_quant_start=2.30217;STD_quant_stop=7.16938;Kurtosis_quant_start=1.44065;Kurtosis_quant_stop=-0.509718;SVTYPE=BND;RNAMES=0933c3d1-d309-47b4-a379-0c40a5d41472,12bbb6c8-1b3a-4f9c-b6d4-583c5de2853b,4eb474a5-759c-4751-bf00-2632e7513a00,5842611b-34f3-4833-90ea-2105b399f00b,74b0cf2b-41fa-41e1-a1b8-ed795e5b69f4,96694515-ad94-44d0-976e-224b88435a94,972a72b9-c511-4f41-86fe-585ad5fd0801,b369b2b2-bad2-4494-9e32-7eecff663621,b6befb03-ee20-4bbc-9848-323e1aa2cb3f,c467bfb0-2ef6-4e90-ad38-b6d6e20f114a,c8929532-4d6d-4b90-83af-495c63b7e4a7,f6684964-716f-4b3b-9235-568583100244;SUPTYPE=SR;SVLEN=0;STRANDS=-+;RE=12;REF_strand=0,0;AF=1 GT:DR:DV 1/1:0:12 +chr1 882644 12_0 AATATATTAGCTATTCTAGACTTTATGCATTTATGTAAAGTTTTCTTTGTTGCACTTTAAGTTCTGTGATACATGGGCAGAGCATGC N . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=882731;STD_quant_start=1.06904;STD_quant_stop=1.06904;Kurtosis_quant_start=0.5;Kurtosis_quant_stop=0.5;SVTYPE=DEL;RNAMES=0933c3d1-d309-47b4-a379-0c40a5d41472,3a5b25c9-aade-4476-8014-39b5642704d8,74b0cf2b-41fa-41e1-a1b8-ed795e5b69f4,b6befb03-ee20-4bbc-9848-323e1aa2cb3f,bb8d3472-1ce4-4b38-8b80-4b84b1701d43,bcb08d1d-3e42-4615-a634-ef3d9480e50f,f6684964-716f-4b3b-9235-568583100244;SUPTYPE=AL;SVLEN=-87;STRANDS=+-;RE=7;REF_strand=5,5;AF=0.411765 GT:DR:DV 0/1:10:7 +chr1 893788 13_0 AAAAAAAAAAAAAAAATATATATATATATATATATAT N . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=893823;STD_quant_start=1.14017;STD_quant_stop=1.41421;Kurtosis_quant_start=-1.64583;Kurtosis_quant_stop=-0.318339;SVTYPE=DEL;RNAMES=00d5acc9-d809-4589-950a-60e1d649dfc7,109e8760-9901-47ff-8a78-d1fedbe48b93,3b534e86-089c-4fd1-b9f3-8d1163c5a507,489cc112-f6ce-4256-96f0-ca3cf41eb59c,53e073b5-1dac-49fb-b5d1-3564047008e6,5c54e581-16b7-46ba-84f6-e11aa4b8d9e0,6b1e6aec-dd79-4ef4-8043-90eb43c7fa2c,9147c917-4393-4322-977b-e6338fd29061,9be9485e-ad2f-41d0-8ac4-f57e3ac93aed,b2e8130d-86be-4002-a5bc-7235556660cc,b56a92bb-ce39-498d-922f-2802050d7204,c818c252-86b6-4110-b421-8c279ec8c231,e1167d89-45c7-47bc-8539-370607a5358c,ebac6587-35c3-4616-a81b-bd1da9a477d7,fefb4062-9f96-4f27-8b0b-162ccea44e01;SUPTYPE=AL;SVLEN=-35;STRANDS=+-;RE=15;REF_strand=10,5;AF=0.5 GT:DR:DV 0/1:15:15 +chr1 904485 14_0 CCGAACGCGGCCGCCTCCTCCTCCGAACGTGGCCTCCTCCGAACGCGGCCGCCTCCTCCTCCGAACGCGGCCGCCTCCTCCTCCGAACGTGGCCTCCTC N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=904584;STD_quant_start=51.4369;STD_quant_stop=33.1021;Kurtosis_quant_start=4.67957;Kurtosis_quant_stop=4.14782;SVTYPE=DEL;RNAMES=19a83ded-da76-4fd1-9d6c-4572155cc51b,3b5bfdad-0924-428c-8284-18d99627090f,3ba8bc4f-69e0-4d8d-b69f-9e2289b06688,415d65ee-1c0b-461c-b310-a38b48e5692a,610ff93f-9e06-4e31-9bf4-70a050355a1a,df47dc10-1b84-4197-970b-c094542544bf,e1167d89-45c7-47bc-8539-370607a5358c,f83b72b2-ba10-4b95-8511-0b082a408426;SUPTYPE=AL;SVLEN=-99;STRANDS=+-;RE=8;REF_strand=4,12;AF=0.333333 GT:DR:DV 0/1:16:8 +chr1 909237 14_1 CCCCGGGCGCACCGTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGTGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGTGGTCGGGGTCAGGCCCCCGGGCGCATCTTTGCTGGTATATGCGGTGGTCGGGGTCAGGCCCCCCGGGCGCACTGTTGCTGGTATATGCGGTGGTCGGGGT N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=909509;STD_quant_start=51.2513;STD_quant_stop=11.9122;Kurtosis_quant_start=-2.0209;Kurtosis_quant_stop=-0.565726;SVTYPE=DEL;RNAMES=19a83ded-da76-4fd1-9d6c-4572155cc51b,3433165a-feff-48f7-91ee-0279d610ddc7,360ed506-7a48-4421-8af0-b64753d62481,3b5bfdad-0924-428c-8284-18d99627090f,3e451da8-35d7-4d31-8657-457c259b5530,5e0336fb-9a37-4ffa-b3c6-2bf76dd82963,610ff93f-9e06-4e31-9bf4-70a050355a1a,6e076f45-b13c-4c3e-a6fd-f5480d2226e5,78b39159-73d3-4d3d-96b9-9deb44c8eb61,78d028b3-4e8e-4f27-9dd1-7ef337811157,7d4c1e6b-566f-40f5-bf7a-424b7c727093,80c8f62a-aa31-4749-b42e-231f2fcf38f8,8c8772c1-9835-4ddf-981d-9deffa314b75,9147c917-4393-4322-977b-e6338fd29061,a19a1b07-5d05-445e-80ee-f7e497c9ecea,aa594e54-c5b8-40a5-833c-58875d3e5999,cfd6cb35-a661-4ea7-95cb-3aa38bfcaa7f,d9016bbe-ab64-4d23-b7a1-776e7e517976,df47dc10-1b84-4197-970b-c094542544bf,f83b72b2-ba10-4b95-8511-0b082a408426;SUPTYPE=AL;SVLEN=-272;STRANDS=+-;RE=20;REF_strand=0,5;AF=0.8 GT:DR:DV 0/1:5:20 +chr1 934091 16_0 GGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAG N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=934910;STD_quant_start=19.3391;STD_quant_stop=21.5105;Kurtosis_quant_start=-0.515907;Kurtosis_quant_stop=-1.15124;SVTYPE=DEL;RNAMES=124b3bf2-e03c-4634-846b-2d530a2dbcb2,1781f8b9-81a6-42d3-ac68-1800883e491e,209cc8c7-9960-4a36-a996-4008bd485e46,242e4f3f-18e4-496c-ba24-6a9ff3a24125,2bebd884-0751-49b7-94e3-bd32bf865963,40bbb350-5375-47cf-944c-61b5f8c9d041,46ca9b09-71f1-4292-af26-cd48f1404028,5977409f-df40-4fe5-b999-a88b4b73634d,61c20644-d612-4241-abba-a6d54e3c1de4,662ce493-1ac9-4aee-b8b4-c43eb1018c81,7026857a-757b-4e15-bbe7-cb3eaed77a8a,77b9f439-e950-4fc8-892b-a63bd3e71bfa,83d4916f-5cb8-4b1b-babb-ecfac63be3b7,9528f7c1-6606-4854-920b-0fabdc84e9cd,96557e7b-5546-4ed2-a367-922cd2900539,b099ef7d-46d6-45d4-890d-91d1ca8501f0,ba913561-f678-4664-abe9-4a2f1d9bce07,bedf1d2f-09c6-4b54-b06b-9dec7430b3f9,d59e7196-e5d4-44da-9cb9-695dee19261b;SUPTYPE=AL,SR;SVLEN=-819;STRANDS=+-;RE=19;REF_strand=12,14;AF=0.422222 GT:DR:DV 0/1:26:19 +chr1 936270 16_1 CCCGGTCCCGCCTCCTAGGGCTCCTGGACGGAGGGGGT N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=936308;STD_quant_start=11.3181;STD_quant_stop=10.8674;Kurtosis_quant_start=0.18586;Kurtosis_quant_stop=0.128029;SVTYPE=DEL;RNAMES=124b3bf2-e03c-4634-846b-2d530a2dbcb2,209cc8c7-9960-4a36-a996-4008bd485e46,242e4f3f-18e4-496c-ba24-6a9ff3a24125,2bebd884-0751-49b7-94e3-bd32bf865963,46ca9b09-71f1-4292-af26-cd48f1404028,47d773c5-943d-4289-b877-2815dc141bad,5977409f-df40-4fe5-b999-a88b4b73634d,61c20644-d612-4241-abba-a6d54e3c1de4,7026857a-757b-4e15-bbe7-cb3eaed77a8a,77b9f439-e950-4fc8-892b-a63bd3e71bfa,83d4916f-5cb8-4b1b-babb-ecfac63be3b7,b099ef7d-46d6-45d4-890d-91d1ca8501f0,ba913561-f678-4664-abe9-4a2f1d9bce07,bedf1d2f-09c6-4b54-b06b-9dec7430b3f9,d59e7196-e5d4-44da-9cb9-695dee19261b;SUPTYPE=AL;SVLEN=-38;STRANDS=+-;RE=15;REF_strand=1,2;AF=0.833333 GT:DR:DV 1/1:3:15 +chr1 964676 18_0 CAGTGGGGATGTGCTGCCGGGAGGGGGGCGCGGGTCCGCAGTGGGGATGTGCTGCCGGGAGGGGGGCGCGGGTCC N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=964770;STD_quant_start=21.6036;STD_quant_stop=23.8911;Kurtosis_quant_start=-2.11167;Kurtosis_quant_stop=-2.08637;SVTYPE=DEL;RNAMES=007f21ba-17b4-46a3-aa4d-081ef24edbbd,0afc74bd-f468-4913-9597-1315c6a8533e,10d02ba9-b64d-4956-bf6f-825cef849b25,13506373-57b1-47eb-b006-787867b085ee,16824a47-9cc6-40fa-b0ed-6e1f55c2cecd,1b9f382f-b4f4-42f8-95ab-b1a1f10f50fb,290e4239-4819-430c-a311-bfd0563349c8,2b549ec4-b7ad-4e8c-b457-c6996e0c759c,39573df9-cf23-40a7-a704-d5ab2dfd5703,430f1b34-3d72-4b36-a77c-cec897e28ca7,4d0451c5-1810-48b4-97af-926032a90afa,52d3f37a-a63a-4756-92da-ac05f339f986,584a8bca-cf13-4c4f-b503-b9e62504fc1c,6af59770-110b-4424-a602-ea3ccb6a7878,788d7471-2b7e-413d-9723-0dfe554f8af8,7b3747dd-59c6-4178-a12e-b55b321d5167,7d3ab225-0534-4dad-a10c-b1e7fb2d238e,88bd6216-b874-4349-9782-8c6da9c98893,9f727bd1-9793-40c0-b319-e35c7200e56e,acc0886f-ee67-4fc8-bb0b-a2c590c1d054,b75e9abf-252a-4b0e-ae59-739b9f66d901,b99a4d9f-e569-413a-86dd-4a943b0d33cd,be9533d0-ad23-4627-9ebe-a1e6d7022a73,bf9ba184-af83-4258-9d40-7057111a23fe,d491d22b-dd30-4ada-8a29-1a8835ce9218,d8373e97-7940-456a-91ee-19616db9be98,d9ae820d-7a7f-4f8f-be5a-19cf08f5577c,daa2cc74-ea58-4f46-8483-176dd9419d26,feb36428-bbec-40fc-a31f-7efee21eff79;SUPTYPE=AL;SVLEN=-94;STRANDS=+-;RE=29;REF_strand=13,11;AF=0.54717 GT:DR:DV 0/1:24:29 +chr1 977229 18_1 GGGAACCGCCTCCCACCACCCCGCCAACCCCGGGAACCGCCTGCCCCCACCGACCAACCCCGGGAACCGCCTCCCACTCCCCCCGCAACCCCG N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=977322;STD_quant_start=11.2101;STD_quant_stop=4.83046;Kurtosis_quant_start=2.61025;Kurtosis_quant_stop=1.51714;SVTYPE=DEL;RNAMES=2b549ec4-b7ad-4e8c-b457-c6996e0c759c,43704193-64c6-43d5-973d-7101b47356bc,7f53067c-50d7-454b-a1e4-7908d7c99b0e,b99a4d9f-e569-413a-86dd-4a943b0d33cd,d11dbfd0-a514-48d5-9f05-966223c8a579,d6cc2ee8-8aba-4960-986f-05f699117e8c;SUPTYPE=AL;SVLEN=-93;STRANDS=+-;RE=6;REF_strand=4,7;AF=0.352941 GT:DR:DV 0/1:11:6 +chr1 988819 20_0 N CAGGTGTTGGAGTTCTGGGTTGATTGTTTCTGGAGTTCAGGGTT . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=988865;STD_quant_start=2.62679;STD_quant_stop=1.61245;Kurtosis_quant_start=-0.549025;Kurtosis_quant_stop=-0.683669;SVTYPE=INS;RNAMES=029e506b-adf5-4deb-a0c6-edbdb42a09f7,12e733b1-6ea2-402b-aebc-2084047b6a62,1893e96e-75ee-4722-a8ef-3dfcb414a8a7,1d3f51af-4cba-46b6-bad7-5f426a10f1ea,42604927-b04c-47d6-939f-2621d9edaf1f,4f3c8ec8-67a1-4ecc-a764-6dc570e2af24,4f6b1562-1987-432b-a6bf-f0efaaecd4db,5353a5d1-4fe7-404f-b4de-eb80ad893f58,638ec215-8161-4df5-92ab-d643aa423e67,7f3be9df-7ed3-4d06-a42e-d1a476654736,92509d04-fc7b-4c4a-8766-a4241623bd40,b62c6c2b-8459-46b5-b0c2-4e178cf260ca,c088d984-e900-4ba1-8cce-080d4928e682,c5b2df39-0e38-478a-9210-6b8b609a12ed,d7955e6b-2206-481a-a262-d8989c555700,ddd90a85-78d0-422a-af35-ae50309bc66c,e99082ef-93e5-40ad-8455-94a80916b1c3,ec37fbf0-be64-41e1-982f-d23b2a266bf2;SUPTYPE=AL;SVLEN=46;STRANDS=+-;RE=18;REF_strand=10,8;AF=0.5 GT:DR:DV 0/1:18:18 +chr1 996295 20_1 N GGGGCCACAGGATGCGGGGTGGGGAGGGTGAAGAGCCCCCGCGGGAAGGGGGCACCCCACATCTGG . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=996360;STD_quant_start=26.7245;STD_quant_stop=37.4793;Kurtosis_quant_start=0.970638;Kurtosis_quant_stop=0.790126;SVTYPE=INS;RNAMES=0915ef75-f739-4301-96af-ebceac688c4c,1893e96e-75ee-4722-a8ef-3dfcb414a8a7,28d1e658-9bae-4858-a742-061855b9278a,6333df9d-145c-4aa8-9028-f3f9ccc06b81,b62c6c2b-8459-46b5-b0c2-4e178cf260ca,d6f79315-e46b-497e-b827-767ac6cf1545,db2806e1-917f-45ee-a8b8-25636c4a5378,e99082ef-93e5-40ad-8455-94a80916b1c3,f07b32d9-c7f7-4680-a840-f666b6cf7729,fe68c184-2a97-4d11-9114-1df0a66767bf;SUPTYPE=AL;SVLEN=63;STRANDS=+-;RE=10;REF_strand=8,7;AF=0.4 GT:DR:DV 0/1:15:10 +chr1 998781 20_2 N GGGCGCGGAGCCAGAGGGGAGGGGCGCGGGCGGAG . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=998815;STD_quant_start=11.393;STD_quant_stop=13.914;Kurtosis_quant_start=-1.1487;Kurtosis_quant_stop=-1.85367;SVTYPE=INS;RNAMES=0915ef75-f739-4301-96af-ebceac688c4c,0eac73c2-c883-4445-b0cc-009930fc72a1,1893e96e-75ee-4722-a8ef-3dfcb414a8a7,399e5085-05e9-4c7b-9914-335284af39eb,444b7e2a-3c4a-43ac-8b7b-e6c31efde0a2,6333df9d-145c-4aa8-9028-f3f9ccc06b81,8dcd62ff-861a-4c89-ad36-6a8517077d79,b14e68a3-8227-437a-803e-b29758052bc7,b4eb0a05-9685-4301-8b20-242f525b1ab9,b62c6c2b-8459-46b5-b0c2-4e178cf260ca,cd30eb7e-a992-468f-af66-0557742c3fea,e99082ef-93e5-40ad-8455-94a80916b1c3,f07b32d9-c7f7-4680-a840-f666b6cf7729,fe68c184-2a97-4d11-9114-1df0a66767bf;SUPTYPE=AL;SVLEN=49;STRANDS=+-;RE=14;REF_strand=5,3;AF=0.636364 GT:DR:DV 0/1:8:14 +chr1 1029388 23_0 N CAGTGTCTGTACGCGGGCAGGTGGGGGGGACATC . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1029422;STD_quant_start=17.6716;STD_quant_stop=19.5631;Kurtosis_quant_start=-0.450147;Kurtosis_quant_stop=0.145801;SVTYPE=INS;RNAMES=0dbbc1b4-43d9-4326-bd7b-cc802c9a2b4c,10e9b032-4bcc-4068-b5b1-1724a0b5795a,138eebd4-774a-4f46-8938-81e94e0af82d,2479e4b6-a85b-443b-9e27-cd05ed5e955e,2ab3f0c1-cb90-4d95-b552-11036cc90f15,4e2eef40-74dc-4389-90aa-056e20d4c94a,a34f86bd-7e61-443f-a57a-2024d132a2d5;SUPTYPE=AL;SVLEN=31;STRANDS=+-;RE=7;REF_strand=22,13;AF=0.166667 GT:DR:DV 0/0:35:7 +chr1 1030889 23_1 TGTGTGTGTGTGCAGTGCATGGTGCTGTGAGATCAGCG N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1030927;STD_quant_start=21.095;STD_quant_stop=18.0305;Kurtosis_quant_start=0.568655;Kurtosis_quant_stop=1.76382;SVTYPE=DEL;RNAMES=025d71ad-b816-465c-85c7-852cc50e8f01,067562c3-ce9e-4e07-8b38-b04c2694a162,0dbbc1b4-43d9-4326-bd7b-cc802c9a2b4c,0f0ef869-42bb-4cf8-a626-d91c8bb04fe1,10e9b032-4bcc-4068-b5b1-1724a0b5795a,2479e4b6-a85b-443b-9e27-cd05ed5e955e,33077168-17c4-49b2-9e40-4908688eab22,4e2eef40-74dc-4389-90aa-056e20d4c94a,5ac7bfbe-f94b-4ccc-ad13-05785a0fa5d4,97876ae0-e67d-4a5d-ad62-dcfb2b317ec2,a34f86bd-7e61-443f-a57a-2024d132a2d5,b1f44b33-f202-460b-ab97-16997ba51e0b,de31c769-d318-481c-82fa-9bacc0ec7a3a;SUPTYPE=AL;SVLEN=-38;STRANDS=+-;RE=13;REF_strand=3,5;AF=0.619048 GT:DR:DV 0/1:8:13 +chr1 1041780 23_2 TGCCAGGGTCGAGGTGGGCGGCTCCCCCGGGGGAGGGCTGCGGC N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1041825;STD_quant_start=12.9264;STD_quant_stop=14.7741;Kurtosis_quant_start=-1.54283;Kurtosis_quant_stop=-1.637;SVTYPE=DEL;RNAMES=07160a2f-abd0-4716-a7ae-ec72c443f87c,0dbbc1b4-43d9-4326-bd7b-cc802c9a2b4c,10e9b032-4bcc-4068-b5b1-1724a0b5795a,1b91e1fa-f320-4299-b4e9-4efcd5b3d8e1,23b5e5aa-f1f1-499a-a8a7-fef5b4df5432,35a03981-f54c-4e50-a133-1dfa1a201bbb,52e9d7df-a0e8-4e5b-9c63-93205fb3c801,59745b2a-056c-4e54-a844-c9ee0ce88256,664b6dd2-e6c1-4a97-b0ea-8e8d344b3f76,751f8a9c-2b84-4175-8133-ea28806910cb,796b71e9-2116-47b5-ac6a-50ee08cb9331,79ad12ce-dd22-4a58-b850-2357f13acfee,97876ae0-e67d-4a5d-ad62-dcfb2b317ec2,97d20bbd-53e0-4018-acf0-db05251300ec,9ff5026e-72e2-497d-8630-e3bc7293038b,b1f44b33-f202-460b-ab97-16997ba51e0b,ba433a82-16c3-438d-b25f-703eeebfe478,bd2f2b5c-c504-41e4-9525-b435c4564f47,c80ed0a5-8989-4fbd-9d7b-50e259ae7b10,e04f050f-0ccf-4a65-80aa-72be947daae1,ee88b583-c87d-43c4-8c0f-0d476bacec62,fae31778-33ce-4698-8e4f-4446a886e78a;SUPTYPE=AL;SVLEN=-45;STRANDS=+-;RE=22;REF_strand=19,9;AF=0.44 GT:DR:DV 0/1:28:22 +chr1 1068767 26_0 N TGCGGCAGGGCGGGGCCACGCGGGCTGTGAAGGTGCAGGTGCGGCGGGGCGGGGCCACGCGGGCTGTGCAGATCTA . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1068844;STD_quant_start=30.133;STD_quant_stop=23.4825;Kurtosis_quant_start=-0.543543;Kurtosis_quant_stop=0.420942;SVTYPE=INS;RNAMES=3613b9c9-bc84-4fe5-b334-fbe6cfefb0b3,4d613eaa-c75b-427b-bcfa-d2a36e9752e1,67c7025c-1529-4b62-8002-5226234cfe92,a85f6095-9dc2-4f1e-bf02-4a687064798b,e127f38a-6eea-4e8f-8f9c-ace633135257,e21dedd9-7626-4e45-bf24-cf0e7656d95d,eef7a5b9-dd4d-4a41-9cf1-6ba86ffd8ccf;SUPTYPE=AL;SVLEN=76;STRANDS=+-;RE=7;REF_strand=7,10;AF=0.291667 GT:DR:DV 0/0:17:7 +chr1 1076286 26_1 GGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGCTGGGAGGCTGAGGCTATGGGGACTCCGTCGGGGGAGGCTGAGTCTATGGGGACTCCGTGGGGGGAGGCTGAGGCTATGGGGACTCCGTGCCGGGAGGCTGAGGCTATGGGGACTCCGTGGGGCAGGCTGAGGCTATGGTGACTCCGTGCAGGGCTGTGAGGCTACGGGGACTCCGTGGGGGGTGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGGGGGGAGGCTGAGGCTACGGGGACTCCGTGG N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1076752;STD_quant_start=268.831;STD_quant_stop=235.392;Kurtosis_quant_start=4.8416;Kurtosis_quant_stop=0.720321;SVTYPE=DEL;RNAMES=4d613eaa-c75b-427b-bcfa-d2a36e9752e1,505f2694-d3a3-4eea-a0d6-30d0458e700c,67c7025c-1529-4b62-8002-5226234cfe92,69e95039-a35c-430e-a899-7f82938a75e9,705d9f28-3f68-4b5d-8324-0dc43b348b30,a5dc9ee6-2b37-434f-b073-36d2c59b4036,a85f6095-9dc2-4f1e-bf02-4a687064798b,b0bfdf33-8a60-40d5-9119-96ed2e3158e9,e21dedd9-7626-4e45-bf24-cf0e7656d95d,eef7a5b9-dd4d-4a41-9cf1-6ba86ffd8ccf;SUPTYPE=AL;SVLEN=-466;STRANDS=+-;RE=10;REF_strand=9,6;AF=0.4 GT:DR:DV 0/1:15:10 +chr1 1077148 28 GAGGGGTGGCTGAGTCTATGGGGACTCCGTGCGGGGAGGCTGAGTCTATGGGGACTCCGTGCGGGGTGGCTGAGGCTATGGGGACTCCGTGCCGGGAGGCTGAGGCTATGGGGACTCCGTGCCGGGAGGCTGAGTCTATGGGGACTCCGTGCCGGGAGGCTGAGTCTATGGGGACTCTGTGCCGGGAGGCTGAGGCTACGGGGACTCCG N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1077357;STD_quant_start=102.569;STD_quant_stop=91.527;Kurtosis_quant_start=-0.624606;Kurtosis_quant_stop=1.477;SVTYPE=DEL;RNAMES=212133cc-17f8-4547-b7c0-71bf328b78b6,6e23a346-ccd3-4e0a-83da-19ac7bf5d569,705d9f28-3f68-4b5d-8324-0dc43b348b30,76c26f65-ea8f-4f9c-8c9d-74c46b6f68ef,93978c11-26de-45b7-b3be-f04dba46a851;SUPTYPE=AL;SVLEN=-209;STRANDS=+-;RE=5;REF_strand=0,0;AF=1 GT:DR:DV 1/1:0:5 +chr1 1080846 26_2 N CCCCCTCGTCCCTATCTCCTTCCCTCCCGCCCCACCTCGGTCCCTGTCTCCTTCCCTCCCGCCCCCACCTCGGTCCCATCTCCTTCCCTCCGCCCCCACCTCGGTCTGTCCTTCCCTCCGCCCCCACCTCGGGTCCCCTGTCTCCTTCCTCCGCCCCCCACCTCGGTCCCTGTCTCTCTTCCCTCCGCCCCCACCTCGGTCCCTGTCTCCTTCCCTCCGCCCCCACCTCGGTCCCTGTCTCCTTCCCTCCGCC . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1081030;STD_quant_start=89.569;STD_quant_stop=98.2558;Kurtosis_quant_start=-1.80857;Kurtosis_quant_stop=-1.09275;SVTYPE=INS;RNAMES=440b87b8-c6ec-442a-af75-dfa89e00edf8,4424b8d4-4051-48fa-be56-d6919c8a0dc7,4d613eaa-c75b-427b-bcfa-d2a36e9752e1,65a7caae-cab8-4f0c-875f-28cbf39e1a0e,76c26f65-ea8f-4f9c-8c9d-74c46b6f68ef,86084906-62ee-4dd3-b4a4-47006f5cbd0c,9d9069f8-b384-4f99-8dad-c399744fca4d,a5dc9ee6-2b37-434f-b073-36d2c59b4036,a6cacd3b-9403-486b-87fc-b66025e3de6b,a745741f-6c0e-48c7-96ae-398a2f5a2dd1,a85f6095-9dc2-4f1e-bf02-4a687064798b,c7c22436-4967-4980-857e-d4105fd50e88,e21dedd9-7626-4e45-bf24-cf0e7656d95d;SUPTYPE=AL;SVLEN=83;STRANDS=+-;RE=13;REF_strand=7,6;AF=0.5 GT:DR:DV 0/1:13:13 +chr1 1139864 30_0 GCTTCCTGAGGTCAGAAGGTGGGGGTGTCAGCATCGAACCGGGGGACCTGGGTCCTGGGGAGCTTCCTGGGGTCAGAAGGTGGGGGTGTCAGCATCGAACCGGGGGACCTGGGTCCTGGGGAGCTTCCTGGGTTCAGAAGGTGGGGGTGTCAGCATCGAACCGGGGGACCTGGGTCCTGGGGAGCTTCCTGGGGTCAGAAGGTGGGGGTGTCAGCATCGAACCGGGGGACCTGGGTCCTGGGGAGCTTCCTGGGGTCAGAAGGTGGGGGTGTCAGCATCGAACCGGGGGACCTGGGTCATGGGGAGCTTCCTGGGGTCAGAAGGTGGGGGTGTCAACGTCGAACCGGGGGGCCTGGGTCCTGGGG N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1140256;STD_quant_start=134.584;STD_quant_stop=128.357;Kurtosis_quant_start=-1.78526;Kurtosis_quant_stop=-1.77303;SVTYPE=DEL;RNAMES=03579faf-d5cd-44d0-bb48-de23461ec62a,0e6102ec-395f-44b3-a0af-91bb5151dafb,20ee4b65-a9e4-4e77-b144-17ee965f41d1,2be9761a-6073-4af9-a552-d3d45d98e6bb,30402e66-d8d8-49c6-a718-a6c263f7a348,426ef750-0308-48cf-927e-8386cb02b5ee,867072c4-7e47-47d9-bf1f-b1056ac2ff08,95eb0ff6-823f-408e-924b-e401abeb5176,96a778dd-8bff-43f5-a535-f23595d8a1e4,97ccf349-50cd-40d3-aad9-abe54542a246,a48ccc5b-fe15-4ff0-84d5-8e72fc9270c2,acdc8e01-1f35-4f4c-99cc-dab2db17fe7b,af81c555-08ce-4b39-970c-731d4345afae,b03b7b95-fa87-4c5b-95af-169634b315ae,ca39052e-b26a-4d0b-bb09-e1b0f886fff7,e162dc0b-3b10-4a23-ad43-04c1b770f511,edb790fc-a9a8-41e5-9330-733d40b6dd4a,f8620b5e-2fe8-41db-bf86-f25be5003c44;SUPTYPE=AL;SVLEN=-392;STRANDS=+-;RE=18;REF_strand=13,11;AF=0.428571 GT:DR:DV 0/1:24:18 +chr1 1140472 30_1 GGTCCTGGGGAGCTTCCTGGGTTCAGAAGGTGGGGGTGTCAGCATCGAACCGGGGAACCTGGGTCCTGGGGAGCTTCCTGGGGTCAGAAGGTGGGGGTGTCAACGTCGAACCGGGGGACCT N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1140613;STD_quant_start=49.8849;STD_quant_stop=25.1515;Kurtosis_quant_start=-1.2216;Kurtosis_quant_stop=-0.707103;SVTYPE=DEL;RNAMES=0e6102ec-395f-44b3-a0af-91bb5151dafb,1744c00c-5a62-44aa-8a0f-6686d8fd0c2c,20ee4b65-a9e4-4e77-b144-17ee965f41d1,27bec66e-3470-4d6a-a1a0-bdf2e9f76aa7,30402e66-d8d8-49c6-a718-a6c263f7a348,426ef750-0308-48cf-927e-8386cb02b5ee,43236ba2-f7bc-4926-9518-f80bf9379dec,69ac7be4-859a-490c-aa1d-710e74c87805,867072c4-7e47-47d9-bf1f-b1056ac2ff08,95eb0ff6-823f-408e-924b-e401abeb5176,96a778dd-8bff-43f5-a535-f23595d8a1e4,a9f6b127-756a-46c4-b05b-36fe658bcdc2,ca39052e-b26a-4d0b-bb09-e1b0f886fff7,dce3bf36-c24f-4f92-8a66-2a28546aa380,e162dc0b-3b10-4a23-ad43-04c1b770f511,edb790fc-a9a8-41e5-9330-733d40b6dd4a;SUPTYPE=AL;SVLEN=-141;STRANDS=+-;RE=16;REF_strand=0,0;AF=1 GT:DR:DV 1/1:0:16 +chr1 1141363 30_2 N CATCCACCTTAAAAATCCACAACCCCATCCTTACCTCTATCCCCACCACATCCTTACCA . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1141434;STD_quant_start=5.99444;STD_quant_stop=7.48777;Kurtosis_quant_start=-0.908378;Kurtosis_quant_stop=-1.01514;SVTYPE=INS;RNAMES=03579faf-d5cd-44d0-bb48-de23461ec62a,059316d4-869c-4aa5-b5cb-89219c0e8cf7,0e6102ec-395f-44b3-a0af-91bb5151dafb,1744c00c-5a62-44aa-8a0f-6686d8fd0c2c,20ee4b65-a9e4-4e77-b144-17ee965f41d1,240ad04f-3a6d-4091-a021-14b0d5c5852e,27bec66e-3470-4d6a-a1a0-bdf2e9f76aa7,2be9761a-6073-4af9-a552-d3d45d98e6bb,2e886ee0-ba8e-44d2-870b-aeb6fe4d42dc,30402e66-d8d8-49c6-a718-a6c263f7a348,34f6a8b7-dee3-471e-8d7c-1d82c2af6fce,3efd9d8b-152e-4717-ac25-67a387888f95,426ef750-0308-48cf-927e-8386cb02b5ee,43236ba2-f7bc-4926-9518-f80bf9379dec,5814c0bd-ae36-442f-baab-cda55d24b2bb,69ac7be4-859a-490c-aa1d-710e74c87805,867072c4-7e47-47d9-bf1f-b1056ac2ff08,93ce268e-dc9c-4025-b06e-67ab7d5bca98,95eb0ff6-823f-408e-924b-e401abeb5176,96a778dd-8bff-43f5-a535-f23595d8a1e4,97ccf349-50cd-40d3-aad9-abe54542a246,a48ccc5b-fe15-4ff0-84d5-8e72fc9270c2,a9f6b127-756a-46c4-b05b-36fe658bcdc2,acdc8e01-1f35-4f4c-99cc-da
b2db17fe7b,af81c555-08ce-4b39-970c-731d4345afae,b03b7b95-fa87-4c5b-95af-169634b315ae,ca39052e-b26a-4d0b-bb09-e1b0f886fff7,dce3bf36-c24f-4f92-8a66-2a28546aa380,e162dc0b-3b10-4a23-ad43-04c1b770f511,edb790fc-a9a8-41e5-9330-733d40b6dd4a;SUPTYPE=AL;SVLEN=57;STRANDS=+-;RE=30;REF_strand=3,1;AF=0.882353 GT:DR:DV 1/1:4:30 +chr1 1157295 33 GCCCACCCATCCCGCCCCCAGCCCACCCATCCCATCCCC N . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1157334;STD_quant_start=3.24037;STD_quant_stop=2.46982;Kurtosis_quant_start=0.152647;Kurtosis_quant_stop=1.82248;SVTYPE=DEL;RNAMES=1cb1934d-1fd3-494b-bf4b-ae8f9cf41d90,6738bc4d-79ff-4d32-8fe0-faeae3af727e,7c2f604f-d8aa-473c-81e9-9a90eb796bbe,8778275f-b6a5-436b-8397-8d2d7e021575,8aa2352e-5d62-40fe-9317-aa7c2f53bf41,8ef36b02-ba48-419c-beed-613ee47ba1cf,a395395c-4e15-486a-b806-e344f3f55174,c4687b3b-680d-4be8-b9f9-d5265f5a0964,dcc3d03e-894a-4ee9-addb-ea3bf13ae874,eb25737e-1e54-43ae-9847-00c581fb18cd,fe3e2ffe-81a9-4348-92c5-960666a6a0a2;SUPTYPE=AL;SVLEN=-39;STRANDS=+-;RE=11;REF_strand=11,13;AF=0.314286 GT:DR:DV 0/1:24:11 +chr1 1184798 34_0 N GGAGCTAGCCTCCAGGCAACTGTGCCCCCAGTTCATGCAGAAGCTCCTCTTAGTTAGGGGGATGTTCCCTCTTGGGGATCCCTCATGAGGACAGGTCTCCTGGACAGCTTCCGGGAGCCAGTCTCCAGGGCAACCGTGCCCCCAGTTCATGCAGAAGTTTCTAGGTTAGGGGATGTTCTCTTGGGGGACCTCTGCAGGAGGACGAGCCCTCCGACAGTCTGGGAGCCAGTCCCAGGCACCGTGTGCCCCCAGTTCATGCAGAAGCCTTCTAGGTTAGGGGGATGTTCTTCTTGGGACCCCCGTGAGGACAGGCCTCCGGACAGCCC . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1185052;STD_quant_start=69.5507;STD_quant_stop=31.3783;Kurtosis_quant_start=-1.85297;Kurtosis_quant_stop=-0.741594;SVTYPE=INS;RNAMES=277e1ae1-0168-457f-927c-4cdd7bb99461,29a280f3-67bb-467d-88bc-3b8ab020891c,3136fe80-1a33-41be-b56c-78a92eae8d90,35bfe6d5-d6f0-463d-aac8-a977a63158e2,454a61df-2858-4bc0-bac1-85e3b73fe7e0,49ac4bf5-faae-4ba9-af5b-de6fc0f24626,51bba82b-43e2-4996-831c-beb240efcd26,60f02903-b561-4003-9ff2-04dcae4ba733,62a0f631-8725-4dbf-a7dc-119d44e4b0cf,8102c2a1-7b8e-40a3-ad9a-87abf4ea1dda,85b18239-9e8e-4ab5-9642-ce22386937be,8a38435f-7873-41f0-8b55-e81b11347e5e,8b2e4a49-5031-41c4-88ed-fa12579cc05a,b002a3cb-36f2-46b4-bfbb-4d4ba9f178f0,bd6b4530-715c-461c-a245-5e6bec500fd4,ee0f5472-a5f9-4c75-b6c5-ea35f92b1742;SUPTYPE=AL;SVLEN=309;STRANDS=+-;RE=15;REF_strand=4,12;AF=0.483871 GT:DR:DV 0/1:16:15 +chr1 1192679 34_1 N CAGCAACACTCCAGTTGGTAAGGAGGTAGACACTCCAGTTGATTTGGTAGACACCCAGTTGGTATGAGTGCAGACACTCCGTTCAGACG . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1192770;STD_quant_start=34.6569;STD_quant_stop=33.5604;Kurtosis_quant_start=-1.06469;Kurtosis_quant_stop=-1.07101;SVTYPE=INS;RNAMES=11919ebe-07ef-4563-a58e-7e8852e40c62,29a280f3-67bb-467d-88bc-3b8ab020891c,430451ae-8c86-45ec-9e02-204b85126a03,4bc42b23-3c32-4cf4-a56d-76d7ee9d460a,51bba82b-43e2-4996-831c-beb240efcd26,62a0f631-8725-4dbf-a7dc-119d44e4b0cf,8102c2a1-7b8e-40a3-ad9a-87abf4ea1dda,85b18239-9e8e-4ab5-9642-ce22386937be,8a38435f-7873-41f0-8b55-e81b11347e5e,90410340-c9ec-4f3c-9f05-83aeaa50aea4,9ae6c2a6-0912-44e3-bc02-2c4088899257,9c49056c-60c3-4694-a9f2-1d24903ebaf6,a8000c57-5c8f-41b9-88ce-7a7c8e029e58,a861fb94-46de-474e-bd54-9b9b6967ee64,aace44f8-b9b9-440c-86d9-846e5972c063,bd6b4530-715c-461c-a245-5e6bec500fd4,c89afb67-abb2-41ab-bede-4ec6864e51af,fc640450-f292-4db5-a8cc-cda19822e131;SUPTYPE=AL;SVLEN=95;STRANDS=+-;RE=18;REF_strand=3,11;AF=0.5625 GT:DR:DV 0/1:14:18 +chr1 1195863 35_1 N 
ACTCCTGAGCTCAAGCGATCCTCCTGCCTCAGCCTCCCAAAGTGCTGGGACTACAGGTGTGAGCCACGTGCCCGACTAACTTTGTGTATTTCTAGTAGAGATGGGGTCTCACCATGTTGGCCAGGCTGGTCTCAAACTCCTGAGCTCAAGCGATCCTCCTGCCTCAGCCTCCCAAAGTGCTGGGACTACAGGTGTGAGCCATGCGCCCGACCAATTTGTGTATTTTTAGTAGAGATGGGGTCTCACCATGTTGGCCAGGCTGGTCTCAA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1196134;STD_quant_start=10.8904;STD_quant_stop=9.59166;Kurtosis_quant_start=0.14018;Kurtosis_quant_stop=0.571278;SVTYPE=INS;RNAMES=00a96eba-04ea-4c0c-be77-943a5ddcbb3e,08988e9c-750f-4f68-aafe-2e5dc576b27e,11919ebe-07ef-4563-a58e-7e8852e40c62,309ad33e-4564-479c-beb3-51ed8d00faf8,430451ae-8c86-45ec-9e02-204b85126a03,4bc42b23-3c32-4cf4-a56d-76d7ee9d460a,50b989c6-4808-4fc5-9c7a-9d4356562d73,7bf4a6f9-c730-4436-909d-0e84b3b3d360,8102c2a1-7b8e-40a3-ad9a-87abf4ea1dda,9ae6c2a6-0912-44e3-bc02-2c4088899257,9c49056c-60c3-4694-a9f2-1d24903ebaf6,a8000c57-5c8f-41b9-88ce-7a7c8e029e58,aace44f8-b9b9-440c-86d9-846e5972c063,b4b6d8a2-0f6b-4d4d-bdbc-5a480dce1460,bfcefdd8-6767-449a-98d3-18de2c092438,fc640450-f292-4db5-a8cc-cda19822e131;SUPTYPE=AL;SVLEN=267;STRANDS=+-;RE=16;REF_strand=2,2;AF=0.8 GT:DR:DV 0/1:4:16 +chr1 1202531 36_1 N AGCCCAGTACAGCCAGGCCAGTAACCCAGTCTCCAGCCCAGTACCCAACCCCCGAGGCCCAGTACCCATCCCGGGCCCAGTACGGCCAG . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1202642;STD_quant_start=40.7689;STD_quant_stop=41.1278;Kurtosis_quant_start=-1.10253;Kurtosis_quant_stop=-1.13284;SVTYPE=INS;RNAMES=1adb49a2-289d-45e5-bc0b-3673e037af90,2df4f476-58f5-4eb4-a57f-71f911ddb69f,3b1716ef-9033-4f29-aeee-9b124b922b17,6d83fc2c-dd1e-442b-9850-57e3d6995045,7bf4a6f9-c730-4436-909d-0e84b3b3d360,95b9accd-2c2f-431f-99e3-eb6495dd7f8e,a8000c57-5c8f-41b9-88ce-7a7c8e029e58,ac886eb9-d238-4e1e-b71d-afc052b0e108,b4b6d8a2-0f6b-4d4d-bdbc-5a480dce1460,babe9803-f544-4f38-8e57-73141b7fc495,c7db4fb1-544c-42d8-b7aa-6bf08e24ea24,c9ad1d7d-d31f-4863-b739-176e8ecdb97d,cc9ffd44-267e-45e9-b1e2-632c01537746,f4f62c5e-60a8-4ba5-af8e-bc8262002045,f7d86242-2de8-4d7d-a21d-a9dfb95cf47f;SUPTYPE=AL;SVLEN=135;STRANDS=+-;RE=15;REF_strand=5,5;AF=0.6 GT:DR:DV 0/1:10:15 +chr1 1212606 37_1 N CTGTGTCTCCTCCCAGCCCCTGGCCCTCTGCTCCCTG . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1212643;STD_quant_start=9.40744;STD_quant_stop=8.43801;Kurtosis_quant_start=-0.556909;Kurtosis_quant_stop=-1.21037;SVTYPE=INS;RNAMES=04062431-13ec-4e44-ab66-5a1b0f33c5fe,26efab64-d240-43c4-8a69-2dddbb7442b0,2d0c7e92-46b6-4b6a-b2f3-33de1bda9d4a,333dc1ea-8db5-4688-baa6-4a7269baa8a4,8ee6f2fb-956b-4569-959a-3a774dc2119b,994d504b-d7de-46f8-887e-f979cce5b46e,a84ea522-82cc-4147-8fc1-d1cda00b2e15,c7db4fb1-544c-42d8-b7aa-6bf08e24ea24,dbc43eb6-558c-4a1d-923d-50df791325e7,f7d86242-2de8-4d7d-a21d-a9dfb95cf47f;SUPTYPE=AL;SVLEN=32;STRANDS=+-;RE=10;REF_strand=5,6;AF=0.47619 GT:DR:DV 0/1:11:10 +chr1 1225741 39_0 CACACACTCCACATGCCACAGACACGGGCCA N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1225772;STD_quant_start=12.8776;STD_quant_stop=12.5366;Kurtosis_quant_start=-0.436071;Kurtosis_quant_stop=-0.820559;SVTYPE=DEL;RNAMES=1b9aba61-1a2b-43f2-9eff-6ff778de4a59,39425bb9-5854-41f9-b741-082aa63d0cf4,45562285-7d3c-45a4-87fc-36ca744e483b,a2cec193-29f2-4d60-8aa9-0467e37c5f4f,a6bd9111-655a-4da8-a72d-7f469d9b22e6,b756ca77-feee-479e-b35a-628f46eafc0f;SUPTYPE=AL;SVLEN=-31;STRANDS=+-;RE=6;REF_strand=9,8;AF=0.26087 GT:DR:DV 0/0:17:6 +chr1 1226338 39_1 GTACGGTCAGGAGGAAACATGGCACCTCCCCTCTGGGGGCTCTTTCCAGAAACCCTCAACCC N . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1226400;STD_quant_start=3.34664;STD_quant_stop=2.89827;Kurtosis_quant_start=-0.781543;Kurtosis_quant_stop=-0.761724;SVTYPE=DEL;RNAMES=008e889f-e959-4777-95e1-f0f53a4334dd,00be4721-5fce-43ac-8334-ba4546a49991,080d6acc-03a7-4cba-a86d-742a26c144de,0fa1f5d5-11c3-4de1-98cc-f9d60c32c056,1b9aba61-1a2b-43f2-9eff-6ff778de4a59,39425bb9-5854-41f9-b741-082aa63d0cf4,857efaff-3674-4ef8-89fe-f9638f609115,93c5ef6d-b54c-4808-acf2-05c00acdc5cc,93cfcbe8-1573-4033-a833-27cbdd8ec529,994d504b-d7de-46f8-887e-f979cce5b46e,a2cec193-29f2-4d60-8aa9-0467e37c5f4f,a6bd9111-655a-4da8-a72d-7f469d9b22e6,b756ca77-feee-479e-b35a-628f46eafc0f,c6c2851d-1031-45d1-95aa-486eb563efa5,d5fa6b3d-485a-4e4f-8efc-1f6c74c6802a,eff9bc97-f12b-43ec-86a8-2aabe4538a97;SUPTYPE=AL;SVLEN=-62;STRANDS=+-;RE=16;REF_strand=0,1;AF=0.941176 GT:DR:DV 1/1:1:16 +chr1 1227295 39_2 GGAAGGCGAGCTCGTGGCCAGGCCCTGCGGGAAGGCGAGCTCGTGGCCAGGCCCGGCGGGAAGGCGAGCTCGTGGCCAGGCCCGGCGGGAAGGCGAGCTCGTGGCCAGGCCCGGCGGGAAGGCGAGCTCGTGGCCAGGCCCTGCGGGAAGGCGAGCTCGTGGCCAGGCCCT N . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1227466;STD_quant_start=3.72827;STD_quant_stop=2.32379;Kurtosis_quant_start=-0.966443;Kurtosis_quant_stop=-0.297101;SVTYPE=DEL;RNAMES=008e889f-e959-4777-95e1-f0f53a4334dd,00be4721-5fce-43ac-8334-ba4546a49991,080d6acc-03a7-4cba-a86d-742a26c144de,1b9aba61-1a2b-43f2-9eff-6ff778de4a59,39425bb9-5854-41f9-b741-082aa63d0cf4,4c94aa0d-60a3-40ca-b22e-cc7a3bef0bfe,857efaff-3674-4ef8-89fe-f9638f609115,93c5ef6d-b54c-4808-acf2-05c00acdc5cc,93cfcbe8-1573-4033-a833-27cbdd8ec529,994d504b-d7de-46f8-887e-f979cce5b46e,a2cec193-29f2-4d60-8aa9-0467e37c5f4f,a6bd9111-655a-4da8-a72d-7f469d9b22e6,b756ca77-feee-479e-b35a-628f46eafc0f,c6c2851d-1031-45d1-95aa-486eb563efa5,d5fa6b3d-485a-4e4f-8efc-1f6c74c6802a,e38a0a27-360a-443d-a1f9-623653437f99,e3c28301-b82b-445e-8e8f-e452cb05525d,f66db954-420d-4eb9-8ff8-ad2bd3709180,f98168e1-e2ea-4e02-9b61-8bc06c077c08;SUPTYPE=AL;SVLEN=-171;STRANDS=+-;RE=19;REF_strand=1,4;AF=0.791667 GT:DR:DV 0/1:5:19 +chr1 1240679 41_1 N CAGCCCTTCGCCTCGCCCCCATTCACCCCGGCCGTGGTCCCCGCCG . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1240721;STD_quant_start=9.65919;STD_quant_stop=8.89382;Kurtosis_quant_start=-1.4823;Kurtosis_quant_stop=-1.52921;SVTYPE=INS;RNAMES=09729ccf-3196-4cae-9ebe-4faaa862e66e,0d7665dd-6576-42b7-a8c1-27bd68a9d49c,194e0f94-996f-4a42-ab07-2facd258dd40,1a7dab12-4454-4bed-b41b-2d6946aac30b,2f2a2f48-af67-4fbc-ba75-94f0aa75156c,8be9430e-48c3-4bcb-877c-45fd3777346b,a4a727eb-971e-46da-8003-6ed814f3532f,a9d6e62e-975c-4494-b03f-2f3297fb5338,bf321fd3-95ca-4cff-abe6-9a298c318f0f,cf376f94-eacf-4013-89ca-707dab183b9e,d600852f-8fa2-40ea-9339-07a7cd033a84,e38a0a27-360a-443d-a1f9-623653437f99,ebf79d47-5397-4b0c-8100-c1d91f9ebfbe,f98168e1-e2ea-4e02-9b61-8bc06c077c08,f9f30360-e8ec-480e-9803-34b7ded4ae8b;SUPTYPE=AL;SVLEN=41;STRANDS=+-;RE=15;REF_strand=4,8;AF=0.555556 GT:DR:DV 0/1:12:15 +chr1 1245142 42_1 N CCCACCTCCCCCACTCATCTCCCTCTCCCCACTCCTCTGCCCTCCCTCCCTTCCCCCTCCTCCCCCACTCCCTTCCCTCTTCCCCCGACTCCTTCCCCCTACTCATCTCCCTCCTCCCCCACTCCCTCTCCCTCCTCTCCCACTCCTCCCCCCTCCTCCCCACTCCTCCCCACTGCTCTCCCTCTTCCCCCCCCACTCCTCCCCACTCCTCCCTCCTTCTCCACTCCTCTCCCCTCCCACTCCCCTCCCCCACTCCTGTCCCCTCCCTCCCCCCCCTCTTCCCCCTCCCTCCCCCACTCATCTCCCTCCTCCCACTCCCTCTCTCCCCTTTCCTCCCACTCCCCCCACTCCTCCCCCACTCCTCTCCCCTCTTGCTCCACTCCTCCCCCCCACTCCTTCCCCACTCTCCTCTCTCTCTCCCCCCACTCCTCC . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1245551;STD_quant_start=23.8328;STD_quant_stop=45.7821;Kurtosis_quant_start=-0.017098;Kurtosis_quant_stop=-0.158047;SVTYPE=INS;RNAMES=07c11880-546b-4465-a6c4-0b239950bb6d,09729ccf-3196-4cae-9ebe-4faaa862e66e,1a7dab12-4454-4bed-b41b-2d6946aac30b,2f2a2f48-af67-4fbc-ba75-94f0aa75156c,317e4f04-e8e2-47a8-9c05-ef94c9a7ebd4,66736e57-ef09-42ad-8f74-beb60ddd937c,9a22573e-7dbd-417a-a38d-eeaf6301029b,a9d6e62e-975c-4494-b03f-2f3297fb5338,afe6a776-b0e5-4d00-9289-bb67aafd12c1,d15d462f-e582-4923-ac5d-a8d5295903a6,eb6918ae-e91b-4b8e-99ca-2aeef6556276,f9f30360-e8ec-480e-9803-34b7ded4ae8b,fbc2109b-27cd-451e-90a1-19586aaf4a1a;SUPTYPE=AL;SVLEN=420;STRANDS=+-;RE=13;REF_strand=3,9;AF=0.52 GT:DR:DV 0/1:12:13 +chr1 1248881 42_2 ACGGGCAGCCCTGGGAGGCTGGAGCGAGGGAGGCTGGGCCTCCCACTCCGCCCTACAGGCCGGGACACGGGCAGCCCTGGGAGGCTGGAGCGAGGGAGGCTGGGCCTCCCACTCCGCCCTACAGGCCGGGACACGGGCAGCCCTGGGAGGCTGGAGCGAGGGAGGCTGGGCCTCCCACTCCGCCCTACAGGCCGGGACACGGGCAGCCCTGGGAGGCTGGACCGAGGGAGGCTGGGCCTCCCACTCCGCCCTACAGGCCGGGACACGGGCAGCCCTGGGAGGCTGGACCGAGGGAGGCTGGGCCTCCCACTCCGCCCTACAGGCCGGGAC N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1249211;STD_quant_start=22.8053;STD_quant_stop=21.8403;Kurtosis_quant_start=-1.69581;Kurtosis_quant_stop=-1.68758;SVTYPE=DEL;RNAMES=037003f8-b2b7-4e67-ab0a-e7b297be8a5d,09729ccf-3196-4cae-9ebe-4faaa862e66e,1a7dab12-4454-4bed-b41b-2d6946aac30b,2016fd8d-7da2-46f5-96f9-2a2b3c15f2fa,2f2a2f48-af67-4fbc-ba75-94f0aa75156c,317e4f04-e8e2-47a8-9c05-ef94c9a7ebd4,4cdbb972-6e35-4c1a-81b5-47a0e61cf0c1,66736e57-ef09-42ad-8f74-beb60ddd937c,83ddf732-bfb5-4e42-8042-160d0881c90e,901358de-fdfe-451c-81ab-16ae8872aa65,9a22573e-7dbd-417a-a38d-eeaf6301029b,9f9ef125-570b-4a75-b83a-beb47feef6d6,a9118b80-3582-4c37-8439-e4ee6e2820ba,a9d6e62e-975c-4494-b03f-2f3297fb5338,ac195749-76e0-41fd-a703-e392ee7a4426,afe6a776-b0e5-4d00-9289-bb67aafd12c1,b166d473-f790-4559-b720-9ad3025ad640,bc8c5a51-d98e-4f23-b1c8-1a311a65a568,c221a396-2145-44d3-9355-789c193d1c96,cf376f94-eacf-4013-89ca-707dab183b9e,d15d462f-e582-4923-ac5d-a8d5295903a6,d6107ee4-f1a4-4cba-b91e-68f4dd2812a4,e9fd2d30-9f04-4e0c-99ce-a984301a6235,eb6918ae-e91b-4b8e-99ca-2aeef6556276,f5588eb1-c65b-4885-8ac8-32b6c5b5c475;SUPTYPE=AL;SVLEN=-330;STRANDS=+-;RE=25;REF_strand=4,4;AF=0.757576 GT:DR:DV 0/1:8:25 +chr1 1284190 45 N GGGGTGTTGGTGAGGGGTTGGGGTTGGGTGAGGGGGTGGGGTGGGGTTGGGTGAGGGGGTGGGGGTTGGGTGAGGGGGGTGGGGTCGGGGTTGGAGTGAGGTGGGCGCCACAGGCAAAGCCAGCAGGGTGGGGGCTGGGTGAGGGTGGGGCAAGGGCAGGGGCTGGGGCTAAGTGAGGGGGTGGGGTTGGGGTGAGGGGGTGTGGGGGCCGGTGAGGGGGTGGGGGGTTGGAGGAGGGGGTGGGGTGTTGGGTGAGGGGTTGGGGTTGGGTGAGGGGGTGGGGGGTTGGGTGAGGGGGTGGGGTGTTGGGTGAGGGGTTGGGGTTGGGTGAGGGGGGTGGGGGGTTGGTGAGGGGGTGGGGGTGGGGGTTGGAGGAGGGGGCTGGGGGCTGGGGTGAGGGGGTGTTGGGTGAGGGGTTGGGGTTGCGCAGGTGTGTGTGTGGCTGGGGGTGAGGGGGTGGGGGTGGGGTTGGAGGAGGGGAAGGTTGGGGGGTTGGGTGAGGGGGTGGGGGTGGGTGTTGGGTGAGGGGTTGGGGGGTTGGGGTGAGGGGGTGGGGCTGGGGGTGAGGGTGGCAGGGGGCTGGGGGAGGGGG . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1284799;STD_quant_start=22.926;STD_quant_stop=86.9787;Kurtosis_quant_start=-0.693689;Kurtosis_quant_stop=-1.20235;SVTYPE=INS;RNAMES=0c6ea7a6-172e-42cc-897a-13d95d62bc45,3b2ce036-07ef-4213-949c-d0377d2950a5,42a6610e-7aeb-416d-bbef-1cef0d386717,47282d13-ea62-41a6-a0bd-92cedb34cb3f,8cd6c891-3e8b-4f83-a190-abd1f0321ad2,98eba6a7-8652-4240-a7bb-79a06683518a,bc037eb7-a08b-4888-9ec8-b5b5c886615e,c8ae8560-ca59-4cf5-846e-be8a2746f03b,cfb0bf08-d8bd-4f88-b3ce-22ee1fb0567b,d41ae278-2dde-4c65-81d5-6728386c7ea1,d905968a-b77b-4b8c-9ddc-7ec356b5148b,e02988fa-b749-4ba1-98c7-69cde5026997,ef9795fd-1558-4c0f-9a07-3ffc80f58234,f090615a-b5ec-4a2f-96ab-094ac8219fee;SUPTYPE=AL;SVLEN=599;STRANDS=+-;RE=14;REF_strand=4,10;AF=0.5 GT:DR:DV 0/1:14:14 +chr1 1324159 46 N TATCGAGCCCGTGGCCAAATGAGGCTTGAGGCAAATTTCAAAAACACTTGCCCCAT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1324218;STD_quant_start=18.1576;STD_quant_stop=31.3401;Kurtosis_quant_start=0.83085;Kurtosis_quant_stop=-0.226595;SVTYPE=INS;RNAMES=0554bafb-13c8-4360-afde-ed31ba1dc2bf,1283b68b-cce2-4adf-897e-bb538321787a,1aaa67ca-6c19-476f-83b6-067ccdf62a30,30d14799-fb8b-4caf-bc0c-f49a1ab3e01d,3fd47162-bcf1-4282-9e93-c2b97119ea48,801df1d1-72f7-41ea-88bc-23f3beabe09f,80961a72-f599-48fd-89f8-bcda39a7b522,ae7e2c7d-9539-4ef8-87c2-149fcbcd929c,b63cfa8a-2ef1-4598-9677-9dfbddf4961d,bde6cca3-0f44-4abc-b9b1-3dc3b39093e6,c46748b8-a101-4214-8b60-8dcdd901052d,d4ff9d85-36b6-41b0-ab88-a56a12e0fcfb,e7bb299d-38db-4c6f-b60b-3ceb9efe73e3,f6b6760c-e9eb-4aff-8e43-dedb9698d437;SUPTYPE=AL;SVLEN=55;STRANDS=+-;RE=14;REF_strand=5,8;AF=0.518519 GT:DR:DV 0/1:13:14 +chr1 1339902 47_0 N CCCCCAGCAGCCCCCACAGACTCCACCCACAGCCGCATGTCCCCCAGCAGCCCCCGCAGACCCACCCGCAGCCGCATGTC . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1339971;STD_quant_start=11.7516;STD_quant_stop=11.9164;Kurtosis_quant_start=-0.816288;Kurtosis_quant_stop=-1.33387;SVTYPE=INS;RNAMES=04da80bc-81b5-4cdc-8ffe-7f1bec4f6472,10156bf4-1468-45e1-80e5-c9d29b6c9ce5,262a5294-1b83-4afb-a9a8-4a00e9d3cca3,298e0884-c687-4f03-85ea-722fcd4e4c8a,4819a1ff-f986-49b2-bcdb-227fca169690,500dac03-7a9b-4739-8335-a0b096e79808,66fb9e46-bfd0-43ba-9836-481feb3ccb0c,6daf23b8-9c71-4508-a570-9c76ec1e6415,7ad61263-7c14-4778-9ac4-e5a8bcb01870,80961a72-f599-48fd-89f8-bcda39a7b522,850cec68-5eb5-41e0-b940-dbfc8ae45fbc,a50d14a7-5e42-40b5-9c92-a5e88265cf14,c380b278-00d8-4313-9249-ef02a0ecfd0b,cdca1690-e8b9-4e03-b9b3-df4333ff650b,d366963c-1b8d-4554-9e4a-2986f8337291,d90af666-03d9-44e4-bd39-13cc4b1cd95c,deeabf69-eddb-4515-9169-048cc1ab5e8a,fb253266-0c10-43f1-92f5-11c1601107f5;SUPTYPE=AL;SVLEN=75;STRANDS=+-;RE=18;REF_strand=5,10;AF=0.545455 GT:DR:DV 0/1:15:18 +chr1 1350096 47_1 GGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCTGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACCGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGTGAGGAGGGGGCCTGGACGGGGCAGGAGCGACGGGGGGAGT N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1351176;STD_quant_start=21.5198;STD_quant_stop=28.674;Kurtosis_quant_start=1.78891;Kurtosis_quant_stop=2.40552;SVTYPE=DEL;RNAMES=2d5ac90d-887b-406e-8d4f-3942594b3bf3,32e9e238-02d6-487c-a9be-8bf9bd99270a,3c99af7e-584a-448b-81dd-0c39c927200d,3d4d0474-244a-4618-8f28-e383b161918a,3e429703-76cf-4f53-aa9d-03325d98056c,3f950823-a8a6-433d-b805-d3bceb1a6e0f,45d3a422-62d7-4592-8cc4-3aff359a06ef,500dac03-7a9b-4739-8335-a0b096e79808,550a7bb2-4116-43ff-806a-24fbbe31653c,61090925-ea4d-4b09-850b-cf04c0566601,63c74d33-7e97-45f9-a1d6-a589aa2f74da,850cec68-5eb5-41e0-b940-dbfc8ae45fbc,a50d14a7-5e42-40b5-9c92-a5e88265cf14,d50cb4a7-ea94-44a4-bc0d-34c1e54743ea,d781296c-fbd6-4b69-a189-19ceb599cf93,ef7451df-782a-4d52-b4bd-8cdb75b048f7,f9882952-a02e-4100-ba78-54f16d02075a;SUPTYPE=AL,SR;SVLEN=-1080;STRANDS=+-;RE=17;REF_strand=6,8;AF=0.548387 GT:DR:DV 0/1:14:17 +chr1 1382491 49_0 N CACCCCTTCCCAACAATACAGTAACAATCCAGAGGCCACCACCCCTTCCCAATCCAGTAACAATCCGGT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1382627;STD_quant_start=54.9363;STD_quant_stop=78.4895;Kurtosis_quant_start=0.239206;Kurtosis_quant_stop=-1.35672;SVTYPE=INS;RNAMES=031dab0a-580f-4be2-a665-8873f4fe00b5,18ed9029-212b-4761-935b-e24e3f2335a0,2425f835-9cd7-4965-891a-445d5c82100e,47b4bc54-5717-42c7-8a3a-74c006870b50,51645d41-cfe8-4d0c-8cf4-3370c85e8d8f,5fa055e8-9694-47fb-bfa3-026a05270ceb,7ae4b36e-e16f-48be-b025-b65c61c10f22,ace3a3d8-cfe5-4fa4-a6fa-87bab1cc8974,c9a408b4-e1d5-4ea4-89e2-1be3c480b771,d2c003a3-5b0f-4938-8c0b-30c582fd859b,eb9380e9-db69-4c1c-b09e-498537795866,f470deff-8d76-4561-819f-6e0a15bffd71;SUPTYPE=AL;SVLEN=112;STRANDS=+-;RE=11;REF_strand=12,8;AF=0.354839 GT:DR:DV 0/1:20:11 +chr1 1382671 49_1 N 
AACAATCCACTAACAATCCAGAGGTCACCACCTTTGTGGCCAGTAACAATCCAGAGGCCACCACCCCTTTAACAATCCAGTGATCCAGGTCACACCCTTTTCCAACAATCCACTAATCCAGAGGTCACCACCCCAACAATCCTAACAATCCAGAGGTCACCACCCCTTCCCAACAATCCAGTAACAATCCAGAGGTACCACCCCTTCAACAATCCAGTAACAGTCCAGAAGTGCCACGCTGCAACAATCACTAAACAATGAAGGTCACCACCTTAACAATCACTAACAATCAGAGGTCACCACCCCTTCCCAACAATCCAGTAACAATCCAGGTGCCACCCCCTTCCCAACAATCCAGTAACAATCCAGAGGTCACCACCAGCACCAACAATCCACTAACAATCCAGAGGTCACCACCCCTTCCCAACAATCCAGTAACAATCCCGAAGTCACCGCACCACTTCAACGACCAGTAACAATCCAGGGTTACCACCCTTC . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1383328;STD_quant_start=48.5716;STD_quant_stop=219.294;Kurtosis_quant_start=-0.873336;Kurtosis_quant_stop=-0.73496;SVTYPE=INS;RNAMES=031dab0a-580f-4be2-a665-8873f4fe00b5,1fa65f63-cc9c-47d5-ae79-edc5ad0ea2a5,47b4bc54-5717-42c7-8a3a-74c006870b50,51645d41-cfe8-4d0c-8cf4-3370c85e8d8f,7005bf86-94d0-477c-9c5b-6621cac52dcc,7511864c-ceb2-41e0-990a-47e3a512333c,7ae4b36e-e16f-48be-b025-b65c61c10f22,9da165d9-cd92-4f66-838c-3ce69f156e84,ace3a3d8-cfe5-4fa4-a6fa-87bab1cc8974,ba9672e5-1031-4619-95e7-b569294cdb48,f470deff-8d76-4561-819f-6e0a15bffd71;SUPTYPE=AL;SVLEN=653;STRANDS=+-;RE=11;REF_strand=0,0;AF=1 GT:DR:DV 1/1:0:11 +chr1 1382773 49_1 N GCTAACAATCCAGTAACAATCCAGAGGCCACCACCCTTCCCAACAATCCAGTAACAATCCAGAGGTACCACCCCCTTCCCAACAATCCAGTAACAATCAAGGCCACCACCCCTTCCCAACAATCCAGTAACAATCCAAGAGGACACCACCCTTCCCAACAATCCACTAGCAATCCAGAGGCCACCACCCCGCTTCCCAGCAATCTGACAACGACCCAGAGGCCACCACCCCTTCCCCAACAATCAGTAACAATCCAGAGGTCTTACCCTTCCCAACAATCCAGTAACAATCCCGAGGTGCACCACCCCTTCCCAACAATCCAGTAACAATCCAAGAGGTCACCACCCCTTCCCAACAATCCAGTAACAATCCGGTCACACCCCTTTTCAACAATCTCCTGGTAACAATCCGATT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1383281;STD_quant_start=93.8962;STD_quant_stop=287.586;Kurtosis_quant_start=-0.319993;Kurtosis_quant_stop=1.42519;SVTYPE=INS;RNAMES=18ed9029-212b-4761-935b-e24e3f2335a0,5fa055e8-9694-47fb-bfa3-026a05270ceb,8785c74a-f492-4719-9121-079a6417e94b,9da165d9-cd92-4f66-838c-3ce69f156e84,a439ed56-3edf-44d9-bf00-d97b318e76cb,ab32879e-eb33-4a1a-8487-f33420ef77a6,b225fd26-a014-46f0-9d67-69bc928cc6bf,b527a53b-3e90-4329-8a95-741663307174,d2c003a3-5b0f-4938-8c0b-30c582fd859b,eb9380e9-db69-4c1c-b09e-498537795866;SUPTYPE=AL;SVLEN=576;STRANDS=+-;RE=10;REF_strand=1,0;AF=0.909091 GT:DR:DV 1/1:1:10 +chr1 1477854 52 N TTTTAGTAGAGACGGGGTTTCTCCATGTTGGTCAGGCTGGTCTCTAACTCCCGACCTCAGGTGATCCACCCGCCTCGGCCTCTCAAGCTGTTGGGATTACAGGCATGTGCCACCACGCCTGGCTAATGTTGTAT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1478020;STD_quant_start=12.7243;STD_quant_stop=24.7809;Kurtosis_quant_start=-1.6989;Kurtosis_quant_stop=-1.97912;SVTYPE=INS;RNAMES=07b60109-cdef-4cd2-abca-2ea1363e0c91,0e05fbc5-5ee7-4ad1-942d-b164cadb94d3,2924c668-d524-45ac-bead-4d285f81b619,2f7309fb-cddd-4875-a0f3-f840e3e40213,59110e1d-3156-4eac-a45f-715787273ff4,67a300aa-1927-4adc-9ee6-ed3534ccfba1,6a36cbf8-7d4b-4691-b122-131c5af687af,6bded483-378a-458d-9f76-78e4f172d4e1,6decd45c-0d98-4323-be81-152e5e0fd97f,88dbeec9-ccb8-4466-b504-1276b26e437a,972d07da-7206-475c-bb67-d9ec6eadf48f,a04994e3-af48-4d81-9393-3b909f1eb329,a205b141-4498-48c2-8dd6-4a9a0d41de87,aabe76bd-9eb5-4a00-a7ce-80624ed3249b,b0c224b4-f321-4396-a5d6-b4532e6055a0,bdd1fd54-de1f-48db-bc04-0f44fa780b44,d1c4824a-20f7-4c6a-94fc-8631e56a0e5f,d2809436-8fce-4737-93f9-c4d6365fd29b,dd8aa318-13f5-47f6-b942-a15cfa44ed9f,ee7926cb-fdb1-44f8-9088-8c9564bd9901,f0da4292-174c-4644-b7a7-99656f6c0b0a,ffbc26e5-d2c2-47de-84db-a2dc513802ba;SUPTYPE=AL;SVLEN=133;STRANDS=+-;RE=22;REF_strand=12,10;AF=0.5 GT:DR:DV 0/1:22:22 +chr1 1546874 53 N 
GCCGGGCACGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCATCTGAGGTCAGGAGTTTGAGACCAGCCTGGCCAACATGGAGAAACCCTGTCTCTACC . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1546993;STD_quant_start=10.127;STD_quant_stop=14.476;Kurtosis_quant_start=4.51638;Kurtosis_quant_stop=4.89892;SVTYPE=INS;RNAMES=09299259-d83c-4aa5-82bd-f56be5ce9644,405fa48d-a656-4ed0-8b6f-62eb733f543c,42fe0d36-e9c9-40ab-9ba9-230f92cf317b,6a51b1c9-4ed2-4fb0-8938-3715299a7bae,7077c5e7-6290-494b-b083-95ab4d5ff81d,764610e4-2563-4d90-b950-d39c00d405f7,9af8a08f-f027-4782-975c-709f3691fd59,a3444d11-9c79-4a51-a077-576c8b8cc347,ea688935-8eef-4635-9532-06a1ac559daa;SUPTYPE=AL;SVLEN=119;STRANDS=+-;RE=9;REF_strand=9,7;AF=0.36 GT:DR:DV 0/1:16:9 +chr1 1565675 54 N CCTGTGTGGTGCAGGGCAGAGAACAGGACGTCGCATGGGCCCGACGGTGCTGGCTCCATGGGAACCGAGACCCAACACTCAAAGGAGACA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1565765;STD_quant_start=30.6359;STD_quant_stop=29.5296;Kurtosis_quant_start=-1.01354;Kurtosis_quant_stop=-1.11149;SVTYPE=INS;RNAMES=25cac45a-d879-4485-95f3-033b3d563fcc,6986875f-0079-43f4-b9e3-cd3aff393a0c,ac287bd2-b6ac-47b8-81ad-5a57790ee05b,c94b73ad-6aa2-4f25-adae-67029234b312,caf29d05-d9a4-4ffe-b87a-52f61e34df88,cf636c5d-4d04-4716-8915-4374376612e6,db5aff64-6919-4022-a47f-193efee55adb,e79af9a6-2b0f-42ae-b7dd-afb3ae25f7bd,ed481cd5-296a-4124-bb1d-4eede5da8b48;SUPTYPE=AL;SVLEN=98;STRANDS=+-;RE=9;REF_strand=11,7;AF=0.333333 GT:DR:DV 0/1:18:9 +chr1 1605539 55 AGGTGGGGTGGTGAGGTGAGGGGTTGTCTGGTCAGGTGT N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1605578;STD_quant_start=20.1742;STD_quant_stop=6.72947;Kurtosis_quant_start=3.80595;Kurtosis_quant_stop=1.69901;SVTYPE=DEL;RNAMES=25a79ac4-2527-42c8-a92f-4413cf2d79a7,4532170c-09d3-4a4d-bb82-6bf9a0285295,4625b6d8-c84a-451d-b1c0-8b69ffe7d14e,58febb71-3f6d-4ae4-ab92-a274f854d1ab,69944d07-10fc-4685-ae49-0b1abf934484,b5d1648f-e2b7-429c-a313-5097fa483adf,c9971d2d-63f1-495c-bd4f-b6b89a24ec55;SUPTYPE=AL;SVLEN=-39;STRANDS=+-;RE=7;REF_strand=15,7;AF=0.241379 GT:DR:DV 0/0:22:7 +chr1 1651424 56 CCCCTCTGAACGGTCTGTGACACACGCATGCTTTCAGCTAGAGTACTCTATAGCC N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1651479;STD_quant_start=14.9963;STD_quant_stop=13.3083;Kurtosis_quant_start=1.5721;Kurtosis_quant_stop=1.07756;SVTYPE=DEL;RNAMES=3e560e4f-c1b7-4a35-9414-eb142c3b7371,5fa2de07-06ec-45b5-8fea-d8e40c4e87c4,80ba1e79-0b04-4fe7-a303-b445b8542da8,8323da44-1b1e-44a6-bc51-e8f2ef737de0,86f27843-e02f-42ab-92cf-d2bf6aa778af,997bd44b-d3a6-463b-8b23-f666f2fe9020,aabdff84-36d9-49c2-b616-d4735837347f,c25206e5-b872-4b86-8f9f-65694bb54114,ce9f1e56-86b8-4126-8e10-aab2b1eb7714;SUPTYPE=AL;SVLEN=-55;STRANDS=+-;RE=9;REF_strand=10,11;AF=0.3 GT:DR:DV 0/0:21:9 +chr1 1666974 57 CACGCCTGTAATCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATCACTTCAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCAAACCAGAGAAATCCAGCTCTGGGTGACAGAGCAAGACTCTGTTTCGGGAAAAATAAAATACATAGGCAGGGCGCGGTGGCT N . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1667141;STD_quant_start=0;STD_quant_stop=0;Kurtosis_quant_start=-0.819463;Kurtosis_quant_stop=0.270047;SVTYPE=DEL;RNAMES=060d6b86-2e66-4bbc-9a7c-9415526fa59b,072bb3a8-d200-4713-93c2-402b748b423e,075bc3cd-b1fc-4aa1-ba75-68943deea3c6,10b7c597-ef63-402c-9097-f3376630e3c4,14c39ed9-2b60-45b4-a156-ac1b0d908e95,2a1516d4-c826-4977-b218-2e1cd8604862,31afceef-4ee2-4387-9ac2-fd2e68aa0aca,35a63615-be4e-4c08-aa01-82f619f97040,3fbb21ab-76a1-4150-8514-b6a0469f9b15,4edd050e-a954-4530-a877-d6f971f06b68,51a68670-f0ec-4dcb-8724-f4cb2aa00601,615e378f-c130-4a00-9bf0-abd8e121548f,667e1d2b-597e-4aab-8a12-afcff720acf5,80ba1e79-0b04-4fe7-a303-b445b8542da8,8442a8ac-c73a-4f46-b2f5-30b585ad4616,8464a7a3-7aac-4df0-879c-fbd179ec8f2d,93501dc1-8719-427c-a534-5213d8388a95,9e14698f-da46-4339-bb7e-f2e110546fd4,b4a6b25c-ca71-4120-9dcb-5d60e5a23d1a,c51126bb-e48b-40ce-96a6-1a4301d59c71,d9d4337e-9156-4143-9353-42d114ccb8f7,e8098d58-79d8-4078-aa30-0d00a54368d8;SUPTYPE=AL;SVLEN=-167;STRANDS=+-;RE=22;REF_strand=7,12;AF=0.536585 GT:DR:DV 0/1:19:22 +chr1 1717623 58_0 TCTCTCTGGTTTTCGGTCTGTGACACACGCATGCTTTCAGCTAGAGTTTG N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1717673;STD_quant_start=11.201;STD_quant_stop=11.5226;Kurtosis_quant_start=-2.1147;Kurtosis_quant_stop=-2.02344;SVTYPE=DEL;RNAMES=06f856d1-5b17-42aa-9d6b-f21c66498038,0c812675-f9c3-4cc2-b4bf-5e1522b413f7,15f0fc0a-5a10-4b82-a5dc-d24da2596044,2dedee60-2b41-417c-ba5e-0a4643d1e88e,2ecf0dc8-2605-42cd-a772-c9daa1ef189d,35dbb28b-1fab-4d4e-a6e4-224dcd7d5ece,44af996b-4fd6-40fa-a745-74450571919f,4a860044-14a8-4664-9846-b4c79cc1d378,5c1563fd-0a6f-45b2-90cb-1766cc4005aa,5e2b3871-32ff-43b5-9d9c-347908496c13,630856f7-d053-4cf3-a4f4-1ee922a7d71b,6f5bbe6e-5829-4055-b877-9bb70b07481f,87115b33-5081-4912-8489-633290dcac77,8d802d4a-8303-46ad-a184-e69ef9423609,95eb2d56-d6f7-4d0c-bb3a-55fc40088a7b,988e5d2a-032a-47a9-ac76-91ca9bce9a5e,9bb4d606-0331-4508-bbd8-9539a1bfc5af,a7cb795d-1d7d-4660-9999-e9287d3e3793,a89a4a91-6936-448e-bce6-8fbf748a97b8,ac3c2093-1fae-425c-a1f1-746d9562a403,b49ad046-fa38-4191-86db-753a9a202f44,c7ea695f-a0d3-43d0-9a12-3f7ae05af65b,c92a3f69-30ad-4d7a-8a7a-865c2b8817ad,cacb7c9c-efc3-4873-8d2a-2bcf4d08d93e,dcc35244-21cb-4b79-9826-5846971fdd0a,e20d17db-5b1c-447b-a1cc-322875c886f4,fb6a3418-9ee8-46cf-886e-8e50e7743a19;SUPTYPE=AL;SVLEN=-50;STRANDS=+-;RE=27;REF_strand=10,17;AF=0.5 GT:DR:DV 0/1:27:27 +chr1 1719634 58_1 N TTTTTTTTTTTTTTTTTTTTTAGACTCGCTCTGTCACCC . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1719670;STD_quant_start=0;STD_quant_stop=3.06594;Kurtosis_quant_start=nan;Kurtosis_quant_stop=-1.18244;SVTYPE=INS;RNAMES=0c812675-f9c3-4cc2-b4bf-5e1522b413f7,15f0fc0a-5a10-4b82-a5dc-d24da2596044,2ecf0dc8-2605-42cd-a772-c9daa1ef189d,a89a4a91-6936-448e-bce6-8fbf748a97b8,c92a3f69-30ad-4d7a-8a7a-865c2b8817ad;SUPTYPE=AL;SVLEN=36;STRANDS=+-;RE=5;REF_strand=2,1;AF=0.625 GT:DR:DV 0/1:3:5 +chr1 1749608 60 N CGTACATATTTTTTCTGTGTGATACGTGTGTGTGTGTGTCCTCAGTAATT . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1749658;STD_quant_start=5.15752;STD_quant_stop=4.69042;Kurtosis_quant_start=-1.52873;Kurtosis_quant_stop=-1.51005;SVTYPE=INS;RNAMES=069e1174-a90c-4a8a-b9be-dabb89b4f617,0d26ef2f-f950-4838-b0cd-2653c1ba9321,1dce2fd9-4045-47b9-ba8b-06dd52f2cabe,507e6a6b-87e9-4a0d-a837-9f91679a8d0e,50d64c3e-602f-4f8e-afdc-75c12855505f,76ad039b-f625-4578-ac35-3f536b735d06,794c41e1-4459-4017-9203-d77cc61b11ba,8a853631-8dd9-488a-92dd-714e0379f76f,8b335610-8201-44d1-9ae4-8744df3d006e,8bd5465a-56cc-4113-90e6-02441057a170,94d7beb9-6b85-49f6-aea2-250625cb0a0a,a642d74f-8330-43bf-b3f5-47b511d7cefb,d28a37eb-cbd4-4e36-a6a9-db5e5304af79,d412e8ee-9386-43f2-81c9-fd71136a8258,d74db67f-e377-4c86-b559-29839d3cc68e,e1435545-3a30-4d63-add9-f8192bf2c8b1,e2550f3f-e93c-4725-bf15-9e889c3c44be,f4197f06-21c9-4510-ae09-9a76159e43ab,fbc25fb5-5bbe-41d0-8819-3f71b5b42d6b;SUPTYPE=AL;SVLEN=48;STRANDS=+-;RE=19;REF_strand=6,10;AF=0.542857 GT:DR:DV 0/1:16:19 +chr1 1929384 61_0 N GAGGGGACAGGTCTGGGGAAGGCTAGGAGAGAGAGGTGAGGGGGGAGGCAGGGGAGATGTTGAGGGGGAGGGAGGGGAGAGGGTGGGGGAGGGAGGGGGGAGAGAGGGGGTAAGGGGGAGGAGGGAGGGGGAGAGGGGTAGGGAGGAGGAGAGGAGGAGGGAGAGGGTAGGGAGGGAGAGGAGGAAGAAGGGGAGGCGCTTGGGGAGGGAGGGAAGGAAGAGGGAGGGAGGGGAGAGGGAGGAGGGAGCAGGTGGGGGAGGGAAGGAGGGGGAAATGGTATGGGGGAGGGAGGGAGGGGGAGAGAGGGTAGGGAGAGAAGGGGGAGCAAGAGGGAAAGGGTAGGGGGAGGAAGGAAGGGAGAGGGTAGGGGAGGTAGGGAGGAGGAGGGTAGGGGAGGGGAGAGGGGTAGGGGAGGGAGAGGGCAGGGGGAGGGAAGGGGAGGGAGACGGTAGAGGGAGGGAGGGAGGAGAGGGAGTAGGGGAGGAGAGGAGGGGAGAGGGTAGGAGGGGAAGGGGGAGGGAGGAGAGGAGGGGAGGGAGGGGGAGGGAGGGGAGAGGGTAGGGAGGGAGGGGGAGGAGGGAAGAGGGTAGGGAGGGAGGGAGAGGAGAGGGGAGGAGGGAGGGAGGGAGGGAGAGGGTGGGGAGGAGGGAGGAGGTTAGGGAGGGAGGGAGAGGAGGGGAGAGGGTAGGGGGAGGGAAGGAGGGAGAGGGTAGGGGAGGGAGGAGGGGGAGAGGGGAGGGGGGAGGGAGGAAGAGGAGGGAGAGGGTAGGGGAGGAGGGAGAGGAGAGGGAGGGGGAGGGGAGGAGGGAGAGGGTAGGGAGGGAGGGAGGAGGAGGGGAGGGAGTGGGGAGGGAGGGAGAGGAGAGGGGAGGGAGGAGGGGGAGGAGGGAGAGGGTAGGGAGGGAGGGAGAGGAGGAGGGAGGGGGAGGGGAGGAGGAGAGGGTAGGGAGGGAGGGAGAGAGGAGGGAGGGAGGGGGCAGGAGGGAGAGGGGTAGGGAGGAGGCAGG
AGGGGAGAGGGTAGGGAGGGAGGGCAGGAGGGAGAGAGGGTAGGGAGGGAGGGGAGGAAGGAGAGAGGTAGGGAGGGAGGGAGGGGGAGAGGGTAGGGAGGGAGGGAGAGAGGAGGGGA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1930417;STD_quant_start=26.8721;STD_quant_stop=60.9526;Kurtosis_quant_start=-0.068928;Kurtosis_quant_stop=-0.877909;SVTYPE=INS;RNAMES=1f7ba560-1978-466a-8153-158e01dfe601,5a22fc3b-c4e9-4a52-9230-fcf16cee6cc1,8b6629f2-2a9c-4eaf-976d-d798aea4837e,90fb6a6c-97f6-487a-b61f-ae5f107333fe,94f3e08b-29e7-463f-9ea1-6a4d37f1b66a,a26dd940-0f20-4de8-966d-a3965fe00286,aa773c47-82ee-4b6b-80f6-38ea90908a12,ad7c3f25-2dba-44cf-ba68-3129cdf177cf,ce88f9c2-7d06-4543-aff6-4318dcd0117d;SUPTYPE=AL;SVLEN=1008;STRANDS=+-;RE=9;REF_strand=4,5;AF=0.5 GT:DR:DV 0/1:9:9 +chr1 1935132 61_1 GTACACACGTGTGTATGTGTGTTGCTGTGGGTACACACGTGTGTACGTGGGTGTTAGGCTGTGGGTACACACGTGTGTACGTGGGTGTTAGGTTGTGGGTACACAGGTGTGTACGTGGGTGTTAGGTTGTAGGTACACACGTGTGTACGTGGGTGTTAGGTTGTAGGTACACACGTGTGTACGTGGGTGTTAGGTTGTGGGTACACACGTGTACGTGGGTGTTGTAGGTACACACGTGTGTACGTGGGTGTTAGGCTGTAAGTACACACGTGTGTATGTGGGTGTTAGGTTGTAGGTACACAGGTGTATACGTGGGTGTTAGGTTGTAGGTACACAGGTGTGTACGTGGGTGTTAGGTTGTAGGTACACACGTGTGTATGTGGGTGTTAGGTTATAGGTACACACGTCTGTATGTGTGGGTGTTAGGTTGTAGGTACACACGTGTGTACGTGGGTGTTAGGTTGTAGGTACACAGGTGTGTGCGCGCTAGGTTGTAGGTACACATGTGTGCACGTGGGTTAGGTTGTAGGTACACACGTGTGTACCTGTTAGGTTGTAGGTATACACGTGTGTACGTGTGTGTGTT N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1935578;STD_quant_start=153.559;STD_quant_stop=132.515;Kurtosis_quant_start=-1.19934;Kurtosis_quant_stop=1.97897;SVTYPE=DEL;RNAMES=1f79460c-a490-4e87-8ed1-bd0e2311fd96,35ca5bc9-cb85-4600-a1cc-cc7ad8ea0900,46bb71d9-c744-41b2-9e5b-280327785e81,5a22fc3b-c4e9-4a52-9230-fcf16cee6cc1,ad7c3f25-2dba-44cf-ba68-3129cdf177cf;SUPTYPE=AL;SVLEN=-446;STRANDS=+-;RE=5;REF_strand=12,5;AF=0.227273 GT:DR:DV 0/0:17:5 +chr1 1937841 61_2 CACCAGGTCCACCTCTGGACACAGGTCCACC N . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1937872;STD_quant_start=1.78885;STD_quant_stop=0.894427;Kurtosis_quant_start=2;Kurtosis_quant_stop=2;SVTYPE=DEL;RNAMES=17cf1867-d8d0-4d6c-a0a4-7cd27f98d21c,35ca5bc9-cb85-4600-a1cc-cc7ad8ea0900,46bb71d9-c744-41b2-9e5b-280327785e81,5a22fc3b-c4e9-4a52-9230-fcf16cee6cc1,ad7c3f25-2dba-44cf-ba68-3129cdf177cf;SUPTYPE=AL;SVLEN=-31;STRANDS=+-;RE=5;REF_strand=1,0;AF=0.833333 GT:DR:DV 1/1:1:5 +chr1 1948947 64 N CCTCCTTCCTTCCTCTTTCCTTCCTTCCTTCCCTCCCCTTACTCCTTCTTCCTTCCTTCCCCTTCCTTCTTCCTTCTCTCCCTCCCTCCCTTCCCT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1949050;STD_quant_start=5.56776;STD_quant_stop=18.5661;Kurtosis_quant_start=-0.180711;Kurtosis_quant_stop=-0.853763;SVTYPE=INS;RNAMES=055ba478-5d2e-4d99-b186-a0135c719653,0ba34aee-4104-4f2c-a266-ee9e94ffd88a,16677c20-3c6c-4b74-91e8-9a8eaff0cc41,17cf1867-d8d0-4d6c-a0a4-7cd27f98d21c,2193c379-056a-4f6b-ad63-500a7d6b0f81,276d02c1-43c7-4839-b05e-e369b5027db1,4050aa75-6cc0-4e4a-9a4d-6ef9c7a531b0,4d69d239-a49f-4385-9509-38d2c650dd91,5b314be2-40de-453b-b564-55e0f23e0f9d,60487873-d91c-416a-b799-b3cfa6ece402,870a857f-39a7-49e9-9ca2-3e5167e0080c,8c86180c-2b7d-4542-abb2-60c8669ba5f3,9fc8b03e-d153-4578-b4ef-72542bb4681f,eb2e28c0-57be-443a-b9fd-aaca2ce7331a,f371bad6-cc9f-4532-96cc-ca566fb95edb,f4df7d94-09a6-4fef-8265-b6b7f695023d,f597f2f9-4eb8-4ff5-b998-0775bdaebcf8;SUPTYPE=AL;SVLEN=103;STRANDS=+-;RE=17;REF_strand=10,6;AF=0.515152 GT:DR:DV 0/1:16:17 +chr1 1968925 65 CCCTCCTGGGGGCTCCGGTCCTGCCCAGCAGCCCCAGGTGAGACAGCGCCTGGCGGCCCCTCCCTAGCTCC N . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1968995;STD_quant_start=6.21289;STD_quant_stop=6.26099;Kurtosis_quant_start=2.1732;Kurtosis_quant_stop=3.01628;SVTYPE=DEL;RNAMES=0054b1d2-e5b3-4425-8021-7bfc27793f2e,03eab717-c88a-49b3-b03d-3fccf189e51b,114859cf-340f-46a6-82ae-597f42d2c3a2,46b53123-ab33-4ba3-be43-aa960b5282e6,5032e8bb-0e9f-4ef3-8f74-853166bdf8d1,58437404-2131-42f8-a4a7-3c238e0dce06,7e0e439a-bd8d-45b6-92af-6d04069896de,ba78be33-e248-406d-9ea9-65de8c94cb9b,c9c5a4c8-212d-49de-a138-0b77a80bf5df,f664d4be-60e1-44b4-b2ba-08356f03b7c5,f6fff9c2-22f7-4909-82d3-ef14a9823885;SUPTYPE=AL;SVLEN=-70;STRANDS=+-;RE=11;REF_strand=13,12;AF=0.305556 GT:DR:DV 0/1:25:11 +chr1 1980245 66_0 ATCTAAGCCACCGGCACAGATCGCAGTGGGCGCCCTCTTACCGCGTGGGGAGGACGGGTGAACGAGAGACTGTATCTAAGCCACCGGCACAGATCGCAGTGGGCGCCCTCTTACCGCGTGGGGAGGACGGGTGAACGAGAGACTGTATCTAAGCCACCGGCACAGATCGCAGTGGGCGCCCTCTTACCGTGTGGGGAGGACGGGTGAACGA N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1980454;STD_quant_start=101.743;STD_quant_stop=92.693;Kurtosis_quant_start=-0.884687;Kurtosis_quant_stop=-0.52696;SVTYPE=DEL;RNAMES=052bdc46-518f-41db-ad42-0684dc228ca4,32319478-b72c-45fa-bc4c-fe51c11649c7,706e2df9-de59-4d99-b258-713deb89d841,7e0e439a-bd8d-45b6-92af-6d04069896de,8525da23-273d-43ea-b798-67f153a33bbc,95c7442c-1458-4fd5-b134-66f19fe6c45b,a6ea1aaf-dc00-4e26-aa3d-f0edd224e3a7,b522295d-622c-4c09-92dd-f125fde27113,d7383ef9-e484-418b-b935-184212da47b4,d8ac8009-6fb7-4c0f-be91-f5ae65921ef2,e2dd6517-ff29-49ac-a274-9768838cc651;SUPTYPE=AL;SVLEN=-209;STRANDS=+-;RE=11;REF_strand=8,10;AF=0.37931 GT:DR:DV 0/1:18:11 +chr1 1982226 66_1 N CGCAGGACACCCAACCACGGACAGACACGGGGGCACGCAGGACACCCAGCCGCGGACAGACACGGGGCACGCGGAACACCCAGCCGTGGACAGACCACCAGAGAGCACACAGGACACCCAGCCGTGGACAGACACGGGAACAGGACACCCAGCCGTGGACAGACACGGTGACACACAGGACACCCAGCCATGGACAGACACGGGGACACGCAGGACACAGCCACGGACAGACACGGGGAC . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1982427;STD_quant_start=111.252;STD_quant_stop=95.5136;Kurtosis_quant_start=-0.937548;Kurtosis_quant_stop=-1.0182;SVTYPE=INS;RNAMES=052bdc46-518f-41db-ad42-0684dc228ca4,528aac50-b822-4bd5-9cbd-4b8382236279,8525da23-273d-43ea-b798-67f153a33bbc,90a3c1c7-3727-4019-8995-80b749f304c5,b522295d-622c-4c09-92dd-f125fde27113,c99f1e8e-64f8-46b0-8baa-52ad4bea8869,d0b22d0c-2bd5-4ae3-b65c-807960446b61;SUPTYPE=AL;SVLEN=206;STRANDS=+-;RE=7;REF_strand=5,3;AF=0.466667 GT:DR:DV 0/1:8:7 +chr1 1993704 68 N GGGCACAGTGGCTCATGCCTGTAATCCCAGCAACATGGGAGCCTGAGGTGGGAGGCTCTCTTGAGGCCAGGAGTTTGAGACCAGCCTGAGCAACATAGTGAGACCCCACCGCCATTTCTAGGAAAAAAAAAAAGTGGCC . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=1993848;STD_quant_start=0;STD_quant_stop=2.30217;Kurtosis_quant_start=-0.137648;Kurtosis_quant_stop=0.586411;SVTYPE=INS;RNAMES=12612e6e-777e-43af-8849-4b074b2f00ac,3199a42c-620f-4a1d-a406-892e05fe365d,3a9ec060-4a54-44ef-bd05-acc590e6927c,48257349-9599-4e4d-b1c8-81428bed719b,516476f1-e11d-4860-b20c-6bcd57126832,5f26394a-a245-43ed-9364-e07fef58a86e,69f19577-181a-43f0-8698-b2a254a4af7e,751e7a48-0915-4aba-bbee-fc4169db51af,7fd8a496-1821-4f89-89e5-aa459208dee7,83936f75-3e7b-4cd8-8444-fb2758e79309,a6ea1aaf-dc00-4e26-aa3d-f0edd224e3a7,a78f0b5f-91c6-49b6-a65d-14ba7899fcf0,c99f1e8e-64f8-46b0-8baa-52ad4bea8869,ce1e0c2c-7782-4be5-9d87-0f852599231c,d0eb4f07-4b62-43d9-986f-fead3a9c4de5,d923e257-cf61-40a0-bfde-487fc0c5aca4,e01bb1b6-6bb4-456c-a445-0038552a3166,e5283a39-f252-4bee-8378-2587557d50c8,ea7096fc-b05a-4d68-9cd7-0b0707b652dc,f7aaf47b-c90c-4458-b7b3-234f6f20fc43,ff61f792-d1df-48a7-b53b-3c122cbdfbb0;SUPTYPE=AL;SVLEN=144;STRANDS=+-;RE=21;REF_strand=10,8;AF=0.538462 GT:DR:DV 0/1:18:21 +chr1 2019228 69_0 N GGGAGGAGAGGGGGAGAGGAGGGGACCTGGGTAGGGTGGGGGGAGGGGAACGGGGAGGGGAGCGGCGGGCGCGGGGTGGGGTGGGAGGGGGGAGGGGGAGGGGAGAAGACGGGCAGCGGGAGGGCAGAGGATGGGGGCGGGGGGGGAGGGGAGGGGGCGGCGGAGGATGGGGCGGGGGAGTGGGATGGGCGGGGGAGGGGGAGGGGGGC . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2019444;STD_quant_start=5.89654;STD_quant_stop=7.31174;Kurtosis_quant_start=-1.49088;Kurtosis_quant_stop=-0.213696;SVTYPE=INS;RNAMES=084b4c10-4abe-45a3-bb41-50a843f349b7,141e97fe-ac49-4c29-a3d5-ed11490e5e0c,189bdbba-7c5c-425c-9986-48faee8994b8,1b320452-023c-43d2-80aa-3fd843eca48f,1b523e3c-4545-43ce-a386-de80099285aa,2ba0273c-2942-412d-9e6c-64000dbe5a24,2d01c13b-fe6d-4d94-9e85-f1a8b75a97a0,3745f5b0-c4b2-4f98-ad05-1fd16961d44f,3d6ee7dc-5dae-4bea-ac7b-c51be12b4e35,3fee962e-0292-42b9-97d4-c9f60cb80cb0,5117c8e0-9e16-4765-8cc1-83c57b053121,59eff853-5383-4fcd-a740-7dd1cb819f3d,6a95fb4f-660a-4aea-a830-b0a3a3e25876,6ec61bb5-9fd5-4f87-bf26-ab3662797e0c,71ffaa82-5556-4b11-a020-d9cfc9377ca8,7688b7ce-31d9-40e6-b262-b707e4beb0a9,7d2f3cce-628d-4c6c-89e8-32c85eff6029,8323db15-fe57-4d04-a384-12eb8e286127,844e781b-8f42-42f9-b667-183919bd5497,85db8648-9b21-4396-96cd-64d8210ec0b6,88158c15-e1b0-4217-9414-260e07ea041c,91e77f06-2ce4-48a1-a4aa-6e99b50183e3,c98072a1-b91c-415c-bf05-4caa96f7fb2d,cfb3d8e5-eeca-4519-986e-d8d2e1629fcf,df0b69a9-893f-40aa-8210-bafad1da5208,eac11fa2-4f1c-4773-ba53-46f54d556aff;SUPTYPE=AL;SVLEN=209;STRANDS=+-;RE=26;REF_strand=11,12;AF=0.530612 GT:DR:DV 0/1:23:26 +chr1 2031615 69_1 CTGCAGGACCCAGGTGTCTGGGGCGCTACCATCCT N . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2031650;STD_quant_start=0;STD_quant_stop=1;Kurtosis_quant_start=3.55176;Kurtosis_quant_stop=1.96435;SVTYPE=DEL;RNAMES=12039f9a-b3a1-4bb5-889f-27118e7212a0,189d9ec3-67c5-4156-aef7-84f137e1c188,1b320452-023c-43d2-80aa-3fd843eca48f,29ba43e9-795c-4344-a62e-19c797d939fa,3745f5b0-c4b2-4f98-ad05-1fd16961d44f,3f364712-fe9c-4e2f-adc9-07f69555c1e4,4946e6a8-20a0-4177-931d-6b5edbb7310f,6d810f14-3521-4f01-a7c8-08501e86ea6b,93dd4c46-9379-4e94-9814-b69d63b9480d,b933e615-1c59-4dd9-89ee-9805edf3a17f,e52271ee-8199-4c0d-af4e-95d436da9609,f7fff377-de1b-49cd-8605-1b8a832b0185;SUPTYPE=AL;SVLEN=-35;STRANDS=+-;RE=12;REF_strand=5,13;AF=0.4 GT:DR:DV 0/1:18:12 +chr1 2106498 71_0 GGTAACTCTCAGCAAGCCCCTCTGGTGGGCGAGGACCTCCACACGTGTCACCAG N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2106559;STD_quant_start=18.0638;STD_quant_stop=24.1309;Kurtosis_quant_start=-1.94975;Kurtosis_quant_stop=-2.12239;SVTYPE=DEL;RNAMES=28805ff2-2a6a-44c7-ba7c-39b95ea9cfbf,2af6fbbd-f264-4229-87a0-a713a86bb300,373d65fa-058a-4e65-9c7d-c54373d59b09,3a963710-a757-49c5-8b1e-ff2706ed2b16,3ba8c7a4-62e2-412e-8b3d-05a8d0b1d74d,3f3ee43c-cad2-4769-a361-3d1b65c2e547,6205add9-ede8-46ea-9983-c9bca75d01f0,6604bd06-aa45-437f-8639-5bace412a763,6cbb710b-746d-4605-8f55-5118404f191b,701b0d67-ee07-4077-bdc2-32678e6df494,7116e163-0218-469b-a925-7c3770efd791,7a8fe2c5-d3c9-4e7b-b667-18ca3a3634c0,90931cd4-5f1c-41a0-bc14-9f16138a7d1d,937a7161-5462-466f-b1d9-c91ad0a29917,947202e3-b846-49d1-b795-98bab687979f,c69112e7-fdaf-431a-942d-6e6e7d219b3f,c791f368-5fb0-4f42-9385-585f75bb6a91,fd995180-e743-4bb9-8ac6-0f8dd3bd6060;SUPTYPE=AL;SVLEN=-61;STRANDS=+-;RE=18;REF_strand=11,20;AF=0.367347 GT:DR:DV 0/1:31:18 +chr1 2110096 71_1 GAGACACAGAACGGCCAGGGCTGAATCCGGGGCCCTCCCTGGGGGCAGCCAAGGACCTAAAACCAATGGGTCCCAACCAAGAGGATCCCAGAG N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2110188;STD_quant_start=28.137;STD_quant_stop=29.053;Kurtosis_quant_start=-2.1532;Kurtosis_quant_stop=-2.15889;SVTYPE=DEL;RNAMES=1b6d873e-ffd5-48ae-ad64-cbfc2fe5f991,2af6fbbd-f264-4229-87a0-a713a86bb300,2f93bb83-61c2-410e-9504-299571c80478,3a963710-a757-49c5-8b1e-ff2706ed2b16,3ba8c7a4-62e2-412e-8b3d-05a8d0b1d74d,6604bd06-aa45-437f-8639-5bace412a763,6cbb710b-746d-4605-8f55-5118404f191b,701b0d67-ee07-4077-bdc2-32678e6df494,7116e163-0218-469b-a925-7c3770efd791,74f450b4-f44e-401c-ae14-25ed8682996e,7a8fe2c5-d3c9-4e7b-b667-18ca3a3634c0,90556fe1-6f15-43ad-b541-18b71bfd6a30,92c42e9b-0738-4473-a13d-3c950c13a88f,947202e3-b846-49d1-b795-98bab687979f,9e8b5bf4-1212-4b8c-955d-956dbb5f8573,b4b91919-6fad-4984-ac92-d36a6e55e84c,c69112e7-fdaf-431a-942d-6e6e7d219b3f,c791f368-5fb0-4f42-9385-585f75bb6a91,cb91c0ac-5c4e-461b-8b51-ae7a6d9ba2c0,cd3b3d48-2407-4570-bd64-4c74d85c1113,d92913bc-455d-4f37-8076-fb2c944a43e4,db136f46-27ff-4915-88c0-de0b3fcc6559,dee55e5f-5ad4-4ba1-8f33-4da8a0ca888c,f2baf3f5-0375-4789-a4a0-421c72aaedfe,faff0bd7-0f1e-4a4f-b19a-683c00ffe049,fd995180-e743-4bb9-8ac6-0f8dd3bd6060;SUPTYPE=AL;SVLEN=-92;STRANDS=+-;RE=26;REF_strand=6,1;AF=0.787879 GT:DR:DV 0/1:7:26 +chr1 2121519 73 N GGTCATGGTGGTAGTTAGGGTTATGGTAGTTAG . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2121553;STD_quant_start=8.15037;STD_quant_stop=7.96421;Kurtosis_quant_start=2.31242;Kurtosis_quant_stop=0.704955;SVTYPE=INS;RNAMES=150f2d25-b265-4af7-8135-23aa9741506d,1a6336dd-1444-4c9f-a01d-b44e5ec055d3,4b3f8741-f139-4956-ba84-952af879c943,81f0780e-3130-42ed-b489-a3ca347b4e5c,9d20c94f-6d1e-4624-9b25-3d5949d7e53c,df559026-6706-483c-922a-d706d4e0bfbd,ecc5327a-3d59-4d19-8cb7-0e24ab84ebbd;SUPTYPE=AL;SVLEN=34;STRANDS=+-;RE=7;REF_strand=10,6;AF=0.304348 GT:DR:DV 0/1:16:7 +chr1 2122420 74_0 GTGGTAGGGTCGTGGTGGTTAGGGTCGTGGCGGTGGTTAGGGTCGTGGCGGTGGTTAGGGTT N . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2122481;STD_quant_start=2.34521;STD_quant_stop=7.3598;Kurtosis_quant_start=0.553719;Kurtosis_quant_stop=1.81801;SVTYPE=DEL;RNAMES=0650f152-57e2-432b-9f22-f61cef3fdadf,6679a3cc-d5ab-43a8-9476-6a63a9592057,670102b2-c1bb-4323-835a-edf30c6e4457,6be13089-e486-4759-b1d3-99fb4da3e5ce,bee0bcc7-3494-4ac8-9d57-3707ea5045a2,ee18ca1c-9244-4440-b623-79f76165755a;SUPTYPE=AL;SVLEN=-61;STRANDS=+-;RE=6;REF_strand=2,0;AF=0.75 GT:DR:DV 0/1:2:6 +chr1 2123427 74_1 GTGGTTAGGGTCGTGGCGGTGGTTAGGGTTGTGGTGGTTAGGGTTGTGGTGGTTAGGGTTGTGGTGGTTAGGGTCGTGGCGGTGGTTAGGGTCGTGGCGGTGGTTAGGGTT N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2123528;STD_quant_start=35.106;STD_quant_stop=31.7715;Kurtosis_quant_start=2.91495;Kurtosis_quant_stop=-1.20498;SVTYPE=DEL;RNAMES=0650f152-57e2-432b-9f22-f61cef3fdadf,670102b2-c1bb-4323-835a-edf30c6e4457,73fb19cb-0ca2-4185-bfe2-5c1511d92a8a,adf37c10-0ebd-4b66-a691-4d78e9ce3887,bee0bcc7-3494-4ac8-9d57-3707ea5045a2,ee18ca1c-9244-4440-b623-79f76165755a,eedb21ba-7cb6-45a8-a524-2ce1dca4f9e1;SUPTYPE=AL;SVLEN=-101;STRANDS=+-;RE=7;REF_strand=0,0;AF=1 GT:DR:DV 1/1:0:7 +chr1 2142340 76 TTTCAATCCAGGGTCCACACATCCAGCAGCCGAAGCGCCCTCCTTTCAATCCAGGGTCCAGGCATCTAGCAGCCGAAGCGCCT N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2142424;STD_quant_start=21.468;STD_quant_stop=22.4611;Kurtosis_quant_start=-0.568143;Kurtosis_quant_stop=-0.726049;SVTYPE=DEL;RNAMES=171ee54f-94db-40e5-abd3-64c25c3bd654,1ec9394b-a56b-4ecc-9a95-a189e11ab936,414c9229-dbb4-4f8d-b352-3395870549ce,8b1a60f1-2b05-40cf-96cb-5c25d3609202,a82e313f-0107-45b6-bff9-499aa5741572,bdc72979-282b-40f1-8822-46206a303af5,d601a903-b81b-423b-8b40-931b45c4812e,d7aed26e-dee7-4026-ba04-ad90cbf1ab97;SUPTYPE=AL;SVLEN=-84;STRANDS=+-;RE=8;REF_strand=12,10;AF=0.266667 GT:DR:DV 0/0:22:8 +chr1 2147720 77 N TGTCCACTGACCTCTCCATTCTCCGTCTGTTGTCCACTGACCTCTCCGCTCATCCACTCCATCTATTGTCCACTGACCCCTCTTCATCTATCCATTCATTGTCCACTGACCTCTCATCTATCCATCCATCTGGTCAGTCCACTGATCTCTCTCCATCTATCCATCCACTTC . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2147883;STD_quant_start=2.04939;STD_quant_stop=2.91548;Kurtosis_quant_start=3.27179;Kurtosis_quant_stop=3.80751;SVTYPE=INS;RNAMES=3b6c8737-901b-4b87-bb72-eb7092dfef29,3dc2a634-4c43-496c-8ec4-37141b9f49e1,50520c4d-5117-476f-91d6-88a96210bd00,5bb38fd6-be27-4945-8879-e7fa91fb4ed2,61131946-4d87-43e0-ba2a-437d65cb5d46,6f92ee94-91e0-447b-bdd8-cf1b4bb2cb22,7d2495f0-d0d8-4f23-ab87-8966f0fc9a76,89ca80fa-ec8f-409d-b1df-6035496e9aa0,955dbf85-2ba3-4862-81af-eeba6ded1571,bc432383-d7f9-46e6-91e4-c7ac27d0e9bb,e4d10a9a-fd46-4b43-96d4-260144261a04,fa90e368-9f17-4bc3-baba-6cdb0f1720df;SUPTYPE=AL;SVLEN=162;STRANDS=+-;RE=12;REF_strand=6,6;AF=0.5 GT:DR:DV 0/1:12:12 +chr1 2212064 78 N GCTAATTTGTTTTTTTTTGTATTTTTAGGAGGCGACAGGGTTTCACCGTGTTAGCCAGGTTGATCTCGATCTCCTGACCTCGTGATCCGCCCGCCTCAGCCTCCCAAAGTGCTGGGATTACAGCGTGAAGAGCCACCGGTGCCCGGCCAGTTTTTTATATTTTTTTATTAGAGACGGGGTTTCATCATGTTGGCCAGGCTGGTCTCAAACTCCTGACCTCAGGTGATCCACCTGCCTCGGCCTCCCAAAGTGCCGGGGATTACAGGCGTGAGCCACCGTGCCTGGCCGAGAGAGCAAGCTGCTGCTTAGTTTTTTTTTGAGACGGAGTCTTGTGTCGCCAGGCTGGAGTGCAGTAGTGTGATCTCGGCTCACTGCAAGCTCCACCTCCCGGGTTCACACCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCGCCCGCCACCACGCCCG . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2212522;STD_quant_start=74.6405;STD_quant_stop=66.4379;Kurtosis_quant_start=-1.46472;Kurtosis_quant_stop=-1.44655;SVTYPE=INS;RNAMES=0676b04a-7c8d-42cf-b577-88e7c5e59366,186fb406-a7c2-427a-a852-c832ed397d77,1c28123c-97c3-4b56-be80-95f423024035,392588b0-1f81-4dfb-95e2-516600f4d921,3b3d82bf-b95b-42dc-85b6-6e0ec7909512,487beba9-384f-4d39-9206-67d9ff555f3b,4ff2e1df-681d-41eb-804f-d62b1e1243f7,65813d37-b46f-41e7-8a62-74fd40ef4802,83bbdbb2-2f4e-409c-a225-7ac6c3a60112,91830ab4-3efb-4ef9-b092-74588a1f1977,a5bb802e-de21-4019-a251-702f17b31459,b01eb484-c448-4b3f-9e9d-57fdbc83c1b3,bd07653a-6a66-4532-9d78-64ec38c0e59c,e03f35b8-06d0-430d-928e-6fb4c40ba032,e508e912-2b3a-4490-8895-b31a09536944,e6819841-704e-4239-8e88-3185449188dd,f21c9471-ca40-4c86-83e4-d79827d0c0b2;SUPTYPE=AL;SVLEN=457;STRANDS=+-;RE=17;REF_strand=13,8;AF=0.447368 GT:DR:DV 0/1:21:17 +chr1 2280686 79 AGAGAGGACGCCCGAGAAGACAGGCGGCGGCGGCGATCTTCAGAGAGAGAGATGCCCGAGAAGACAGGCGGTGGCGGAGATCTTCAGAGAGAGGACGCCCGAGAAGACAGGCGGTGGCGGAGATCTTCAGAGAGAGGACGCCCGAGAAGACAGGCGGCGGCGGCGATCTTCAGAGAGAGGACGCCCGAGAAGACAGGCGGCGGCGGCGATCTTCAGAGAGAGGACGCCCGAGAAGACAGGCGGCGGCGGCGATCTTC N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2280929;STD_quant_start=20.1122;STD_quant_stop=50.9735;Kurtosis_quant_start=-1.35663;Kurtosis_quant_stop=1.0857;SVTYPE=DEL;RNAMES=0201190b-ffdf-4da2-b1d2-878efae5276d,065e39f9-6c32-425a-a3f9-3c41cdf17cfb,54aa1c8a-ce0b-440f-a73b-74137b246f3f,7569fcc1-6df6-4e2e-9a3e-7b3475257f88,85469682-a260-411a-9da2-40949100da1a,a8546972-7f28-4aa2-b19c-b104b07c3f26,abcf1b8d-cac3-4621-b0ff-8187389417c8,b314ebbf-e428-4f84-ae07-a53631ded366,c334c780-b78a-478d-beaa-12a683730973,e2507dc8-5255-4d67-90d5-ed5828f6e81f,f57ea356-9ee6-484a-9685-3caf5029b3fa;SUPTYPE=AL;SVLEN=-243;STRANDS=+-;RE=11;REF_strand=10,12;AF=0.333333 GT:DR:DV 0/1:22:11 +chr1 2393034 80_0 N GGACGCTGTGGCAGAGGACTTCATCCCATGTTTCAAAGTGCCCT . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2393076;STD_quant_start=0;STD_quant_stop=1.07417;Kurtosis_quant_start=-1.11224;Kurtosis_quant_stop=-1.09293;SVTYPE=INS;RNAMES=00980201-0c96-4489-90dc-fe1b1a16ba8c,06ce59b7-1c95-4cd6-afc0-5cb2d44c3da0,0eee7d18-36f4-4b41-88b8-413a26a2e92b,12233d0a-ba0e-4f6b-8f20-231b4c0df705,276b7eca-4c55-4ea4-b706-e2ea0ef806bc,37f7640b-5f00-4496-b03c-176b2fee9f81,4f196bd2-aaf9-4c99-b954-f48454f1377e,699db0e6-24b9-4257-83dc-2c92e96479c3,75751984-314d-402e-90cb-d2064a9e9a6c,7670c0d3-c90a-43cb-9761-3dc4e3c0c86e,7b099be2-d750-4c08-a6fb-25185120d10e,82e87691-2a14-4c37-b073-9e706d600638,8d2e0d04-baaf-4485-aced-48bda5916742,990bf247-679d-4047-a113-7dd3aa4fcf65,9a56d0b2-8fbf-4179-8647-05454c9be994,b2881ddb-8ae7-479b-8896-35c01a48e844,bac07239-5e7f-4fc6-9362-01d8a6b6791f,bbc701f9-17eb-4353-81c9-ff5ee961b554,c0c21a75-e2b2-43de-aaf8-af53e92c26b1,c4c9fe9e-5f08-4239-8531-3cbd08912e5b,cd7c8196-ac48-4caa-9414-32a3b37a7d25,d43881ac-62d7-4668-bb72-b6e5d50fd8d1,d48718e0-864b-46d5-93f8-8fde7e786b8e,d8b68c1f-fe56-4915-bf2b-670ccac3f03c,ea7de415-8965-4b45-aa66-864bba6cecf9,f78c1fa7-d9ef-4f7e-b39c-4c95bc54f740;SUPTYPE=AL;SVLEN=42;STRANDS=+-;RE=26;REF_strand=14,11;AF=0.509804 GT:DR:DV 0/1:25:26 +chr1 2401344 80_1 N CTCCTCCCTCCTCCCTCCCTCCCTCCCTCCTCCTCCTCCCTCTCCCCTCCTCCTTCCCTCCCTCTCCTCCCTCCTTCCTCCCTCCTCCTCCTCCCTCCTCCTCCCTCCTCCCTTCTTCCTCCCTCCTCCTCCTCCCTTCTTCCTCCCTCCTCCCTCCCTCCCTCCCTCCCCCTCCCTCCTCCCCTCTCCTCCCCCCTCCCTCCTCCTTCCTCCTCCTCCCTCCCTCCTCCCTTCCTCCCT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2401601;STD_quant_start=7.79957;STD_quant_stop=12.6853;Kurtosis_quant_start=-1.73392;Kurtosis_quant_stop=-0.358473;SVTYPE=INS;RNAMES=0122932c-f48b-4b83-bece-1983fa357a82,01bd0d80-e250-4be3-996c-e6bbdfad82f8,06ce59b7-1c95-4cd6-afc0-5cb2d44c3da0,084d817e-9526-4e38-8a85-954a55703197,17d48ee0-5f98-4ddf-b6a2-82b201fd9140,21c67059-ba8f-4d02-856a-26f28b81f0a6,25f27c4c-cea6-4fcb-baf9-a9cbc2cbf798,67e266bf-3cf2-482f-b9f6-1f7e0edbfb20,7a7bd87e-f893-41c4-84b5-0d0cf345a57c,9a56d0b2-8fbf-4179-8647-05454c9be994,a7d2160f-22a6-493a-b788-050768eb9bb4,a97bbe0a-c62b-41ab-970e-ec673cca054a,b10646da-43c5-4aa8-9cf4-48d6dabf1fac,b167dc86-fc0d-4b6e-9a82-fd7250b88b9d,c0c21a75-e2b2-43de-aaf8-af53e92c26b1,cd37a7be-b9ba-4348-a01c-e1fb286af589,cd7c8196-ac48-4caa-9414-32a3b37a7d25,cf8fb7a6-6ee4-4679-9589-2c546ab0eaf8,d79040ee-f236-4142-9d86-632cf2ec11fb,d8b68c1f-fe56-4915-bf2b-670ccac3f03c,dbd74465-d543-4b90-885d-5585f6c5c733,ea7de415-8965-4b45-aa66-864bba6cecf9,ee10783d-ee86-4b3d-b3fe-70c862bc79f5,f3edbd0a-a726-46cc-bac8-bbfe128fbe42;SUPTYPE=AL;SVLEN=257;STRANDS=+-;RE=24;REF_strand=9,6;AF=0.615385 GT:DR:DV 0/1:15:24 +chr1 2435943 83_1 N ]chr8:2191727]N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;STD_quant_start=361.135;STD_quant_stop=375.138;Kurtosis_quant_start=-1.00232;Kurtosis_quant_stop=-1.41029;SVTYPE=BND;RNAMES=1034f436-2911-4e62-8bb4-897d96384930,20c61a5f-55ca-4466-8f0c-5acfa8d937ec,2c5731ed-a065-4428-a87f-11560422cb7a,4d806a60-f552-4c1d-a41b-4d92c9a7f44b,7096ab27-21cd-4a75-9b3b-61942414d57f,8f0f8a9c-51ac-46b9-8c10-6fe63afbc58d,99b9a2b6-169c-4b13-b450-fcb969c61e37,afcf9b05-eaa9-4bdd-a231-8b97fb693a94,b2c03c3c-381a-487e-a13b-90230e1a2fbf,c885fe57-3116-4b4d-a3e7-ca92b07181be,d457a04c-02bb-4cb5-84b4-f3b2b6e4f78e,d67e1e93-5385-4ea9-a0c0-d1c773b8903a;SUPTYPE=SR;SVLEN=0;STRANDS=-+;RE=12;REF_strand=0,0;AF=1 GT:DR:DV 1/1:0:12 +chr1 2436031 82_0 N TCTTCCCTCCTCCCCCTCCTCCTTCTCCTCTCCCCTTATC . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2436072;STD_quant_start=2.58844;STD_quant_stop=3.4785;Kurtosis_quant_start=3.2681;Kurtosis_quant_stop=1.23088;SVTYPE=INS;RNAMES=15db541e-fff8-459c-a500-6bf0abef1181,45d1807a-8638-48e3-b1ab-c13ec31f67d4,8501a903-194a-4b39-90b9-5dbb8894a965,89d5ae55-5d6f-48f9-87bf-a47000cd43bb,917960b5-a32c-4b3b-aaf3-3030d0ae4adf,c518ca32-4881-4ff2-a436-6aaf65d6dcd7,d29c808b-64dd-431c-af93-f0b792fdac5f,db4d5b64-7cc8-433b-8c7e-1fcdc1d2f749,edf624ea-c4fb-4000-963f-03379b300d79,ee26af85-3d3f-4c14-84a5-694053a729cf,f0e08ed7-b7cb-4090-b5e0-e2d11c3aec32,f2021b13-8bd1-4ecd-b83b-26b1f3834c67;SUPTYPE=AL;SVLEN=42;STRANDS=+-;RE=12;REF_strand=9,10;AF=0.387097 GT:DR:DV 0/1:19:12 +chr1 2436235 83_0 N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=51906855;STD_quant_start=248.038;STD_quant_stop=29.0723;Kurtosis_quant_start=-1.13146;Kurtosis_quant_stop=-0.792692;SVTYPE=INV;RNAMES=20c61a5f-55ca-4466-8f0c-5acfa8d937ec,63ca8f3a-dccd-4544-bcb6-165078b143c0,8f0f8a9c-51ac-46b9-8c10-6fe63afbc58d,b4a16a83-7eb8-41d9-92b6-c3343ed532a5,e00aaa66-34bb-44d9-bd63-b8f65c2725e1;SUPTYPE=SR;SVLEN=49470620;STRANDS=--;RE=5;REF_strand=1,2;AF=0.625 GT:DR:DV 0/1:3:5 +chr1 2436453 30263 N N[chr21:44407143[ . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;STD_quant_start=260.661;STD_quant_stop=33.892;Kurtosis_quant_start=0.156456;Kurtosis_quant_stop=1.93083;SVTYPE=BND;RNAMES=38ae4519-daf6-4f97-891a-f0fa8e3ad89b,54b53a6d-e860-41ba-ab89-bc5ea855609b,6023543c-a3b7-4668-83d4-5874646245d2,99b9a2b6-169c-4b13-b450-fcb969c61e37,b2e8082e-4084-46f8-b792-8d697b657d64,df600a7e-047d-4f6a-8bc2-53036240ad2a;SUPTYPE=SR;SVLEN=0;STRANDS=+-;RE=6;REF_strand=0,1;AF=0.857143 GT:DR:DV 1/1:1:6 +chr1 2440693 82_1 N CCCGAAACGCCCGCGGGATCTTCGTTGCTGCGACCAGGATCCTCTCCACATGTCTGTCGCTGGCCTTGCCCGGCCCGCCGGGGATCTTGCATGCTGCGACCCAGGGATCCTCTCCATGTCTGTCGCTGGCCTTGCCCGGCCGCCGGGGATCTTGCATGCTGCGACCAGTGATCCTCTCTCCATGTCTGTCGCTGGCCTTGCCCGGCCCGCCCGGGAGATCTTGCATTGCTGCGACCAGGGATCACCTCTCTCCATGTCTGTCGCTGGCCTTGCCCGGCCGCCCGAGGGATCTTGCATACTGCGACCAGTGATCCTCTCTCCATGTCTGTCGCTGGCCTTGCCCGGCCCGCCGGGGATCTTGCCCATGCTCGACCAGTGATCCCTCTCTCCATGTCTGTCGCTGGCCTTGCCCGGCCCGCCCGAGATCTTGGGGCTGCTGCGACCAGGATCCTCTCCATATCCTGTCGCTGGCCTTGCCCGGCCCGCCGAGGGATCTTGCATTGCTGCGACCAGGATCCTCTCTCCATGTCTGTCGCCAGCCGCCATGACTTCGCCCGGGGATCTTGCATGCACTGCGACCAGTGATCCTCTCTCCATGTCTGTCGCTGGCCTTGCCCGGCCCGCCCCGGGGATCTTGCATTGCTCGGGCCAGGGATCTCTCCATGTCTGTCGCTGGCCTTGCCGACCGCCGAGGATCTTTGCATTGCTGCGACCAGGGATCCCTCTCTCCATGTCTGTCGCTGGCCTTGCCCCGGCCCGCCGGGGATCTTGCATTGCTCGACCAGGGATCCTCTCTCCATGTCTGTCGCTGGCCTGCCCGGCCCCGCCCGAGGATCTTGCATTGCTGCGCGACCAGGGATCCTCTCTCCATGTCTGTCGCTGGCCTTGCCCGGCCCGCCCGGGGGATCTTGCATGCTGCGACCCAGTGATCCTCTCTCCATGTCTGTCGCTGGCCTTGCCCGGCCGCCGGGGATCTTGCATTGCTCGACCAGGGATCCTCTCTCCATGTCTGTCGCTGGCCTTGCCCGGCCCGCCCGGGGATCTTGCATTGCCGACCAGGGATCCTCTCTCCTTGTGTCGCTGGCCTT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2441617;STD_quant_start=35.0071;STD_quant_stop=184.207;Kurtosis_quant_start=1.35393;Kurtosis_quant_stop=-1.47296;SVTYPE=INS;RNAMES=10a01cbd-3204-44bd-8e18-aa1f7102a596,11cc45ed-33df-4995-84b5-02be2a3c198a,2e000069-23ca-4c66-85fb-500ffa84bc1e,32508232-0783-466c-8c97-23f11117dccf,375a5306-1f06-4084-a995-4f8695e83ec6,449d6781-7c8d-4f3f-aed2-7a122c648ad6,45d1807a-8638-48e3-b1ab-c13ec31f67d4,471288e6-2af7-4c44-8ced-0b2e14a0af09,4fe8ce9a-fd28-4e0b-90f4-5a90d99f2466,6b456c6e-3fe1-4b74-a35d-1c240b6ce279,6e0f3747-6214-415b-a677-635aa90110b8,747fe45c-dfcc-4b42-b4b8-e396c7bcf318,79c8b378-0cf0-47a2-ab2b-c4f512f9ff0a,8501a903-194a-4b39-90b9-5dbb8894a965,a41e48ed-36b3-4af2-87c6-9c114d8067f2,ae4210ea-eb59-418a-8709-462b3ab10a77,bdb15ecc-872a-4aef-9611-a439385e8ed3,dac45185-8624-4672-97a4-5e015bc689ca,edf624ea-c4fb-4000-963f-03379b300d79,ee26af85-3d3f-4c14-84a5-694053a729cf,f2021b13-8bd1-4ecd-b83b-26b1f3834c67;SUPTYPE=AL,SR;SVLEN=963;STRANDS=+-;RE=15;REF_strand=3,6;AF=0.625 GT:DR:DV 0/1:9:15 +chr1 2459450 85 TCCTTCCTGGTGGTTCTCCTTCCTGGTGGTCCTCCTTCCTGGTGGTCCTCCTTCCTGGTGGTCCTCCTTCCTGGTGGTCCTCCTTCCTGGTGGTCCTCCTTCCTGGTGGTCCTCCTTGCCGGTGGTCCTCCTTGCCGGTGGTCTTCCTTTCCGGTGGTCCTCCTTGCCTGTGGTCCTCCTTGCCTGTGGTCT N . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2459642;STD_quant_start=0.547723;STD_quant_stop=0;Kurtosis_quant_start=0.482804;Kurtosis_quant_stop=5.95281;SVTYPE=DEL;RNAMES=034ea11e-3da4-453a-80aa-6efab59d5110,0d288516-a039-451f-9f69-20031569d572,0d96f8b9-1d1e-4ece-a418-703cf14d00b6,0fec5033-ebb8-4832-bdc7-66bdb1ef1f57,1a37d739-733c-4be8-9dad-2083e6928e5c,1c1de513-464e-4024-a10d-29956a286cb1,551711d9-55f3-4933-9066-4df1346da909,761838d0-4eaf-4837-a45e-c92313e20f53,768d7eca-6f56-49b2-af49-be31afd1e13e,798d8388-3fbf-4607-8a25-97853325b1c5,8b4ad02d-6cbb-4aad-928d-7d7d1a34be5e,940c4e93-c468-49db-bc93-a17a034f1d7b,9e6f7ab9-acd0-4376-8175-df4a33abc5eb,9ecd2a2e-9ad5-43fb-aa89-98e139d419a3,9f2908f7-2773-402f-8f27-79dfab5594cf,a913fa6f-c3b4-488e-856d-1e47dce746ff,c14638ff-820c-42d0-aa9f-7cb8feb3238e,c71f623c-221a-49bb-b463-a8057d0a7f29,ce283681-ed7e-4685-ac1b-a1d5308e6055,e28ed1db-4be8-4a5e-9eab-f220168fe4f7;SUPTYPE=AL;SVLEN=-192;STRANDS=+-;RE=20;REF_strand=11,7;AF=0.526316 GT:DR:DV 0/1:18:20 +chr1 2522791 86 N TATAGTGACTTAACGGAGGGCACTGTATGCTATAGTGACTTAACGGAGGGCACCGTGTGTGTTATAGTGACTTAACGGAGGGCACCGTATGGTGCTATAGTGACTTAACTGAGGGCACTGTGTGTGTTATAGTGACTTAACGGAGGGGCACCGTGTGTGTTATAGTGACTTAACGGAGGGCACCGTATGGTGCTATAGTGACTTAACGGAGGGCACCGTATGGTGCTATGTGACTTAACGGAGGGGACCGTGTGGTGCTATAGTGACTTAACGGAGGGCATTGTGTGTGCCAGTGACTTAACGGAGGGCACCCCGTACGGTGCTATAGTGACTTAACGGAGGGCACTGTGTGTGCTAAAGTGACTAACGGAGGGGACCGTGTGGTGTTATAGTGACTTAACGGAGGGCACCGGATGGTGCCAGGTAGTGACTTAACGGAAGGGACTGTGGTGTTATAGTGACTTAACGGAGGGCACTGATGGTGCTATAGTGACTTAACGAGGGGACCGTGTGGTGTTATAGTGACTTAACGGAGGGCACCGTGTGGTGT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2523320;STD_quant_start=46.8466;STD_quant_stop=106.999;Kurtosis_quant_start=0.908045;Kurtosis_quant_stop=-0.365621;SVTYPE=INS;RNAMES=2dd6b87a-9fbd-4327-9b00-f19a7b42aff9,a61d5f51-b87b-4a54-8b91-4faed7bafa4c,b0ce83fd-b979-40ba-819c-9619b2e49608,d2840c79-4ff9-4db4-84bd-eaf4be8bef66,d880e068-2bc0-4d31-810c-c47e1af97a5a,f1f4d10c-8e26-4889-b569-f3289b79f3b4;SUPTYPE=AL;SVLEN=550;STRANDS=+-;RE=5;REF_strand=7,6;AF=0.277778 GT:DR:DV 0/0:13:5 +chr1 2592355 87_0 CCCCTCCCCTGCTGTGCTGGCACCCCCTCCCCTGCCGCGCTGATGCCCCCTCCCCTGATGCACTGGCGCCCCCTCCCCTGCCATGCTGACGCCCCCTCCCCTGCCGTGCTGGCGCCCCCTCCCCTGCCGCGCTGACGCCCCCTCCCCTGCCGCGCTGACGCCCCCTCCCCTGCCGCGCTGATGCCCCCTCCCCTGCCGTGCTGGCGCCCCCT N . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2592568;STD_quant_start=2;STD_quant_stop=2.52678;Kurtosis_quant_start=-0.815348;Kurtosis_quant_stop=-0.886441;SVTYPE=DEL;RNAMES=075ea038-cc1f-46e9-81ea-8fc94439f2c2,0a6d50c0-fd3a-442f-a61b-35e7ddc4dedf,15659d1b-e42b-496e-bc0b-0a57abbb5e38,1d97238d-4d17-4db8-88ab-03c183ae0934,2e6d3d64-84f0-458e-a973-7fbe937ed23c,34e8529a-be85-4285-873d-921539591f34,42778f38-cc84-482c-8f5c-d5a1d7b77879,47a610f2-3a2a-4ea7-a878-625bbdd9f062,4deec02b-bcd9-47b1-a876-b3e61131452f,505aee8b-2f38-48ea-a2de-98cb355f35f3,58fd450b-dd5c-428d-8134-d604331bdab1,6fd0bb2e-9c38-4a19-9e15-3ede11c3883a,73f809f6-8ac0-4838-9667-e52087f81be8,7d212f5c-b0a1-49da-8aab-af4cdd659e1e,7ecd2a68-f165-41f2-af24-f64ead2e5b71,862b9f9b-5094-45f8-9005-65d41c8d9c35,8e3220de-e682-4c27-974c-509712b22ad6,91c549fd-0458-499c-ad46-9416fb3a1a10,a4ae79d5-09d3-48de-a91b-84b92555c4c8,b14fcb62-2759-4596-9fe6-7ecbb82f2138,cbbc3bdd-ae78-4a61-9866-f97800766dd1,e492c156-9ec3-49ef-870f-3b755678869b,eb03f7dc-4330-4729-85ef-e125a9e4ad5b,fc132c78-0c57-4046-ab0d-8f8d74ab914a,fcb46bff-2c24-406f-81ea-b4a009ed61f7,ff4023b0-f820-4d8c-a517-c04455f9b1d0;SUPTYPE=AL;SVLEN=-213;STRANDS=+-;RE=26;REF_strand=13,10;AF=0.530612 GT:DR:DV 0/1:23:26 +chr1 2602149 87_1 N 
GCCACGTCAGGACCAGCCTCCCTCAGGTAGAAGTCAGGTTCGTCTTCTTGGAGTCAGAGGCCACTCAGCAATCTAGA . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2602230;STD_quant_start=13.3791;STD_quant_stop=12.1169;Kurtosis_quant_start=-2.12793;Kurtosis_quant_stop=-2.03683;SVTYPE=INS;RNAMES=080bd3e6-be18-4eae-b582-b2c931bce677,1499a1be-5ee2-457a-8b64-dac5f7a79a47,284431c2-f271-49c8-84b1-fc48b113b70b,34e8529a-be85-4285-873d-921539591f34,3e58d6a5-20b8-49b7-a02e-bd73c598775a,453c0bb2-2128-4ecc-9c6b-ce569db89834,48447aec-1b08-4409-86ea-8cd17bc45a23,4f69d8c2-229a-4c57-8007-e7c999b63942,6a25ad15-ad1e-4e5c-884c-845a87c6fbdc,72165ea7-31cf-4f4b-a099-145d9d7b1266,7bbc658c-4c6e-4dc6-a2a1-fbb0b436919d,7ecd2a68-f165-41f2-af24-f64ead2e5b71,7fb8622d-2c5f-492f-85bd-6f74f57119f1,82472465-ed07-4feb-989a-bb3cdea3150c,91c549fd-0458-499c-ad46-9416fb3a1a10,91dc1e9d-faf0-4bac-8f7e-86b955cd99f5,93e9e469-b3ca-43b3-bc0f-212e0f33afd3,995b8daf-cc06-4bd6-82c4-693e270ab7ea,e0e7fa04-7c3a-4748-b129-dac09bb9619f,f0783eb8-fd5d-42e3-9d2d-e6a2d8c7254f,fc132c78-0c57-4046-ab0d-8f8d74ab914a,fe0d04d3-3b08-4618-8579-c51c98a1ccaa;SUPTYPE=AL;SVLEN=80;STRANDS=+-;RE=22;REF_strand=6,11;AF=0.564103 GT:DR:DV 0/1:17:22 +chr1 2652083 89_0 N CCCCAGGTGAGCATCCGACAGCCTGGAGCAGCACCCCACGCCCAGGTGAGCAATGACAGCCTGGAACAGCACCCACCCCCAGGAGAGCATCTGACAGCCTGGAGCAGAACCCACCCACAGGCGAGCATCTGACAGCCTGGGTCGGCAC . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2652221;STD_quant_start=21.3049;STD_quant_stop=12.182;Kurtosis_quant_start=-1.88012;Kurtosis_quant_stop=-1.16379;SVTYPE=INS;RNAMES=02ba50c5-26f8-45f3-8fe7-9a7f61fac7f6,079fd425-df3d-468a-b552-2e579dc0522b,1914c345-2c26-4520-ac2c-99d33848176d,1f650b27-ee75-4812-bd48-f4c326820dde,26678881-769b-4da2-ab6b-58551722824b,2fc0e095-e459-4e17-9a9f-18a199c50908,30044b95-f04e-4a4b-b10e-f58df1806af9,3a1870b8-919f-41da-9acc-bc039af2a1ba,4c13c934-2614-4c44-bfbd-9a7289e40062,516909d0-ab9f-48da-9ec4-418f4c436131,5bf5cef3-7d7d-4139-8e47-9bf31dd216af,5d9b20fd-1f6f-41df-8350-a29246e536dd,6e2b1fc4-bd27-482b-9f5b-9777177143a6,70f92b03-38dd-47c1-a7e8-9fc4d81fa318,7448038e-ce47-4302-8073-a80274bfa16a,986509c9-3855-4ee3-a667-9bc970c66867,9a19d03f-fb7e-4770-ad1f-e64cbba6aefe,9a1fea37-5dec-4c49-a7c1-f808c97041a0,a56c5898-8f78-4731-a928-1c882472b4ec,b52e0fd0-0627-4176-aaa2-64fb7935a108,edcb1ac9-c75c-4c02-b9a2-411765f99e37;SUPTYPE=AL;SVLEN=37;STRANDS=+-;RE=21;REF_strand=17,19;AF=0.368421 GT:DR:DV 0/1:36:21 +chr1 2653604 91_0 TACACCCACAAGTGAGCATCTGACAGCCTGGAGCAGCATCCACACCCCCAGGCGAGCATCTGACAGCCTGGAACAGCACCCA N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2653716;STD_quant_start=53.9565;STD_quant_stop=32.0765;Kurtosis_quant_start=-1.7751;Kurtosis_quant_stop=-0.966136;SVTYPE=DEL;RNAMES=0158ae9b-1cea-44ac-a523-2ffbdd24d482,05eb3583-4eea-45e9-bfcd-772055aaedeb,126d5e77-444b-41e6-af80-cc9f4aa8c66a,24b0a6b5-ee2c-466c-bdb4-4f69e8cfb6eb,2c750e0a-d09c-4052-9e22-b7846b9901df,54548ce1-5153-4ab5-9132-09917e430a74,6dccd774-2617-4c36-927c-19552cbe2840,6e29f3fb-6791-4396-84de-60f79bc784f7,77574f58-be4a-45e0-ae50-65b8e2f3e1fa,8a1925ec-84a4-493d-aee2-ffa8f53e1dd9,95be938e-9621-403e-b941-41fcc109e84a,984a1b40-0e0e-47b1-8859-0d807d1c2cf3,abde884c-3320-4e31-842d-e148f7459810,b4c9e4bf-5762-4ccd-80fa-fb08a6a0010e,d92d3960-4f96-4a2b-b1cc-ba4658976990;SUPTYPE=AL;SVLEN=-112;STRANDS=+-;RE=15;REF_strand=12,21;AF=0.3125 GT:DR:DV 0/1:33:15 +chr1 2653644 89_1 N CCCCAGGTGAGCATCTGGCCAGCCTGAACAACACTCCTG . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2653708;STD_quant_start=6.84349;STD_quant_stop=14.8661;Kurtosis_quant_start=-0.608737;Kurtosis_quant_stop=-1.58209;SVTYPE=INS;RNAMES=00db15b2-57f3-4b04-99ec-a767cb355dd6,02ba50c5-26f8-45f3-8fe7-9a7f61fac7f6,067adc23-3733-463b-bf30-3506ccb80286,0b952212-2a8f-4c3f-9242-feae635ad72f,0c9a0a71-e6f8-4fca-8488-c1815cb723ad,0f08527b-7745-4b44-adc4-838aa079ac29,4a582a55-20bd-4a02-b343-9ba8a8186c96,4ec9953b-3be5-493b-9e01-3ddb75ecddc7,51ec221c-3b15-42f2-b08f-c4b701f3b538,56e0025b-6e20-4c5d-9fef-17ed8bd0e3d5,5c75377d-1fb1-48be-ab51-ec39995d0b92,5d9b20fd-1f6f-41df-8350-a29246e536dd,70423a2f-e8b0-42f7-9ab4-c0e1beb13a3e,7175d4aa-1063-4242-af5f-5789c7f7de4d,76885d77-2eda-4866-9c3a-011e2198d357,7e647479-6d90-4de4-bc04-8f6df2119d0e,932bb34e-fc09-48b8-8c7a-85de34b5071b,94087aa5-facd-4548-81e4-ce9eb1a404b8,9dbf2ebd-4b68-40e0-aea3-73db2c89d7ea,9dd105c2-6f3b-4383-8a6f-30ba7bec1207,a8888d71-bd7f-422c-8c01-7b9d2469aada,abde884c-3320-4e31-842d-e148f7459810,c4bfb544-5450-4e71-979a-f8ed12864a0f,c86e0a6c-26f3-41de-b6ed-d29c3f370e1a,d8bc8df3-fbcb-4efe-a9e7-b310aa7113ef
;SUPTYPE=AL;SVLEN=81;STRANDS=+-;RE=24;REF_strand=13,21;AF=0.413793 GT:DR:DV 0/1:34:24 +chr1 2655003 91_1 CTCTCACAACCCCAGGTGAGCATCTGACAGCCCGGAACA N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2655057;STD_quant_start=27.4481;STD_quant_stop=21.7117;Kurtosis_quant_start=4.01557;Kurtosis_quant_stop=4.40304;SVTYPE=DEL;RNAMES=0f08527b-7745-4b44-adc4-838aa079ac29,43110448-5a2a-4374-affb-b5131e88b80f,54548ce1-5153-4ab5-9132-09917e430a74,6e29f3fb-6791-4396-84de-60f79bc784f7,6f07594a-7079-46ad-816c-932b2ecb0f98,85efba47-b448-4ed2-b080-afc1cab0d598,a8888d71-bd7f-422c-8c01-7b9d2469aada,abde884c-3320-4e31-842d-e148f7459810,c9f1ed31-fd3e-4516-a789-374b480d1624,dbe924d2-b7be-4baa-a70c-f38f48bd2d6d,e7b6b62b-35e7-4e51-9d37-a930f65ca207;SUPTYPE=AL;SVLEN=-54;STRANDS=+-;RE=11;REF_strand=12,14;AF=0.297297 GT:DR:DV 0/0:26:11 +chr1 2655093 89_2 CCCCAGGTGAGCATCCAACAGCCTGGAACAGCACCGACACCCCCAGGTGAGCATCCGACAGCCTGGAGCAGCACCCACACCCCCAGGTGAGCATCTGATATCCTGGAACAGCACCCACACCCCCAGGTGAGCATCTGACAGGCTGGAGCAGCACGCACACCCCCAGTGAGCATCTGACAGCCTGCAACAGCTCTCACAACCCCAGGTGAGCATCTGACAGCCCGGAACAGCACGCTGCACCCCCAAGTGAGCACCTGACAGCCTGGAGCAGCAACCACA N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2655359;STD_quant_start=101.621;STD_quant_stop=124.593;Kurtosis_quant_start=-0.743602;Kurtosis_quant_stop=0.797034;SVTYPE=DEL;RNAMES=00db15b2-57f3-4b04-99ec-a767cb355dd6,02ba50c5-26f8-45f3-8fe7-9a7f61fac7f6,0b952212-2a8f-4c3f-9242-feae635ad72f,126d5e77-444b-41e6-af80-cc9f4aa8c66a,5d9b20fd-1f6f-41df-8350-a29246e536dd,70423a2f-e8b0-42f7-9ab4-c0e1beb13a3e,8a1925ec-84a4-493d-aee2-ffa8f53e1dd9,c86e0a6c-26f3-41de-b6ed-d29c3f370e1a;SUPTYPE=AL;SVLEN=-266;STRANDS=+-;RE=8;REF_strand=3,2;AF=0.615385 GT:DR:DV 0/1:5:8 +chr1 2655316 91_1 ACAGCACCCACACCCCCAGGTGAGCATCTGACAGGCTGGAGCAGCACGCACACCCCCAGTGAGCATCTGACAGCCTGCAACAGCTCTCACAACCCCAGGTGAGCATCTGACAGCCCGGAACAGCACGCTGCACCCCCAAGTGAGCACCTGACAGCCTGGAGCAGCAACCACACCCCCAGGTGAGCATCCAACAGCCTGGAACAGCACCGACACCCCCAGGTGAGCATCCGACAGCCTGGAGCAGCACCCACACCCCCAGGTGAGCATCTGATATCCTG N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2655505;STD_quant_start=107.58;STD_quant_stop=66.298;Kurtosis_quant_start=-2.19968;Kurtosis_quant_stop=-2.1278;SVTYPE=DEL;RNAMES=00db15b2-57f3-4b04-99ec-a767cb355dd6,0158ae9b-1cea-44ac-a523-2ffbdd24d482,03e5cf49-71c2-4755-a458-465a48094234,0d2d5305-6e5f-4727-b1bc-07bef37ffe11,32cfc35d-2be4-4f41-9d31-21d2e0267fe4,3477c04c-40eb-4421-843a-d4322a4ec6d9,3ca46174-1e88-4b14-9c48-4402e09bfb27,4a582a55-20bd-4a02-b343-9ba8a8186c96,4ec9953b-3be5-493b-9e01-3ddb75ecddc7,51ec221c-3b15-42f2-b08f-c4b701f3b538,58092463-80c2-408c-aa63-6b00486f44bc,7175d4aa-1063-4242-af5f-5789c7f7de4d,74573bdf-6135-449c-a83a-52763f7a8385,77574f58-be4a-45e0-ae50-65b8e2f3e1fa,7bfdc99d-734d-460a-bdd1-204a51c1f608,7e647479-6d90-4de4-bc04-8f6df2119d0e,85efba47-b448-4ed2-b080-afc1cab0d598,9772b2f4-57f6-4aaa-981d-2f3a9e50d6c9,9dbf2ebd-4b68-40e0-aea3-73db2c89d7ea,abde884c-3320-4e31-842d-e148f7459810,ae486c6b-8ef6-4185-b8e9-e7f26a83cf26,b122e05c-3996-413d-a12f-d4547c943fe5,c9f1ed31-fd3e-4516-a789-374b480d1624,dbe924d2-b7be-4baa-a70c-f38f48bd2d6d,e4a66895-140f-476b-a91e-d94d0ba95342,e7b6b62b-35e7-4e51-9d37-a930f65ca207,ede912f4-5167-4638-902c-67a102d17350,ff58ec71-8a66-4246-9f93-79a1d9af860b;SUPTYPE=AL;SVLEN=-189;STRANDS=+-;RE=28;REF_strand=3,1;AF=0.875 GT:DR:DV 1/1:4:28 +chr1 2657218 89_1 TGATGGTCTGGAGCAGCACCCACAACCACAGGTGAACATCAGAGAGTCTGGAGCAGCGCCCACAACCCCAGGCGAGCATCTGACAGCCTGGAGCCGTGCCCAAACACCCAGGTGAGCATCTGACAGCATGGAGCAGCACCCATAGCCCAAGGTGAGCATCTGACAACTTGGAGCAGCACCCACACCCCGAGGTGAGCATCTGACCTCCCGGAGCAGTACCAGTACCCCCAGGCGAGCATCTGAACTCATGGAGCAGCACCCACGCCCCCAGGCGAGCATCTGACCGAACGGAGCAGCACCCACAACCCCAGGCGAGCATCTGACAGCATGAAACAGCACCCAGAACTCCAGGTGAGCATCTGACAGCCCGCAGTAGCACCCACAAGCACAAGTGAGAAT N . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2657716;STD_quant_start=100.28;STD_quant_stop=99.8248;Kurtosis_quant_start=-1.44056;Kurtosis_quant_stop=-1.42879;SVTYPE=DEL;RNAMES=0158ae9b-1cea-44ac-a523-2ffbdd24d482,0b952212-2a8f-4c3f-9242-feae635ad72f,126d5e77-444b-41e6-af80-cc9f4aa8c66a,1f650b27-ee75-4812-bd48-f4c326820dde,26678881-769b-4da2-ab6b-58551722824b,2fc0e095-e459-4e17-9a9f-18a199c50908,45751196-7376-4e6d-8073-de7dbd962408,516909d0-ab9f-48da-9ec4-418f4c436131,51ec221c-3b15-42f2-b08f-c4b701f3b538,5bf5cef3-7d7d-4139-8e47-9bf31dd216af,6e2b1fc4-bd27-482b-9f5b-9777177143a6,70423a2f-e8b0-42f7-9ab4-c0e1beb13a3e,8a1925ec-84a4-493d-aee2-ffa8f53e1dd9,91629993-2017-44a6-86a4-238d1fd17249,9a19d03f-fb7e-4770-ad1f-e64cbba6aefe,c86e0a6c-26f3-41de-b6ed-d29c3f370e1a,edcb1ac9-c75c-4c02-b9a2-411765f99e37;SUPTYPE=AL,SR;SVLEN=-498;STRANDS=+-;RE=17;REF_strand=13,12;AF=0.404762 GT:DR:DV 0/1:25:17 +chr1 2657257 89_1 CCCTGCACACCCAGGTGAGCATCCGACAGCCTGGAGCAGCACCCACACCCCCAGTTGAGCATCTGATGGTCTGGAGCAGCACCCACAACCACAGGTGAACATCAGAGAGTCTGGAGCAGCGCCCACAACCCCAGGCGAGCATCTGACAGCCTGGAGCCGTGCCCAAACACCCAGGTGAGCATCTGACAGCATGGAGCAGCACCCATAGCCCAAGGTGAGCATCTGACAACTTGGAGCAGCACCCACACCCCGAGGTGAGCATCTGACCTCCCGGAGCAGTACCAGTACCCCCAGGCGAGCATCTGAACTCATGGAGCAGCACCCACGCCCCCAGGCGAGCATCTGACCGAACGGAGCAGC N . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=2657477;STD_quant_start=107.679;STD_quant_stop=106.262;Kurtosis_quant_start=-1.52544;Kurtosis_quant_stop=0.301427;SVTYPE=DEL;RNAMES=112c6a5f-ad13-406d-90b5-5b98716845e5,1914c345-2c26-4520-ac2c-99d33848176d,5bf5cef3-7d7d-4139-8e47-9bf31dd216af,5d9b20fd-1f6f-41df-8350-a29246e536dd,7cadd1ca-76fd-4e10-9536-dcc4a660b1a2,91629993-2017-44a6-86a4-238d1fd17249;SUPTYPE=AL;SVLEN=-220;STRANDS=+-;RE=6;REF_strand=2,6;AF=0.428571 GT:DR:DV 0/1:8:6 +chr1 125029102 1150_3 N . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=125029168;STD_quant_start=0;STD_quant_stop=0;Kurtosis_quant_start=1.95204;Kurtosis_quant_stop=4.40106;SVTYPE=DEL/INV;RNAMES=0615a1e2-43d8-4ee1-aad8-b3bc30654032,07df46cf-fab9-4b7a-9c61-63d6fb79063d,0c62bf53-a1fa-4cd8-9f65-9b6d896437f6,0c775ec9-8cd4-46fa-b432-d8fadbe12e8c,13f6e3a3-7e12-4fe1-96e8-6fb510c5e51e,25140c43-b936-4ec7-88ac-5f35ee57eb89,425d50b0-d4c0-4128-befb-3cb9a20d0395,42ec5104-09bf-4324-9579-9099acbf7650,61e7dbc6-c78a-4047-aaee-6660f758bf93,711bb46e-6427-4960-9abf-a11838e69701,76aadab8-9dfe-46bc-a906-eb70126c5841,77b0f627-1ffc-43ff-8f07-a3839b73e859,78e435d7-34fa-4d52-9f3f-189868c74142,7bff020f-5745-4030-a649-c2ca270932f4,801d64f6-1205-4a00-a1c0-eeff98c29be7,928578aa-6777-4ba7-a150-7b2eaa900249,993a19f7-ab8d-4636-aa9f-56566d3ab328,9dc57ac2-909f-467e-a15f-26041dee67d0,a5fdd9d5-ed51-4036-88e5-6009ce233bc6;SUPTYPE=NR;SVLEN=-66;STRANDS=+-;RE=19;REF_strand=0,5;AF=0.791667 GT:DR:DV 0/1:5:19 +chr10 125502113 16341_1 N . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr10;END=125508658;STD_quant_start=0;STD_quant_stop=3.1305;Kurtosis_quant_start=nan;Kurtosis_quant_stop=2;SVTYPE=INVDUP;RNAMES=11f6d8be-ef3b-44bd-bc04-b7a4c6619129,235ef779-5cc1-4999-91cd-25c6bbcfbb08,3058d7cf-0c2c-493e-8bd7-97adb6e8c721,40788bbd-835b-4706-8abe-a76b9672804f,4ed139db-33bf-4ed2-b709-840d06a92d5f,929a407e-1103-4f89-81fa-20b902c08c6e,a8b62faf-9df4-497e-a1df-f4d6af7a92e0,c34b12b2-7aee-4c9a-89a2-81d8f9d34a8b;SUPTYPE=SR;SVLEN=6545;STRANDS=++;RE=5;REF_strand=6,4;AF=0.333333 GT:DR:DV 0/1:10:5 +chr11 3653753 16746_1 N CAACCCTACCTCTGTCTCTCCTCACCGCTATTCTCCCATCAGACATCAGTCTTGTAAATTCCAATTCCTACCTCTGTCTATCCTCACTGCCATTCTCCCACTGACATCAAGTCTTGTAAATCTCCACCTCCTACCTCTGTCTATCCTCACTGCCATTCTCCCATCAGACATCAAGTTCTGTAAATTCCACCTCCTACCTCTCTGTCTATCCTCACTGCCATTCTCCCATCTGGACATCAAGTCTTGTAAATT . 
PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr11;END=3654825;STD_quant_start=139.219;STD_quant_stop=2.84445;Kurtosis_quant_start=-1.96058;Kurtosis_quant_stop=-0.064951;SVTYPE=DUP/INS;RNAMES=00b67655-ed56-4780-b3fe-be29d59e1859,028e509d-93df-4dc8-82e9-a09514d7bc3d,02cce278-7597-4217-8253-5f28a462e699,06eb2443-613b-404c-950a-d4af531205bd,0fcbfcc2-2a80-4da8-b592-d7862d4e5f32,0ffca8ba-4f7f-4475-b7f1-b57ec5c83ed8,130292a2-4dff-4c25-b571-b1ed9fb82f6e,14038c62-2b05-446f-8d58-b34a3f784d23,16732163-f7e5-4bc2-bfc8-a38f908e5ba8,190c6094-4e7b-4f20-91a1-30e6b0d189db,19b242ce-893d-4b98-ac2d-70cf8e216d61,206d22d5-959a-4d55-9d85-eca31de42f0b,21cc7629-0ee7-44b6-9e17-664918ab0ac2,26fd2c01-04d2-40f7-b350-aeea96752a72,2c67fc96-be15-4e19-bb73-662a104fdd1a,329783a5-e55a-4276-8c13-1f8bdbff7700,342c2503-a98b-4a20-9dd9-8190bdd743fa,34c23995-bcaa-4dcd-aefa-3c96d4032492,4f12c658-1e88-44a9-9689-18bc76d12047,5da809c9-cf2c-4562-a703-3d1b12927220,6145c5c6-c4ed-4b30-987f-e653337a0a18,744b6c64-1a96-4dcb-9216-8be6bdcfe7f3,84e6ec27-5a6b-463d-8681-045651b2af07,8eedae6b-ec01-4367-bd47-2081f9df8f33,8f6ff282-26eb-4eb5-89bd-df9010689ba6,9c8fa8a6-da33-4599-b835-24d0220c6139,a2494f89-4dba-472a-9b20-c61d0a0512af,ad3b03b1-1cf7-4a54-a6df-eb7563ddbbea,b2b77f8b-0659-4996-8280-078e8b9463bb,b7819371-05b3-4eac-a229-54a49a852133,bafd9ab5-3cc7-4c21-b48f-186d1a8e5351,bff30357-4e2d-46a7-927e-707223885e25,c2d8bcd3-a488-4709-8d33-f9c000c54d51,c4e7702f-8831-4236-b6a8-6723a3d668f8,ca4a99ee-181d-488a-9eea-e0ef7e9b765e,ca6d0c9c-bc64-4888-b660-18ca49b597b9,ccf84af9-06f8-4bb1-b844-e4512907b8bd,d00d6c06-b2e2-407d-b294-d585efdb53ad,da8571f5-d34e-4e04-a7a8-b2696a4661e0,e1fd56ef-d7ee-4703-8fad-a90383cb4156,ebf37b99-cfbc-4168-a497-a8453d0e698a,f8008b3f-d0e6-474b-82c1-bb28a53b9e01,f8c3fa80-204b-4d1f-b782-358c648e48bd,fa2f213f-f63d-4828-a601-bfdfae84b8e9;SUPTYPE=AL,SR;SVLEN=61;STRANDS=+-;RE=22;REF_strand=1,0;AF=0.956522 GT:DR:DV 1/1:1:22 From 302d8544ded6d21ea121961c48693699409fb88d Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: 
Thu, 20 Jan 2022 14:49:16 -0800 Subject: [PATCH 074/137] reformat file --- src/mavis/tools/vcf.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index ae1410e5..c25fe27d 100644 --- a/src/mavis/tools/vcf.py +++ b/src/mavis/tools/vcf.py @@ -181,9 +181,7 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: std_row[COLUMNS.event_type] = SVTYPE.INS elif size < 0: std_row[COLUMNS.event_type] = SVTYPE.DEL - std_row.update( - {COLUMNS.break1_chromosome: record.chrom, COLUMNS.break2_chromosome: chr2} - ) + std_row.update({COLUMNS.break1_chromosome: record.chrom, COLUMNS.break2_chromosome: chr2}) if info.get( "PRECISE", False ): # DELLY CI only apply when split reads were not used to refine the breakpoint which is then flagged @@ -201,11 +199,8 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: COLUMNS.break1_position_start: max( 1, record.pos + info.get("CIPOS", (0, 0))[0] ), - COLUMNS.break1_position_end: record.pos - + info.get("CIPOS", (0, 0))[1], - COLUMNS.break2_position_start: max( - 1, end + info.get("CIEND", (0, 0))[0] - ), + COLUMNS.break1_position_end: record.pos + info.get("CIPOS", (0, 0))[1], + COLUMNS.break2_position_start: max(1, end + info.get("CIEND", (0, 0))[0]), COLUMNS.break2_position_end: end + info.get("CIEND", (0, 0))[1], } ) From 4b42f693cd282544a458c373b6e16587082efc56 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 20 Jan 2022 16:44:49 -0800 Subject: [PATCH 075/137] Remove leftover unused perl file --- src/tools/TSV.pm | 202 ----------------------------------------------- 1 file changed, 202 deletions(-) delete mode 100644 src/tools/TSV.pm diff --git a/src/tools/TSV.pm b/src/tools/TSV.pm deleted file mode 100644 index ddc48bb8..00000000 --- a/src/tools/TSV.pm +++ /dev/null @@ -1,202 +0,0 @@ -package TSV; - -#** @file -# Main file for processing tsv formatted files -# given some list of required column names, goes 
through the rows and builds a -# hash for each row by column names -# note the hash will only store information for column names that we pass in -# via the required columns list -#* - -use strict; -use warnings; -use POSIX qw(strftime); - -my $_warnings_off = 0; - -sub import -{ - my $class = shift; - my $_warnings_off = shift if $#_ >= 0; -} - -sub _build_header_hash -{ - #** @function private _build_header_hash($required_column_names) - # @param $required_column_names [required] the list of expected column names - # @return a reference to a hash of the required column names - #* - my $required_column_names = shift; - my $header_index_hash = {}; - for my $col (@$required_column_names) - { - $header_index_hash->{$col} = -1; #default - } - return $header_index_hash; -} - -sub generate_header_comments -{ - #** @function header_comments($inputfile, $outputfile) - # @param $program the program used to generate the results - # @param $version the version of the above program - # @param %args a hash representing the parameters that the above program was run with - # @return the string that will be put at the top of the output file - #* - my $program = shift; - my $version = shift; - my %args = @_; - my $time = strftime("%Y-%m-%d %H:%M:%S", localtime); - my @result = ( - "## Generated by $program version $version at $time", - "## Running Parameters: ", - ); - while((my $option, my $setting) = each %args) - { - push(@result, sprintf( - "##\t%s\t%s", $option, - defined $setting? $setting : 'undef' - ) - ); - } - push(@result, "##"); - return join("\n", @result) . 
"\n"; -} - -sub _parse_input_line -{ - #** @function private parse_input_line($header_index, $line) - # builds a hash of a row from the input file using expected column identifiers - # @param $header_index the hash of column names with their index positions in a line - # @param $line the row we are parsing - # @retval {} the row hash - #* - my ($header_index, $line) = @_; - - my @fields = split("\t", $line, -1); - if(scalar @fields < scalar keys %$header_index) - { - my $err = ("[ERROR] in row $line\n" - . "[ERROR] found " - . scalar @fields - . " but expected " - . (scalar keys %{$header_index} ) - . " fields\n" - . "Error reading the input file. The number of fields" - . " in the input row is less than the number of required" - . " columns. Please check that the input file is" - . " tab-delimited and has the correct number of fields"); - die $err; - } - - my $record = {}; - while((my $column_name, my $index) = each %$header_index) - { - $record->{$column_name} = $fields[$index]; - } - return $record; -} - -sub _parse_header_line -{ - #** @function private undef parse_header_line($header_index_ref, $header) - # fills the header_index hash with the column names (keys) and their positions (values) in the line - # @warning throws an exception if it is passed a column header (in the hash) that is not found - # @param $header_index_ref reference to the hash indices by required column names - # @param $header header string from the input file - # @return none - #* - my ($header_index_ref, $header) = @_; - - my @column_names = split("\t", $header, -1); - my $counter = 0; - my %dup_counter = (); - while(my $col = shift @column_names) # store the positions of the column names that we are looking for - { - if(exists $dup_counter{$col}) - { - die "[ERROR] duplicate column names $col in header\n"; - } - $dup_counter{$col} = undef; - $header_index_ref->{$col} = $counter; - $counter++; - } - while((my $column, my $index_position) = each %$header_index_ref) # check to ensure we 
have valid index positions for each of the required columns - { - if($index_position < 0) - { - die "[ERROR] in parsing the header of the inputfile. Did not find the required column $column"; - } - } -} - -sub parse_input -{ - #** @function public () parse_input($filename, $req_columns) - # reads a tab-delimited file - # creates an array of the rows (excluding comments and the header) - # each row is turned into a hash (by header column names) - # @param $filename the input file - # @param $req_columns an array of expected column names - # @retval () an array of the input file rows - #* - my ($filename, $req_columns) = @_; - my $header_index = _build_header_hash($req_columns); - die unless defined $header_index; - open(my $fh, "<", $filename) - or die "Could not open inputfile $filename\n"; - my $line; - - while($line = <$fh>) - { - last if(not ($line =~ m/^##/)); # skip comments, defined by double hash line - } - # the next line is the header - chomp($line); - $line =~ s/^#//; # remove the header starting hash if present - - _parse_header_line($header_index, $line); - - my @header = sort { $header_index->{$a} <=> $header_index->{$b} } keys(%$header_index); - my @catalog = (); - while($line = <$fh>) - { - chomp($line); # remove leading and trailing whitespace - next if $line eq ""; - my $record = _parse_input_line($header_index, $line); - die if !defined $record; - push(@catalog, $record); - } - return (\@header, \@catalog); -} - - -sub string_line -{ - #** @method public static $ string_line($header, $line, $delim) - # @param $header [required] (type: array ref) - # @param $line [required] (type: hash ref) - # @param $delim [optional] (type: string) - # puts a line back together in the same order as the input header given some - # delimiter - # @return (type: string) the string of the input line - #* - my $header = shift; - my $line = shift; - my $delim = shift; - $delim = ! defined $delim ? 
"\t" : $delim; - - my @new_line = (); - for my $col (@$header) - { - if (! exists $line->{$col}) - { - die "[ERROR] column '$col' not in row"; - } - push(@new_line, $line->{$col}); - } - return join($delim, @new_line); -} - - -1; # this makes the module usable from another perl script From 2793abac36237ebfe7af777f0a7341da7fe6df33 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 20 Jan 2022 16:45:15 -0800 Subject: [PATCH 076/137] Fix inputs for skip validate version of config --- Snakefile | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 113dc2fb..4c8233f9 100644 --- a/Snakefile +++ b/Snakefile @@ -170,10 +170,18 @@ if not config['skip_stage.validate']: + ' &> {log}' +def get_annotate_input_file(wildcards): + if not config['skip_stage.validate']: + return expand(rules.validate.output, library=[wildcards.library], job_id=[wildcards.job_id]) + return expand(CLUSTER_OUTPUT, library=[wildcards.library], job_id=[wildcards.job_id]) + + rule annotate: input: rules.validate.output if not config['skip_stage.validate'] else rules.cluster.output output: stamp=output_dir('{library}/annotate/batch-{job_id}/MAVIS.COMPLETE'), result=output_dir('{library}/annotate/batch-{job_id}/annotations.tab') + params: + inputfile=get_annotate_input_file log: os.path.join(LOG_DIR, '{library}.annotate.snakemake.batch-{job_id}.log.txt') container: CONTAINER resources: @@ -184,7 +192,7 @@ rule annotate: shell: 'mavis annotate --config {rules.init_config.output}' + ' --library {wildcards.library}' - + ' --inputs {input}' + + ' --inputs {params.inputfile}' + ' --output ' + output_dir('{wildcards.library}/annotate/batch-{wildcards.job_id}') + ' &> {log}' From 4a8b1d0aafbd219d7fd4b751152a81f3652d0937 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 20 Jan 2022 16:45:52 -0800 Subject: [PATCH 077/137] Only run the bam stats collection if validation is not skipped --- src/mavis/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/src/mavis/main.py b/src/mavis/main.py index abedaf82..ea2f93aa 100644 --- a/src/mavis/main.py +++ b/src/mavis/main.py @@ -265,7 +265,8 @@ def main(argv=None): ) elif command == SUBCOMMAND.SETUP: # add bam stats to the config if missing - _config.add_bamstats_to_config(config) + if not config.get('skip_stage.validate'): + _config.add_bamstats_to_config(config) _util.LOG(f'writing: {args.outputfile}') with open(args.outputfile, 'w') as fh: fh.write(json.dumps(config, sort_keys=True, indent=' ')) From 574172e71ca5b8ece99325d5c7f18780f2a0ad22 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 20 Jan 2022 16:46:09 -0800 Subject: [PATCH 078/137] Apply custom functions after adding default values --- src/mavis/util.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mavis/util.py b/src/mavis/util.py index abd55c34..446520d3 100644 --- a/src/mavis/util.py +++ b/src/mavis/util.py @@ -398,10 +398,6 @@ def soft_null_cast(value): if col not in df and col not in add_default: raise KeyError(f'missing required column: {col}') - # run the custom functions - for col, func in apply.items(): - df[col] = df[col].apply(func) - if COLUMNS.opposing_strands in df: df[COLUMNS.opposing_strands] = df[COLUMNS.opposing_strands].apply( lambda x: None if x == '?' 
else soft_cast(x, cast_type=bool) @@ -434,6 +430,10 @@ def soft_null_cast(value): else: df[col] = default_value + # run the custom functions + for col, func in apply.items(): + df[col] = df[col].apply(func) + # set overwriting defaults for col, value in overwrite.items(): df[col] = value From a9abb205c77f8580e3f274b95ed81611bb00da2f Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 20 Jan 2022 18:27:28 -0800 Subject: [PATCH 079/137] Add test for annotation-only workflow --- tests/mini-tutorial.annotate_only.config.json | 52 ++++++++++ tests/snakemake/test_mini_workflow.py | 98 ++++++++++++++++--- 2 files changed, 134 insertions(+), 16 deletions(-) create mode 100644 tests/mini-tutorial.annotate_only.config.json diff --git a/tests/mini-tutorial.annotate_only.config.json b/tests/mini-tutorial.annotate_only.config.json new file mode 100644 index 00000000..b270c7dc --- /dev/null +++ b/tests/mini-tutorial.annotate_only.config.json @@ -0,0 +1,52 @@ +{ + "annotate.draw_fusions_only": false, + "convert": { + "mock_converted": { + "inputs": [ + "tests/data/mock_sv_events.tsv" + ], + "file_type": "mavis", + "assume_no_untemplated": true + } + }, + "skip_stage.validate": true, + "cluster.uninformative_filter": true, + "cluster.limit_to_chr": null, + "cluster.min_clusters_per_file": 5, + "libraries": { + "mock-A47933": { + "assign": [ + "tests/data/mock_trans_sv_events.tsv" + ], + "bam_file": "tests/data/mock_trans_reads_for_events.sorted.bam", + "disease_status": "diseased", + "protocol": "transcriptome", + "strand_specific": true + }, + "mock-A36971": { + "assign": [ + "mock_converted" + ], + "bam_file": "tests/data/mock_reads_for_events.sorted.bam", + "disease_status": "diseased", + "protocol": "genome", + "strand_specific": false + } + }, + "output_dir": "output_dir", + "reference.annotations": [ + "tests/data/mock_annotations.json" + ], + "reference.dgv_annotation": [ + "tests/data/mock_dgv_annotation.txt" + ], + "reference.masking": [ + 
"tests/data/mock_masking.tab" + ], + "reference.reference_genome": [ + "tests/data/mock_reference_genome.fa" + ], + "reference.template_metadata": [ + "tests/data/cytoBand.txt" + ] +} diff --git a/tests/snakemake/test_mini_workflow.py b/tests/snakemake/test_mini_workflow.py index 963f339c..af332343 100644 --- a/tests/snakemake/test_mini_workflow.py +++ b/tests/snakemake/test_mini_workflow.py @@ -12,6 +12,16 @@ from ..util import glob_exists, long_running_test, package_relative_file +def list_files(startpath): + for root, dirs, files in os.walk(startpath): + level = root.replace(startpath, '').count(os.sep) + indent = ' ' * 4 * (level) + print('{}{}/'.format(indent, os.path.basename(root))) + subindent = ' ' * 4 * (level + 1) + for f in files: + print('{}{}'.format(subindent, f)) + + @pytest.fixture def blat_output_dir(): temp_output = tempfile.mkdtemp() @@ -45,6 +55,21 @@ def bwa_output_dir(): shutil.rmtree(temp_output) +@pytest.fixture +def annotate_only_output_dir(): + temp_output = tempfile.mkdtemp() + + os.makedirs(os.path.join(temp_output, 'mavis/schemas')) + + with open(package_relative_file('tests/mini-tutorial.annotate_only.config.json'), 'r') as fh: + config = json.load(fh) + config['output_dir'] = os.path.join(temp_output, 'output_dir') + with open(os.path.join(temp_output, 'mini-tutorial.config.json'), 'w') as fh: + fh.write(json.dumps(config)) + yield temp_output + shutil.rmtree(temp_output) + + @pytest.fixture def output_dir(request): return request.getfixturevalue(request.param) @@ -67,22 +92,63 @@ def test_workflow(output_dir): with patch.object(sys, 'argv', argv): try: snakemake_main() - assert glob_exists(os.path.join(output_dir, 'summary', 'MAVIS.COMPLETE')) - assert glob_exists(os.path.join(output_dir, 'pairing', 'MAVIS.COMPLETE')) - assert glob_exists(os.path.join(output_dir, 'mock-A47933', 'cluster', 'MAVIS.COMPLETE')) - assert glob_exists( - os.path.join(output_dir, 'mock-A47933', 'validate', '*', 'MAVIS.COMPLETE') - ) - assert glob_exists( - 
os.path.join(output_dir, 'mock-A47933', 'annotate', '*', 'MAVIS.COMPLETE') - ) - assert glob_exists(os.path.join(output_dir, 'mock-A36971', 'cluster', 'MAVIS.COMPLETE')) - assert glob_exists( - os.path.join(output_dir, 'mock-A36971', 'validate', '*', 'MAVIS.COMPLETE') - ) - assert glob_exists( - os.path.join(output_dir, 'mock-A36971', 'annotate', '*', 'MAVIS.COMPLETE') - ) + + except SystemExit as err: + if err.code != 0: + raise err + + list_files(output_dir) + for expected_file in [ + os.path.join('summary', 'MAVIS.COMPLETE'), + os.path.join('pairing', 'MAVIS.COMPLETE'), + os.path.join('mock-A47933', 'cluster', 'MAVIS.COMPLETE'), + os.path.join('mock-A47933', 'annotate', 'batch-*', 'MAVIS.COMPLETE'), + os.path.join('mock-A36971', 'cluster', 'MAVIS.COMPLETE'), + os.path.join('mock-A36971', 'annotate', 'batch-*', 'MAVIS.COMPLETE'), + os.path.join('mock-A47933', 'validate', 'batch-*', 'MAVIS.COMPLETE'), + os.path.join('mock-A36971', 'validate', 'batch-*', 'MAVIS.COMPLETE'), + ]: + if not glob_exists(os.path.join(output_dir, 'output_dir', expected_file)): + raise AssertionError(f'{expected_file} does not exist') + + +@long_running_test +@pytest.mark.parametrize('output_dir', ['annotate_only_output_dir'], indirect=True) +def test_no_validate_worflow(output_dir): + argv = [ + 'snakemake', + '-s', + package_relative_file('Snakefile'), + '-j', + '1', + '--configfile', + os.path.join(output_dir, 'mini-tutorial.config.json'), + '-d', + package_relative_file(), + ] + with patch.object(sys, 'argv', argv): + try: + snakemake_main() + except SystemExit as err: if err.code != 0: raise err + + list_files(output_dir) + for expected_file in [ + os.path.join('summary', 'MAVIS.COMPLETE'), + os.path.join('pairing', 'MAVIS.COMPLETE'), + os.path.join('mock-A47933', 'cluster', 'MAVIS.COMPLETE'), + os.path.join('mock-A47933', 'annotate', 'batch-*', 'MAVIS.COMPLETE'), + os.path.join('mock-A36971', 'cluster', 'MAVIS.COMPLETE'), + os.path.join('mock-A36971', 'annotate', 'batch-*', 
'MAVIS.COMPLETE'), + ]: + if not glob_exists(os.path.join(output_dir, 'output_dir', expected_file)): + raise AssertionError(f'{expected_file} does not exist') + + for unexpected_file in [ + os.path.join('mock-A47933', 'validate', 'batch-*', 'MAVIS.COMPLETE'), + os.path.join('mock-A36971', 'validate', 'batch-*', 'MAVIS.COMPLETE'), + ]: + if glob_exists(os.path.join(output_dir, 'output_dir', unexpected_file)): + raise AssertionError(f'{unexpected_file} exists') From 837a28d79750dd22c3b39155635a49ebdedb9af8 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 20 Jan 2022 18:30:02 -0800 Subject: [PATCH 080/137] Remove leftover debugging code --- tests/snakemake/test_mini_workflow.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/snakemake/test_mini_workflow.py b/tests/snakemake/test_mini_workflow.py index af332343..2b559234 100644 --- a/tests/snakemake/test_mini_workflow.py +++ b/tests/snakemake/test_mini_workflow.py @@ -12,16 +12,6 @@ from ..util import glob_exists, long_running_test, package_relative_file -def list_files(startpath): - for root, dirs, files in os.walk(startpath): - level = root.replace(startpath, '').count(os.sep) - indent = ' ' * 4 * (level) - print('{}{}/'.format(indent, os.path.basename(root))) - subindent = ' ' * 4 * (level + 1) - for f in files: - print('{}{}'.format(subindent, f)) - - @pytest.fixture def blat_output_dir(): temp_output = tempfile.mkdtemp() @@ -97,7 +87,6 @@ def test_workflow(output_dir): if err.code != 0: raise err - list_files(output_dir) for expected_file in [ os.path.join('summary', 'MAVIS.COMPLETE'), os.path.join('pairing', 'MAVIS.COMPLETE'), @@ -134,7 +123,6 @@ def test_no_validate_worflow(output_dir): if err.code != 0: raise err - list_files(output_dir) for expected_file in [ os.path.join('summary', 'MAVIS.COMPLETE'), os.path.join('pairing', 'MAVIS.COMPLETE'), From 3d8602376416535c16c60e7b35651312558edbaf Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 20 Jan 2022 21:09:59 -0800 Subject: 
[PATCH 081/137] Add annotation only example to docs --- docs/tutorials/annotation.md | 98 ++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 docs/tutorials/annotation.md diff --git a/docs/tutorials/annotation.md b/docs/tutorials/annotation.md new file mode 100644 index 00000000..adfaf17d --- /dev/null +++ b/docs/tutorials/annotation.md @@ -0,0 +1,98 @@ +# Annotation Only + +Sometimes you have a set of variants and would simply like to run the annotate step of MAVIS to visualize and annotate them. + +First you need to create your basic config to tell MAVIS where the reference files you want to use are and some minimal information about the library/sample you want to process. + +Here is an example config where the user has created a minimal input file in the MAVIS standard input file format. We convert it to expand any unknowns (e.g. SV type if left blank). + +```json +{ + "libraries": { + "my_library": { + "assign": ["my_converted_file"], + "disease_status": "normal", + "protocol": "genome" + } + }, + "convert": { + "my_converted_file": { + "inputs": ["/path/to/file/structural_variants.txt"], + "file_type": "mavis" + } + }, + "cluster.split_only": true, + "skip_stage.validate": true, + "output_dir": "my_output_dir", + "reference.annotations": ["/path/to/mavis/reference_files/ensembl79_hg38_annotations.json"], + "reference.template_metadata": ["/path/to/mavis/reference_files/hg38_cytoBand.txt"], + "reference.reference_genome": ["/path/to/hg38_no_alt/genome/hg38_no_alt.fa"], + "reference.masking": ["/path/to/mavis/reference_files/masking_hg38.adjusted.tab"], + "reference.dgv_annotation": ["/path/to/mavis/reference_files/dgv_hg38_annotations.tab"] +} +``` + +Another example is given in the MAVIS tests folder under `tests/mini-tutorial.annotate_only.config.json` which looks like this: + +```json +{ + "annotate.draw_fusions_only": false, + "convert": { + "mock_converted": { + "inputs": [ + "tests/data/mock_sv_events.tsv" + ], + "file_type":
"mavis", + "assume_no_untemplated": true + } + }, + "skip_stage.validate": true, + "cluster.uninformative_filter": true, + "cluster.limit_to_chr": null, + "cluster.min_clusters_per_file": 5, + "libraries": { + "mock-A47933": { + "assign": [ + "tests/data/mock_trans_sv_events.tsv" + ], + "bam_file": "tests/data/mock_trans_reads_for_events.sorted.bam", + "disease_status": "diseased", + "protocol": "transcriptome", + "strand_specific": true + }, + "mock-A36971": { + "assign": [ + "mock_converted" + ], + "bam_file": "tests/data/mock_reads_for_events.sorted.bam", + "disease_status": "diseased", + "protocol": "genome", + "strand_specific": false + } + }, + "output_dir": "output_dir", + "reference.annotations": [ + "tests/data/mock_annotations.json" + ], + "reference.dgv_annotation": [ + "tests/data/mock_dgv_annotation.txt" + ], + "reference.masking": [ + "tests/data/mock_masking.tab" + ], + "reference.reference_genome": [ + "tests/data/mock_reference_genome.fa" + ], + "reference.template_metadata": [ + "tests/data/cytoBand.txt" + ] +} +``` + +Either of these configurations can be run with the following command simply by changing the configfile argument + +```bash +snakemake -j 1 \ + --configfile tests/mini-tutorial.annotate_only.config.json \ + -s Snakefile +``` From aa0a861f646a80e15105f645abed4523acd64d32 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 20 Jan 2022 22:28:21 -0800 Subject: [PATCH 082/137] Use mavis_config instead of local schemas --- setup.cfg | 1 + src/mavis/annotate/main.py | 5 +- src/mavis/illustrate/constants.py | 2 +- src/mavis/pairing/constants.py | 3 +- src/mavis/schemas/__init__.py | 35 - src/mavis/schemas/config.json | 795 -------------------- src/mavis/schemas/overlay.json | 142 ---- src/mavis/validate/base.py | 2 +- src/mavis/validate/evidence.py | 2 - src/mavis/validate/main.py | 1 - tests/integration/test_align.py | 3 +- tests/integration/test_assemble.py | 2 +- tests/integration/test_validate.py | 2 +- 
tests/integration/test_validate_evidence.py | 2 +- 14 files changed, 12 insertions(+), 985 deletions(-) delete mode 100644 src/mavis/schemas/__init__.py delete mode 100644 src/mavis/schemas/config.json delete mode 100644 src/mavis/schemas/overlay.json diff --git a/setup.cfg b/setup.cfg index a34fc7ef..b75ac619 100644 --- a/setup.cfg +++ b/setup.cfg @@ -48,6 +48,7 @@ install_requires = shortuuid>=0.5.0 svgwrite typing_extensions>=4 + pyfaidx>=0.6.3.1 setup_requires = pip>=9.0.0 setuptools>=36.0.0 diff --git a/src/mavis/annotate/main.py b/src/mavis/annotate/main.py index f1a9456e..e1df8e68 100644 --- a/src/mavis/annotate/main.py +++ b/src/mavis/annotate/main.py @@ -4,11 +4,12 @@ import time from typing import Dict, List -from ..constants import COLUMNS, PRIME, PROTOCOL, sort_columns +from mavis_config import get_by_prefix + +from ..constants import COLUMNS, PRIME, sort_columns from ..error import DrawingFitError, NotSpecifiedError from ..illustrate.constants import DiagramSettings from ..illustrate.diagram import draw_sv_summary_diagram -from ..schemas import DEFAULTS, get_by_prefix from ..util import LOG, generate_complete_stamp, mkdirp, read_inputs from .constants import PASS_FILENAME from .file_io import ReferenceFile diff --git a/src/mavis/illustrate/constants.py b/src/mavis/illustrate/constants.py index 670e5364..7dcf5e09 100644 --- a/src/mavis/illustrate/constants.py +++ b/src/mavis/illustrate/constants.py @@ -1,7 +1,7 @@ from colour import Color +from mavis_config import DEFAULTS, get_by_prefix from ..constants import GIEMSA_STAIN -from ..schemas import DEFAULTS, get_by_prefix class DiagramSettings: diff --git a/src/mavis/pairing/constants.py b/src/mavis/pairing/constants.py index 69077f4d..cb55e123 100644 --- a/src/mavis/pairing/constants.py +++ b/src/mavis/pairing/constants.py @@ -1,7 +1,8 @@ from typing import Dict +from mavis_config import DEFAULTS + from ..constants import CALL_METHOD, MavisNamespace -from ..schemas import DEFAULTS PAIRING_DISTANCES:
Dict[str, int] = { CALL_METHOD.FLANK: DEFAULTS['pairing.flanking_call_distance'], diff --git a/src/mavis/schemas/__init__.py b/src/mavis/schemas/__init__.py deleted file mode 100644 index f41bda5c..00000000 --- a/src/mavis/schemas/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -try: - from collections import Mapping -except ImportError: - from collections.abc import Mapping - -import os - -from snakemake.utils import validate as snakemake_validate - - -class ImmutableDict(Mapping): - def __init__(self, data): - self._data = data - - def __getitem__(self, key): - return self._data[key] - - def __len__(self): - return len(self._data) - - def __iter__(self): - return iter(self._data) - - -def get_by_prefix(config, prefix): - return {k.replace(prefix, ''): v for k, v in config.items() if k.startswith(prefix)} - - -DEFAULTS = {} -snakemake_validate( - DEFAULTS, - os.path.join(os.path.dirname(__file__), 'config.json'), - set_default=True, -) -DEFAULTS = ImmutableDict(DEFAULTS) diff --git a/src/mavis/schemas/config.json b/src/mavis/schemas/config.json deleted file mode 100644 index a948cd77..00000000 --- a/src/mavis/schemas/config.json +++ /dev/null @@ -1,795 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": false, - "properties": { - "annotate.annotation_filters": { - "default": [ - "choose_more_annotated", - "choose_transcripts_by_priority" - ], - "description": "A comma separated list of filters to apply to putative annotations", - "items": { - "enum": [ - "choose_more_annotated", - "choose_transcripts_by_priority" - ], - "type": "string" - }, - "type": "array" - }, - "annotate.draw_fusions_only": { - "default": true, - "description": "Flag to indicate if events which do not produce a fusion transcript should produce illustrations", - "type": "boolean" - }, - "annotate.draw_non_synonymous_cdna_only": { - "default": true, - "description": "Flag to indicate if events which are synonymous at the cdna level should produce 
illustrations", - "type": "boolean" - }, - "annotate.max_orf_cap": { - "default": 3, - "description": "The maximum number of orfs to return (best putative orfs will be retained)", - "type": "integer" - }, - "annotate.min_domain_mapping_match": { - "default": 0.9, - "description": "A number between 0 and 1 representing the minimum percent match a domain must map to the fusion transcript to be displayed", - "maximum": 1, - "minimum": 0, - "type": "number" - }, - "annotate.min_orf_size": { - "default": 300, - "description": "The minimum length (in base pairs) to retain a putative open reading frame (orf)", - "type": "integer" - }, - "bam_stats.distribution_fraction": { - "default": 0.97, - "description": "the proportion of the distribution to use in computing stdev", - "maximum": 1, - "minimum": 0.01, - "type": "number" - }, - "bam_stats.sample_bin_size": { - "default": 1000, - "description": "how large to make the sample bin (in bp)", - "type": "integer" - }, - "bam_stats.sample_cap": { - "default": 1000, - "description": "maximum number of reads to collect for any given sample region", - "type": "integer" - }, - "bam_stats.sample_size": { - "default": 500, - "description": "the number of genes/bins to compute stats over", - "type": "integer" - }, - "cluster.cluster_initial_size_limit": { - "default": 25, - "description": "The maximum cumulative size of both breakpoints for breakpoint pairs to be used in the initial clustering phase (combining based on overlap)", - "type": "integer" - }, - "cluster.cluster_radius": { - "default": 100, - "description": "Maximum distance allowed between paired breakpoint pairs", - "type": "integer" - }, - "cluster.limit_to_chr": { - "default": [ - "1", - "2", - "3", - "4", - "5", - "6", - "7", - "8", - "9", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "21", - "22", - "X", - "Y" - ], - "description": "A list of chromosome names to use. breakpointpairs on other chromosomes will be filteredout. 
for example '1 2 3 4' would filter out events/breakpoint pairs on any chromosomes but 1, 2, 3, and 4", - "items": { - "type": "string" - }, - "type": [ - "array", - "null" - ] - }, - "cluster.max_files": { - "default": 200, - "description": "The maximum number of files to output from clustering/splitting", - "minimum": 1, - "type": "integer" - }, - "cluster.max_proximity": { - "default": 5000, - "description": "The maximum distance away from an annotation before the region in considered to be uninformative", - "type": "integer" - }, - "cluster.min_clusters_per_file": { - "default": 50, - "description": "The minimum number of breakpoint pairs to output to a file", - "minimum": 1, - "type": "integer" - }, - "cluster.split_only": { - "default": false, - "description": "just split the input files, do not merge input breakpoints into clusters", - "type": "boolean" - }, - "cluster.uninformative_filter": { - "default": false, - "description": "Flag that determines if breakpoint pairs which are not within max_proximity to any annotations are filtered out prior to clustering", - "type": "boolean" - }, - "convert": { - "additionalProperties": { - "properties": { - "assume_no_untemplated": { - "default": false, - "description": "Assume the lack of untemplated information means that there IS not untemplated sequence expected at the breakpoints", - "type": "boolean" - }, - "file_type": { - "description": "the tool the file is input from or 'mavis' for standard mavis-style tab files", - "enum": [ - "manta", - "delly", - "transabyss", - "pindel", - "chimerascan", - "mavis", - "defuse", - "breakdancer", - "vcf", - "breakseq", - "cnvnator", - "strelka", - "starfusion" - ], - "type": "string" - }, - "inputs": { - "items": { - "type": "string" - }, - "minItems": 1, - "type": "array", - "description": "List of input files" - }, - "strand_specific": { - "default": false, - "type": "boolean" - } - }, - "required": [ - "inputs", - "file_type" - ], - "type": "object" - }, - "type": 
"object" - }, - "illustrate.domain_color": { - "default": "#ccccb3", - "description": "Domain fill color", - "type": "string", - "pattern": "^#[a-zA-Z0-9]{6}" - }, - "illustrate.domain_mismatch_color": { - "default": "#b2182b", - "description": "Domain fill color on 0%% match", - "type": "string", - "pattern": "^#[a-zA-Z0-9]{6}" - }, - "illustrate.domain_name_regex_filter": { - "default": "^PF\\d+$", - "description": "The regular expression used to select domains to be displayed (filtered by name)", - "type": "string" - }, - "illustrate.domain_scaffold_color": { - "default": "#000000", - "description": "The color of the domain scaffold", - "type": "string", - "pattern": "^#[a-zA-Z0-9]{6}" - }, - "illustrate.drawing_width_iter_increase": { - "default": 500, - "description": "The amount (in pixels) by which to increase the drawing width upon failure to fit", - "type": "integer" - }, - "illustrate.exon_min_focus_size": { - "default": 10, - "description": "Minimum size of an exon for it to be granted a label or min exon width", - "type": "integer" - }, - "illustrate.gene1_color": { - "default": "#657e91", - "description": "The color of genes near the first gene", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.gene1_color_selected": { - "default": "#518dc5", - "description": "The color of the first gene", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.gene2_color": { - "default": "#325556", - "description": "The color of genes near the second gene", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.gene2_color_selected": { - "default": "#4c9677", - "description": "The color of the second gene", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.label_color": { - "default": "#000000", - "description": "The label color", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.mask_fill": { - "default": "#ffffff", - "description": "Color of mask (for deleted region 
etc.)", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.mask_opacity": { - "default": 0.7, - "description": "Opacity of the mask layer", - "maximum": 1, - "minimum": 0, - "type": "number" - }, - "illustrate.max_drawing_retries": { - "default": 5, - "description": "The maximum number of retries for attempting a drawing. each iteration the width is extended. if it is still insufficient after this number a gene-level only drawing will be output", - "type": "integer" - }, - "illustrate.novel_exon_color": { - "default": "#5D3F6A", - "description": "Novel exon fill color", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.scaffold_color": { - "default": "#000000", - "description": "The color used for the gene/transcripts scaffolds", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.splice_color": { - "default": "#000000", - "description": "Splicing lines color", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.width": { - "default": 1000, - "description": "The drawing width in pixels", - "type": "integer" - }, - "illustrate.breakpoint_color": { - "default": "#000000", - "description": "Breakpoint outline color", - "type": "string", - "pattern": "^#[a-zA-Z0-9]{6}" - }, - "libraries": { - "additionalProperties": { - "additionalProperties": false, - "properties": { - "assign": { - "items": { - "type": "string" - }, - "minItems": 1, - "type": "array", - "description": "List of input files or conversion aliases that should be processed for this library" - }, - "total_batches": { - "type": "integer", - "min": 1, - "description": "The number of jobs to slit a library into for cluster/validate/annotate. 
This will be set during initialization of the config if not given" - }, - "bam_file": { - "type": "string", - "description": "Path to the bam file containing the sequencing reads for this library" - }, - "disease_status": { - "enum": [ - "diseased", - "normal" - ], - "type": "string" - }, - "median_fragment_size": { - "type": "integer", - "description": "The median fragment size in the paired-end read library. This will be computed from the bam during initialization of the config if not given" - }, - "protocol": { - "enum": [ - "genome", - "transcriptome" - ], - "type": "string" - }, - "read_length": { - "type": "integer", - "description": "The read length in the paired-end read library. This will be computed from the bam during initialization of the config if not given" - }, - "stdev_fragment_size": { - "type": "integer", - "description": "The standard deviation of fragment size in the paired-end read library. This will be computed from the bam during initialization of the config if not given" - }, - "strand_determining_read": { - "default": 2, - "description": "1 or 2. 
the read in the pair which determines if (assuming a stranded protocol) the first or second read in the pair matches the strand sequenced", - "type": "integer" - }, - "strand_specific": { - "default": false, - "type": "boolean" - } - }, - "required": [ - "disease_status", - "protocol", - "assign" - ], - "type": "object" - }, - "minProperties": 1, - "type": "object" - }, - "log": { - "type": "string" - }, - "log_level": { - "default": "INFO", - "enum": [ - "INFO", - "DEBUG" - ], - "type": "string" - }, - "output_dir": { - "type": "string", - "description": "path to the directory to output the MAVIS files to" - }, - "pairing.contig_call_distance": { - "default": 10, - "description": "The maximum distance allowed between breakpoint pairs (called by contig) in order for them to pair", - "type": "integer" - }, - "pairing.flanking_call_distance": { - "default": 50, - "description": "The maximum distance allowed between breakpoint pairs (called by flanking pairs) in order for them to pair", - "type": "integer" - }, - "pairing.input_call_distance": { - "default": 20, - "description": "The maximum distance allowed between breakpoint pairs (called by input tools, not validated) in order for them to pair", - "type": "integer" - }, - "pairing.spanning_call_distance": { - "default": 20, - "description": "The maximum distance allowed between breakpoint pairs (called by spanning reads) in order for them to pair", - "type": "integer" - }, - "pairing.split_call_distance": { - "default": 20, - "description": "The maximum distance allowed between breakpoint pairs (called by split reads) in order for them to pair", - "type": "integer" - }, - "reference.aligner_reference": { - "examples": [ - "tests/data/mock_reference_genome.2bit" - ], - "items": { - "type": "string" - }, - "maxItems": 1, - "minItems": 1, - "type": "array", - "description": "The reference genome file used by the aligner" - }, - "reference.annotations": { - "examples": [ - "tests/data/mock_annotations.json" - ], - 
"items": { - "type": "string" - }, - "minItems": 1, - "type": "array", - "description": "The reference file containing gene/transcript position information" - }, - "reference.dgv_annotation": { - "examples": [ - [ - "tests/data/mock_dgv_annotation.txt" - ] - ], - "items": { - "type": "string" - }, - "minItems": 1, - "type": "array" - }, - "reference.masking": { - "examples": [ - [ - "tests/data/mock_masking.tab" - ] - ], - "items": { - "type": "string" - }, - "minItems": 1, - "type": "array", - "description": "A list of regions to ignore in validation. Generally these are centromeres and telomeres or known poor mapping areas" - }, - "reference.reference_genome": { - "examples": [ - [ - "tests/data/mock_reference_genome.fa" - ] - ], - "items": { - "type": "string" - }, - "minItems": 1, - "type": "array" - }, - "reference.template_metadata": { - "examples": [ - [ - "tests/data/cytoBand.txt" - ] - ], - "items": { - "type": "string" - }, - "minItems": 1, - "type": "array" - }, - "skip_stage.validate": { - "default": false, - "description": "skip the validation stage of the MAVIS pipeline", - "type": "boolean" - }, - "summary.filter_cdna_synon": { - "default": true, - "description": "Filter all annotations synonymous at the cdna level", - "type": "boolean" - }, - "summary.filter_min_complexity": { - "default": 0.2, - "description": "Filter event calls based on call sequence complexity", - "maximum": 1, - "minimum": 0, - "type": "number" - }, - "summary.filter_min_flanking_reads": { - "default": 10, - "description": "Minimum number of flanking pairs for a call by flanking pairs", - "type": "integer" - }, - "summary.filter_min_linking_split_reads": { - "default": 1, - "description": "Minimum number of linking split reads for a call by split reads", - "type": "integer" - }, - "summary.filter_min_remapped_reads": { - "default": 5, - "description": "Minimum number of remapped reads for a call by contig", - "type": "integer" - }, - "summary.filter_min_spanning_reads": { - 
"default": 5, - "description": "Minimum number of spanning reads for a call by spanning reads", - "type": "integer" - }, - "summary.filter_min_split_reads": { - "default": 5, - "description": "Minimum number of split reads for a call by split reads", - "type": "integer" - }, - "summary.filter_protein_synon": { - "default": false, - "description": "Filter all annotations synonymous at the protein level", - "type": "boolean" - }, - "summary.filter_trans_homopolymers": { - "default": true, - "description": "Filter all single bp ins/del/dup events that are in a homopolymer region of at least 3 bps and are not paired to a genomic event", - "type": "boolean" - }, - "validate.aligner": { - "default": "blat", - "description": "The aligner to use to map the contigs/reads back to the reference e.g blat or bwa", - "enum": [ - "bwa mem", - "blat" - ], - "type": "string" - }, - "validate.assembly_kmer_size": { - "default": 0.74, - "description": "The percent of the read length to make kmers for assembly", - "maximum": 1, - "minimum": 0, - "type": "number" - }, - "validate.assembly_max_paths": { - "default": 8, - "description": "The maximum number of paths to resolve. this is used to limit when there is a messy assembly graph to resolve. the assembly will pre-calculate the number of paths (or putative assemblies) and stop if it is greater than the given setting", - "type": "integer" - }, - "validate.assembly_min_edge_trim_weight": { - "default": 3, - "description": "This is used to simplify the debruijn graph before path finding. 
edges with less than this frequency will be discarded if they are non-cutting, at a fork, or the end of a path", - "type": "integer" - }, - "validate.assembly_min_exact_match_to_remap": { - "default": 15, - "description": "The minimum length of exact matches to initiate remapping a read to a contig", - "type": "integer" - }, - "validate.assembly_min_remap_coverage": { - "default": 0.9, - "description": "Minimum fraction of the contig sequence which the remapped sequences must align over", - "maximum": 1, - "minimum": 0, - "type": "number" - }, - "validate.assembly_min_remapped_seq": { - "default": 3, - "description": "The minimum input sequences that must remap for an assembled contig to be used", - "type": "integer" - }, - "validate.assembly_min_uniq": { - "default": 0.1, - "description": "Minimum percent uniq required to keep separate assembled contigs. if contigs are more similar then the lower scoring, then shorter, contig is dropped", - "maximum": 1, - "minimum": 0, - "type": "number" - }, - "validate.assembly_strand_concordance": { - "default": 0.51, - "description": "When the number of remapped reads from each strand are compared, the ratio must be above this number to decide on the strand", - "maximum": 1, - "minimum": 0, - "type": "number" - }, - "validate.blat_limit_top_aln": { - "default": 10, - "description": "Number of results to return from blat (ranking based on score)", - "type": "integer" - }, - "validate.blat_min_identity": { - "default": 0.9, - "description": "The minimum percent identity match required for blat results when aligning contigs", - "maximum": 1, - "minimum": 0, - "type": "number" - }, - "validate.call_error": { - "default": 10, - "description": "Buffer zone for the evidence window", - "type": "integer" - }, - "validate.clean_aligner_files": { - "default": false, - "description": "Remove the aligner output files after the validation stage is complete. 
not required for subsequent steps but can be useful in debugging and deep investigation of events", - "type": "boolean" - }, - "validate.contig_aln_max_event_size": { - "default": 50, - "description": "Relates to determining breakpoints when pairing contig alignments. for any given read in a putative pair the soft clipping is extended to include any events of greater than this size. the softclipping is added to the side of the alignment as indicated by the breakpoint we are assigning pairs to", - "type": "integer" - }, - "validate.contig_aln_merge_inner_anchor": { - "default": 20, - "description": "The minimum number of consecutive exact match base pairs to not merge events within a contig alignment", - "type": "integer" - }, - "validate.contig_aln_merge_outer_anchor": { - "default": 15, - "description": "Minimum consecutively aligned exact matches to anchor an end for merging internal events", - "type": "integer" - }, - "validate.contig_aln_min_anchor_size": { - "default": 50, - "description": "The minimum number of aligned bases for a contig (m or =) in order to simplify. do not have to be consecutive", - "type": "integer" - }, - "validate.contig_aln_min_extend_overlap": { - "default": 10, - "description": "Minimum number of bases the query coverage interval must be extended by in order to pair alignments as a single split alignment", - "type": "integer" - }, - "validate.contig_aln_min_query_consumption": { - "default": 0.9, - "description": "Minimum fraction of the original query sequence that must be used by the read(s) of the alignment", - "maximum": 1, - "minimum": 0, - "type": "number" - }, - "validate.contig_aln_min_score": { - "default": 0.9, - "description": "Minimum score for a contig to be used as evidence in a call by contig", - "maximum": 1, - "minimum": 0, - "type": "number" - }, - "validate.fetch_min_bin_size": { - "default": 50, - "description": "The minimum size of any bin for reading from a bam file. 
increasing this number will result in smaller bins being merged or less bins being created (depending on the fetch method)", - "type": "integer" - }, - "validate.fetch_reads_bins": { - "default": 5, - "description": "Number of bins to split an evidence window into to ensure more even sampling of high coverage regions", - "type": "integer" - }, - "validate.fetch_reads_limit": { - "default": 3000, - "description": "Maximum number of reads, cap, to loop over for any given evidence window", - "type": "integer" - }, - "validate.filter_secondary_alignments": { - "default": true, - "description": "Filter secondary alignments when gathering read evidence", - "type": "boolean" - }, - "validate.fuzzy_mismatch_number": { - "default": 1, - "description": "The number of events/mismatches allowed to be considered a fuzzy match", - "type": "integer" - }, - "validate.max_sc_preceeding_anchor": { - "default": 6, - "description": "When remapping a softclipped read this determines the amount of softclipping allowed on the side opposite of where we expect it. for example for a softclipped read on a breakpoint with a left orientation this limits the amount of softclipping that is allowed on the right. if this is set to none then there is no limit on softclipping", - "type": "integer" - }, - "validate.min_anchor_exact": { - "default": 6, - "description": "Applies to re-aligning softclipped reads to the opposing breakpoint. the minimum number of consecutive exact matches to anchor a read to initiate targeted realignment", - "type": "integer" - }, - "validate.min_anchor_fuzzy": { - "default": 10, - "description": "Applies to re-aligning softclipped reads to the opposing breakpoint. 
the minimum length of a fuzzy match to anchor a read to initiate targeted realignment", - "type": "integer" - }, - "validate.min_anchor_match": { - "default": 0.9, - "description": "Minimum percent match for a read to be kept as evidence", - "maximum": 1, - "minimum": 0, - "type": "number" - }, - "validate.min_call_complexity": { - "default": 0.1, - "description": "The minimum complexity score for a call sequence. is an average for non-contig calls. filters low complexity contigs before alignment. see [contig_complexity](#contig_complexity)", - "maximum": 1, - "minimum": 0, - "type": "number" - }, - "validate.min_double_aligned_to_estimate_insertion_size": { - "default": 2, - "description": "The minimum number of reads which map soft-clipped to both breakpoints to assume the size of the untemplated sequence between the breakpoints is at most the read length - 2 * min_softclipping", - "type": "integer" - }, - "validate.min_flanking_pairs_resolution": { - "default": 10, - "description": "The minimum number of flanking reads required to call a breakpoint by flanking evidence", - "type": "integer" - }, - "validate.min_linking_split_reads": { - "default": 2, - "description": "The minimum number of split reads which aligned to both breakpoints", - "type": "integer" - }, - "validate.min_mapping_quality": { - "default": 5, - "description": "The minimum mapping quality of reads to be used as evidence", - "type": "integer" - }, - "validate.min_non_target_aligned_split_reads": { - "default": 1, - "description": "The minimum number of split reads aligned to a breakpoint by the input bam and no forced by local alignment to the target region to call a breakpoint by split read evidence", - "type": "integer" - }, - "validate.min_sample_size_to_apply_percentage": { - "default": 10, - "description": "Minimum number of aligned bases to compute a match percent. 
if there are less than this number of aligned bases (match or mismatch) the percent comparator is not used", - "type": "integer" - }, - "validate.min_softclipping": { - "default": 6, - "description": "Minimum number of soft-clipped bases required for a read to be used as soft-clipped evidence", - "type": "integer" - }, - "validate.min_spanning_reads_resolution": { - "default": 5, - "description": "Minimum number of spanning reads required to call an event by spanning evidence", - "type": "integer" - }, - "validate.min_splits_reads_resolution": { - "default": 3, - "description": "Minimum number of split reads required to call a breakpoint by split reads", - "type": "integer" - }, - "validate.outer_window_min_event_size": { - "default": 125, - "description": "The minimum size of an event in order for flanking read evidence to be collected", - "type": "integer" - }, - "validate.stdev_count_abnormal": { - "default": 3, - "description": "The number of standard deviations away from the normal considered expected and therefore not qualifying as flanking reads", - "type": "number" - }, - "validate.trans_fetch_reads_limit": { - "default": 12000, - "description": "Related to [fetch_reads_limit](#fetch_reads_limit). overrides fetch_reads_limit for transcriptome libraries when set. if this has a value of none then fetch_reads_limit will be used for transcriptome libraries instead", - "type": [ - "integer", - "null" - ] - }, - "validate.trans_min_mapping_quality": { - "default": 0, - "description": "Related to [min_mapping_quality](#min_mapping_quality). overrides the min_mapping_quality if the library is a transcriptome and this is set to any number not none. if this value is none, min_mapping_quality is used for transcriptomes aswell as genomes", - "type": [ - "integer", - "null" - ] - }, - "validate.write_evidence_files": { - "default": true, - "description": "Write the intermediate bam and bed files containing the raw evidence collected and contigs aligned. 
not required for subsequent steps but can be useful in debugging and deep investigation of events", - "type": "boolean" - } - }, - "anyOf": [ - { - "not": { - "properties": { - "skip_stage.validate": { - "const": true - } - }, - "required": [ - "reference.aligner_reference" - ] - } - } - ], - "type": "object" -} diff --git a/src/mavis/schemas/overlay.json b/src/mavis/schemas/overlay.json deleted file mode 100644 index 3fe89cf5..00000000 --- a/src/mavis/schemas/overlay.json +++ /dev/null @@ -1,142 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": false, - "properties": { - "illustrate.breakpoint_color": { - "default": "#000000", - "description": "Breakpoint outline color", - "type": "string" - }, - "illustrate.domain_color": { - "default": "#ccccb3", - "description": "Domain fill color", - "type": "string" - }, - "illustrate.domain_mismatch_color": { - "default": "#b2182b", - "description": "Domain fill color on 0%% match", - "type": "string" - }, - "illustrate.domain_name_regex_filter": { - "default": "^PF\\d+$", - "description": "The regular expression used to select domains to be displayed (filtered by name)", - "type": "string" - }, - "illustrate.domain_scaffold_color": { - "default": "#000000", - "description": "The color of the domain scaffold", - "type": "string" - }, - "illustrate.drawing_width_iter_increase": { - "default": 500, - "description": "The amount (in pixels) by which to increase the drawing width upon failure to fit", - "type": "integer" - }, - "illustrate.exon_min_focus_size": { - "default": 10, - "description": "Minimum size of an exon for it to be granted a label or min exon width", - "type": "integer" - }, - "illustrate.gene1_color": { - "default": "#657e91", - "description": "The color of genes near the first gene", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.gene1_color_selected": { - "default": "#518dc5", - "description": "The color of the first gene", - "pattern": 
"^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.gene2_color": { - "default": "#325556", - "description": "The color of genes near the second gene", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.gene2_color_selected": { - "default": "#4c9677", - "description": "The color of the second gene", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.label_color": { - "default": "#000000", - "description": "The label color", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.mask_fill": { - "default": "#ffffff", - "description": "Color of mask (for deleted region etc.)", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.mask_opacity": { - "default": 0.7, - "description": "Opacity of the mask layer", - "maximum": 1, - "minimum": 0, - "type": "number" - }, - "illustrate.max_drawing_retries": { - "default": 5, - "description": "The maximum number of retries for attempting a drawing. each iteration the width is extended. 
if it is still insufficient after this number a gene-level only drawing will be output", - "type": "integer" - }, - "illustrate.novel_exon_color": { - "default": "#5D3F6A", - "description": "Novel exon fill color", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.scaffold_color": { - "default": "#000000", - "description": "The color used for the gene/transcripts scaffolds", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.splice_color": { - "default": "#000000", - "description": "Splicing lines color", - "pattern": "^#[a-zA-Z0-9]{6}", - "type": "string" - }, - "illustrate.width": { - "default": 1000, - "description": "The drawing width in pixels", - "type": "integer" - }, - "log": { - "type": "string" - }, - "log_level": { - "default": "INFO", - "enum": [ - "INFO", - "DEBUG" - ], - "type": "string" - }, - "reference.annotations": { - "examples": [ - "tests/data/mock_annotations.json" - ], - "items": { - "type": "string" - }, - "minItems": 1, - "type": "array" - }, - "validate.min_mapping_quality": { - "default": 5, - "description": "The minimum mapping quality of reads to be used as evidence", - "type": "integer" - } - }, - "required": [ - "reference.annotations" - ], - "type": "object" -} diff --git a/src/mavis/validate/base.py b/src/mavis/validate/base.py index e6767d3a..1225d69c 100644 --- a/src/mavis/validate/base.py +++ b/src/mavis/validate/base.py @@ -4,6 +4,7 @@ from typing import Dict, List, Optional, Set, Tuple import pysam +from mavis_config import DEFAULTS from ..assemble import assemble from ..bam import cigar as _cigar @@ -23,7 +24,6 @@ ) from ..error import NotSpecifiedError from ..interval import Interval -from ..schemas import DEFAULTS from ..util import DEVNULL diff --git a/src/mavis/validate/evidence.py b/src/mavis/validate/evidence.py index a689170c..19f2fbdb 100644 --- a/src/mavis/validate/evidence.py +++ b/src/mavis/validate/evidence.py @@ -3,13 +3,11 @@ import pysam -from ..align import 
SplitAlignment, call_read_events from ..annotate.variant import overlapping_transcripts from ..bam import cigar as _cigar from ..breakpoint import Breakpoint from ..constants import CIGAR, COLUMNS, ORIENT, PROTOCOL, STRAND, SVTYPE from ..interval import Interval -from ..schemas import DEFAULTS from .base import Evidence diff --git a/src/mavis/validate/main.py b/src/mavis/validate/main.py index 51d1e48f..4bec5fcd 100644 --- a/src/mavis/validate/main.py +++ b/src/mavis/validate/main.py @@ -16,7 +16,6 @@ from ..bam.cache import BamCache from ..breakpoint import BreakpointPair from ..constants import CALL_METHOD, COLUMNS, PROTOCOL -from ..schemas import get_by_prefix from ..util import ( LOG, filter_on_overlap, diff --git a/tests/integration/test_align.py b/tests/integration/test_align.py index 0075a32d..0b3f556e 100644 --- a/tests/integration/test_align.py +++ b/tests/integration/test_align.py @@ -2,7 +2,6 @@ from unittest import mock import mavis.bam.cigar as _cigar -import pytest from mavis import align from mavis.annotate.file_io import load_reference_genome from mavis.assemble import Contig @@ -11,8 +10,8 @@ from mavis.breakpoint import Breakpoint, BreakpointPair from mavis.constants import CIGAR, ORIENT, STRAND, reverse_complement from mavis.interval import Interval -from mavis.schemas import DEFAULTS from mavis.validate.evidence import GenomeEvidence +from mavis_config import DEFAULTS from ..util import blat_only, bwa_only, get_data from . 
import MockLongString, MockObject, MockRead diff --git a/tests/integration/test_assemble.py b/tests/integration/test_assemble.py index 6930b685..b91b1c4d 100644 --- a/tests/integration/test_assemble.py +++ b/tests/integration/test_assemble.py @@ -5,8 +5,8 @@ from mavis.assemble import Contig, assemble, filter_contigs from mavis.constants import reverse_complement from mavis.interval import Interval -from mavis.schemas import DEFAULTS from mavis.util import LOG +from mavis_config import DEFAULTS from ..util import get_data, long_running_test from . import MockObject diff --git a/tests/integration/test_validate.py b/tests/integration/test_validate.py index 8f0d8471..e29a063f 100644 --- a/tests/integration/test_validate.py +++ b/tests/integration/test_validate.py @@ -5,9 +5,9 @@ from mavis.bam.read import SamRead from mavis.breakpoint import Breakpoint from mavis.constants import NA_MAPPING_QUALITY, ORIENT, PYSAM_READ_FLAGS -from mavis.schemas import DEFAULTS from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence +from mavis_config import DEFAULTS from ..util import get_data, long_running_test from . import MockLongString, MockObject, MockRead, mock_read_pair diff --git a/tests/integration/test_validate_evidence.py b/tests/integration/test_validate_evidence.py index 9a53cf82..1cab995a 100644 --- a/tests/integration/test_validate_evidence.py +++ b/tests/integration/test_validate_evidence.py @@ -9,9 +9,9 @@ from mavis.breakpoint import Breakpoint, BreakpointPair from mavis.constants import ORIENT, STRAND from mavis.interval import Interval -from mavis.schemas import DEFAULTS from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence, TranscriptomeEvidence +from mavis_config import DEFAULTS from . 
import MockBamFileHandle, MockObject, MockRead, mock_read_pair From 154cf87f273823981df653a7c5509e3a1f541775 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 20 Jan 2022 22:30:09 -0800 Subject: [PATCH 083/137] Remove pyfaidx till later --- setup.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index b75ac619..a34fc7ef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -48,7 +48,6 @@ install_requires = shortuuid>=0.5.0 svgwrite typing_extensions>=4 - pyfaidx^=0.6.3.1 setup_requires = pip>=9.0.0 setuptools>=36.0.0 From 4dbeed0d65ae964eb6bd6a3a48cbf6e0e1c12afe Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 20 Jan 2022 23:53:45 -0800 Subject: [PATCH 084/137] Remove support for tab-delimited annotation files - use json-schema to validate annotations input file --- .gitignore | 1 + src/mavis/annotate/annotations_schema.json | 186 +++++++++++++++++ src/mavis/annotate/file_io.py | 165 ++------------- src/tools/migrate_mavis_annotations_2to3.py | 190 ++++++++++++++++++ tests/data/annotations_subsample.json | 2 +- tests/data/example_genes.json | 2 + tests/data/mock_annotations.json | 2 +- .../data/mock_reference_annotations.full.json | 1 + .../data/mock_reference_annotations.full.tsv | 6 - tests/data/mock_reference_annotations.json | 2 +- tests/data/mock_reference_annotations.tsv | 7 - tests/data/mock_reference_annotations2.json | 1 + tests/integration/test_annotate.py | 4 +- tests/integration/test_annotate_fileio.py | 31 +-- 14 files changed, 404 insertions(+), 196 deletions(-) create mode 100644 src/mavis/annotate/annotations_schema.json create mode 100644 src/tools/migrate_mavis_annotations_2to3.py create mode 100644 tests/data/mock_reference_annotations.full.json delete mode 100644 tests/data/mock_reference_annotations.full.tsv delete mode 100644 tests/data/mock_reference_annotations.tsv create mode 100644 tests/data/mock_reference_annotations2.json diff --git a/.gitignore b/.gitignore index 1f4c4214..c3cad29a 100644 --- a/.gitignore 
+++ b/.gitignore @@ -41,3 +41,4 @@ bin dag* tutorial_data reference_inputs +tmp diff --git a/src/mavis/annotate/annotations_schema.json b/src/mavis/annotate/annotations_schema.json new file mode 100644 index 00000000..04d0cc50 --- /dev/null +++ b/src/mavis/annotate/annotations_schema.json @@ -0,0 +1,186 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "best_transcript_file": { + "type": "string" + }, + "ensembl_version": { + "type": "integer" + }, + "generation_time": { + "type": "string" + }, + "genes": { + "items": { + "properties": { + "aliases": { + "default": [ + ], + "items": { + "minLength": 1, + "type": "string" + }, + "type": "array" + }, + "chr": { + "minLength": 1, + "type": "string" + }, + "end": { + "minimum": 1, + "type": "integer" + }, + "name": { + "minLength": 1, + "type": "string" + }, + "start": { + "minimum": 1, + "type": "integer" + }, + "strand": { + "enum": [ + "+", + "-" + ], + "type": "string" + }, + "transcripts": { + "default": [ + ], + "items": { + "properties": { + "aliases": { + "default": [ + ], + "items": { + "minLength": 1, + "type": "string" + }, + "type": "array" + }, + "cdna_coding_end": { + "minimum": 1, + "type": [ + "integer", + "null" + ], + "default": null + }, + "cdna_coding_start": { + "minimum": 1, + "type": [ + "integer", + "null" + ], + "default": null + }, + "domains": { + "default": [ + ], + "items": { + "properties": { + "name": { + "minLength": 1, + "type": "string" + }, + "regions": { + "minItems": 1, + "properties": { + "end": { + "minimum": 1, + "type": "integer" + }, + "start": { + "minimum": 1, + "type": "integer" + } + }, + "type": "array" + } + }, + "required": [ + "name", + "regions" + ], + "type": "object" + }, + "type": "array" + }, + "end": { + "minimum": 1, + "type": "integer" + }, + "exons": { + "defualt": [ + ], + "items": { + "properties": { + "end": { + "minimum": 1, + "type": "integer" + }, + "start": { + "minimum": 1, + "type": 
"integer" + } + }, + "required": [ + "start", + "end" + ], + "type": "object" + }, + "type": "array" + }, + "is_best_transcript": { + "default": false, + "type": "boolean" + }, + "name": { + "minLength": 1, + "type": "string" + }, + "start": { + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "start", + "end", + "name" + ], + "type": "object" + }, + "type": "array" + } + }, + "required": [ + "chr", + "start", + "end", + "name", + "strand" + ], + "type": "object" + }, + "minItems": 1, + "type": "array" + }, + "hugo_mapping_file": { + "type": "string" + }, + "script": { + "type": "string" + }, + "script_version": { + "type": "string" + } + }, + "required": [ + "genes" + ], + "type": "object" +} diff --git a/src/mavis/annotate/file_io.py b/src/mavis/annotate/file_io.py index b41489c7..6b1a16f2 100644 --- a/src/mavis/annotate/file_io.py +++ b/src/mavis/annotate/file_io.py @@ -4,16 +4,16 @@ import json import os import re -import warnings -from typing import Callable, Dict, List, Optional, Tuple +from typing import Callable, Dict, List, Optional import pandas as pd from Bio import SeqIO from Bio.SeqRecord import SeqRecord +from snakemake.utils import validate as snakemake_validate -from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, STRAND, translate +from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, translate from ..interval import Interval -from ..util import DEVNULL, LOG, cast_boolean, filepath +from ..util import DEVNULL, LOG from .base import BioInterval, ReferenceName from .genomic import Exon, Gene, PreTranscript, Template, Transcript from .protein import Domain, Translation @@ -79,11 +79,8 @@ def load_annotations( for filename in filepaths: data = None - if filename.endswith('.json'): - with open(filename) as fh: - data = json.load(fh) - else: - data = convert_tab_to_json(filename, warn) + with open(filename) as fh: + data = json.load(fh) current_annotations = parse_annotations_json( data, @@ -107,17 +104,20 @@ 
def parse_annotations_json( """ parses a json of annotation information into annotation objects """ + try: + snakemake_validate( + data, + os.path.join(os.path.dirname(__file__), 'annotations_schema.json'), + ) + except Exception as err: + short_msg = '. '.join( + [line for line in str(err).split('\n') if line.strip()][:3] + ) # these can get super long + raise AssertionError(short_msg) + genes_by_chr: Dict[str, List[Gene]] = {} for gene_dict in data['genes']: - if gene_dict['strand'] in ['1', '+', 1]: - gene_dict['strand'] = STRAND.POS - elif gene_dict['strand'] in ['-1', '-', -1]: - gene_dict['strand'] = STRAND.NEG - else: - raise AssertionError( - 'input has unexpected form. strand must be 1 or -1 but found', gene_dict['strand'] - ) gene = Gene( chr=gene_dict['chr'], @@ -130,7 +130,6 @@ def parse_annotations_json( has_best = False for transcript in gene_dict['transcripts']: - transcript['is_best_transcript'] = cast_boolean(transcript['is_best_transcript']) transcript.setdefault('exons', []) exons = [Exon(strand=gene.strand, **ex) for ex in transcript['exons']] if not exons: @@ -205,136 +204,6 @@ def parse_annotations_json( return genes_by_chr -def convert_tab_to_json(filepath: str, warn: Callable = DEVNULL) -> Dict: - """ - given a file in the std input format (see below) reads and return a list of genes (and sub-objects) - - +-----------------------+---------------------------+-----------------------------------------------------------+ - | column name | example | description | - +=======================+===========================+===========================================================+ - | ensembl_transcript_id | ENST000001 | | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | ensembl_gene_id | ENSG000001 | | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | strand | -1 | positive or negative 1 | - 
+-----------------------+---------------------------+-----------------------------------------------------------+ - | cdna_coding_start | 44 | where translation begins relative to the start of the cdna| - +-----------------------+---------------------------+-----------------------------------------------------------+ - | cdna_coding_end | 150 | where translation terminates | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | genomic_exon_ranges | 100-201;334-412;779-830 | semi-colon demitited exon start/ends | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | AA_domain_ranges | DBD:220-251,260-271 | semi-colon delimited list of domains | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | hugo_names | KRAS | hugo gene name | - +-----------------------+---------------------------+-----------------------------------------------------------+ - - Args: - filepath (str): path to the input tab-delimited file - - Returns: - Dict[str,List[Gene]]: a dictionary keyed by chromosome name with values of list of genes on the chromosome - - Warning: - does not load translations unless then start with 'M', end with '*' and have a length of multiple 3 - """ - - def parse_exon_list(row): - if pd.isnull(row): - return [] - exons = [] - for temp in re.split('[; ]', row): - try: - start, end = temp.split('-') - exons.append({'start': int(start), 'end': int(end)}) - except Exception as err: - warn('exon error:', repr(temp), repr(err)) - return exons - - def parse_domain_list(row): - if pd.isnull(row): - return [] - domains = [] - for domain in row.split(';'): - try: - name, temp = domain.rsplit(':') - temp = temp.split(',') - temp = [x.split('-') for x in temp] - regions = [{'start': int(x), 'end': int(y)} for x, y in temp] - domains.append({'name': name, 'regions': regions}) - 
except Exception as err: - warn('error in domain:', domain, row, repr(err)) - return domains - - df = pd.read_csv( - filepath, - dtype={ - 'ensembl_gene_id': str, - 'ensembl_transcript_id': str, - 'chr': str, - 'cdna_coding_start': pd.Int64Dtype(), - 'cdna_coding_end': pd.Int64Dtype(), - 'AA_domain_ranges': str, - 'genomic_exon_ranges': str, - 'hugo_names': str, - 'transcript_genomic_start': pd.Int64Dtype(), - 'transcript_genomic_end': pd.Int64Dtype(), - 'best_ensembl_transcript_id': str, - 'gene_start': int, - 'gene_end': int, - }, - sep='\t', - comment='#', - ) - - for col in ['ensembl_gene_id', 'chr', 'ensembl_transcript_id', 'gene_start', 'gene_end']: - if col not in df: - raise KeyError(f'missing required column: {col}') - - for col, parser in [ - ('genomic_exon_ranges', parse_exon_list), - ('AA_domain_ranges', parse_domain_list), - ]: - if col in df: - df[col] = df[col].apply(parser) - - genes = {} - rows = df.where(df.notnull(), None).to_dict('records') - - for row in rows: - gene = { - 'chr': row['chr'], - 'start': row['gene_start'], - 'end': row['gene_end'], - 'name': row['ensembl_gene_id'], - 'strand': row['strand'], - 'aliases': row['hugo_names'].split(';') if row.get('hugo_names') else [], - 'transcripts': [], - } - if gene['name'] not in genes: - genes[gene['name']] = gene - else: - gene = genes[gene['name']] - is_best_transcript = ( - row.get('best_ensembl_transcript_id', row['ensembl_transcript_id']) - == row['ensembl_transcript_id'] - ) - transcript = { - 'is_best_transcript': is_best_transcript, - 'name': row['ensembl_transcript_id'], - 'exons': row.get('genomic_exon_ranges', []), - 'domains': row.get('AA_domain_ranges', []), - 'start': row.get('transcript_genomic_start'), - 'end': row.get('transcript_genomic_end'), - 'cdna_coding_start': row.get('cdna_coding_start'), - 'cdna_coding_end': row.get('cdna_coding_end'), - 'aliases': [], - } - gene['transcripts'].append(transcript) - - return {'genes': genes.values()} - - def 
load_reference_genome(*filepaths: str) -> Dict[str, SeqRecord]: """ Args: diff --git a/src/tools/migrate_mavis_annotations_2to3.py b/src/tools/migrate_mavis_annotations_2to3.py new file mode 100644 index 00000000..0fe9d39d --- /dev/null +++ b/src/tools/migrate_mavis_annotations_2to3.py @@ -0,0 +1,190 @@ +import argparse +import json +import logging +import re +from typing import Dict + +import pandas as pd + + +def convert_tab_to_json(filepath: str) -> Dict: + """ + given a file in the std input format (see below) reads and return a list of genes (and sub-objects) + + +-----------------------+---------------------------+-----------------------------------------------------------+ + | column name | example | description | + +=======================+===========================+===========================================================+ + | ensembl_transcript_id | ENST000001 | | + +-----------------------+---------------------------+-----------------------------------------------------------+ + | ensembl_gene_id | ENSG000001 | | + +-----------------------+---------------------------+-----------------------------------------------------------+ + | strand | -1 | positive or negative 1 | + +-----------------------+---------------------------+-----------------------------------------------------------+ + | cdna_coding_start | 44 | where translation begins relative to the start of the cdna| + +-----------------------+---------------------------+-----------------------------------------------------------+ + | cdna_coding_end | 150 | where translation terminates | + +-----------------------+---------------------------+-----------------------------------------------------------+ + | genomic_exon_ranges | 100-201;334-412;779-830 | semi-colon demitited exon start/ends | + +-----------------------+---------------------------+-----------------------------------------------------------+ + | AA_domain_ranges | DBD:220-251,260-271 | semi-colon delimited list of domains | + 
+-----------------------+---------------------------+-----------------------------------------------------------+ + | hugo_names | KRAS | hugo gene name | + +-----------------------+---------------------------+-----------------------------------------------------------+ + + Args: + filepath (str): path to the input tab-delimited file + + Returns: + Dict[str,List[Gene]]: a dictionary keyed by chromosome name with values of list of genes on the chromosome + + Warning: + does not load translations unless then start with 'M', end with '*' and have a length of multiple 3 + """ + + def parse_exon_list(row): + if pd.isnull(row): + return [] + exons = [] + for temp in re.split('[; ]', row): + try: + start, end = temp.split('-') + exons.append({'start': int(start), 'end': int(end)}) + except Exception as err: + logging.warning(f'exon error: {repr(temp)}, {repr(err)}') + return exons + + def parse_domain_list(row): + if pd.isnull(row): + return [] + domains = [] + for domain in row.split(';'): + try: + name, temp = domain.rsplit(':') + temp = temp.split(',') + temp = [x.split('-') for x in temp] + regions = [{'start': int(x), 'end': int(y)} for x, y in temp] + domains.append({'name': name, 'regions': regions}) + except Exception as err: + logging.warning(f'error in domain: {domain}, {row}, {repr(err)}') + return domains + + df = pd.read_csv( + filepath, + dtype={ + 'ensembl_gene_id': str, + 'ensembl_transcript_id': str, + 'chr': str, + 'cdna_coding_start': pd.Int64Dtype(), + 'cdna_coding_end': pd.Int64Dtype(), + 'AA_domain_ranges': str, + 'genomic_exon_ranges': str, + 'hugo_names': str, + 'transcript_genomic_start': pd.Int64Dtype(), + 'transcript_genomic_end': pd.Int64Dtype(), + 'best_ensembl_transcript_id': str, + 'gene_start': int, + 'gene_end': int, + }, + sep='\t', + comment='#', + ) + + for col in ['ensembl_gene_id', 'chr', 'ensembl_transcript_id', 'gene_start', 'gene_end']: + if col not in df: + raise KeyError(f'missing required column: {col}') + + for col, parser in [ 
+ ('genomic_exon_ranges', parse_exon_list), + ('AA_domain_ranges', parse_domain_list), + ]: + if col in df: + df[col] = df[col].apply(parser) + + genes = {} + rows = df.where(df.notnull(), None).to_dict('records') + + for row in rows: + gene = { + 'chr': row['chr'], + 'start': int(row['gene_start']), + 'end': int(row['gene_end']), + 'name': row['ensembl_gene_id'], + 'strand': row['strand'], + 'aliases': row['hugo_names'].split(';') if row.get('hugo_names') else [], + 'transcripts': [], + } + if gene['strand'] in {'true', '1', '+', '+1', 'True', 1, True}: + gene['strand'] = '+' + elif gene['strand'] in {'false', '-1', '-', 'False', -1, False}: + gene['strand'] = '-' + if gene['name'] not in genes: + genes[gene['name']] = gene + else: + gene = genes[gene['name']] + is_best_transcript = ( + row.get('best_ensembl_transcript_id', row['ensembl_transcript_id']) + == row['ensembl_transcript_id'] + ) + transcript = { + 'is_best_transcript': is_best_transcript, + 'name': row['ensembl_transcript_id'], + 'exons': row.get('genomic_exon_ranges', []), + 'domains': row.get('AA_domain_ranges', []), + 'start': row.get('transcript_genomic_start'), + 'end': row.get('transcript_genomic_end'), + 'cdna_coding_start': row.get('cdna_coding_start'), + 'cdna_coding_end': row.get('cdna_coding_end'), + 'aliases': [], + } + for int_value in ['start', 'end', 'cdna_coding_start', 'cdna_coding_end']: + if transcript.get(int_value) is not None: + transcript[int_value] = int(transcript[int_value]) + gene['transcripts'].append(transcript) + + return {'genes': list(genes.values())} + + +if __name__ == '__main__': + logging.basicConfig(**{'format': '{message}', 'style': '{', 'level': logging.INFO}) + parser = argparse.ArgumentParser() + parser.add_argument( + 'input', help='path to the tab-delimated mavis v2 style reference annotations file' + ) + parser.add_argument('output', help='path to the JSON output file') + + args = parser.parse_args() + + annotations = convert_tab_to_json(args.input) + + rows 
= [] + logging.info(f'writing: {args.output}') + if args.output_format == 'jsonl': + with open(args.output, 'w') as fh: + for gene in annotations['genes']: + fh.write(json.dumps(gene, sort_keys=True) + '\n') + elif args.output_format == 'json': + with open(args.output, 'w') as fh: + fh.write(json.dumps(annotations, sort_keys=True)) + else: + transcripts = [] + + for gene in annotations['genes']: + meta = {**gene} + del meta['transcripts'] + if gene['transcripts']: + for transcript in gene['transcripts']: + transcripts.append( + {**meta, **{f'transcript.{k}': v for k, v in transcript.items()}} + ) + else: + transcripts.append(meta) + df = pd.json_normalize(transcripts, max_level=1) + json_cols = [ + 'aliases', + 'transcript.aliases', + 'transcript.exons', + 'transcript.domains', + ] + for col in json_cols: + df[col] = df[col].apply(json.dumps) + df.to_csv(args.output, index=False, sep='\t') diff --git a/tests/data/annotations_subsample.json b/tests/data/annotations_subsample.json index 7c362018..d4ee0a56 100644 --- a/tests/data/annotations_subsample.json +++ b/tests/data/annotations_subsample.json @@ -1 +1 @@ -{"genes": [{"chr": "15", "start": 63889592, "end": 63893885, "name": "ENSG00000259662", "strand": "1", "aliases": [], "transcripts": [{"is_best_transcript": true, "name": "ENST00000539570", "exons": [{"start": 63889592, "end": 63889944}, {"start": 63893495, "end": 63893885}], "domains": [{"name": "SSF81383", "regions": [{"start": 9, "end": 49}]}], "start": 63889592, "end": 63893885, "cdna_coding_start": 1, "cdna_coding_end": 744, "aliases": []}]}, {"chr": "14", "start": 102027834, "end": 102028748, "name": "ENSG00000258865", "strand": "1", "aliases": ["DIO3"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000510508", "exons": [{"start": 102027834, "end": 102028748}], "domains": [{"name": "PF00837", "regions": [{"start": 38, "end": 293}]}, {"name": "SSF52833", "regions": [{"start": 125, "end": 198}]}], "start": 102027834, "end": 102028748, 
"cdna_coding_start": 1, "cdna_coding_end": 915, "aliases": []}]}, {"chr": "X", "start": 49364778, "end": 49370618, "name": "ENSG00000255738", "strand": "1", "aliases": ["GAGE4"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000381700", "exons": [{"start": 49364778, "end": 49364861}, {"start": 49365327, "end": 49365447}, {"start": 49368271, "end": 49368396}, {"start": 49370596, "end": 49370618}], "domains": [{"name": "PF05831", "regions": [{"start": 1, "end": 116}]}], "start": 49364778, "end": 49370618, "cdna_coding_start": 1, "cdna_coding_end": 354, "aliases": []}]}, {"chr": "10", "start": 89621708, "end": 89622244, "name": "ENSG00000227268", "strand": "-1", "aliases": ["KLLN"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000445946", "exons": [{"start": 89621708, "end": 89622244}], "domains": [], "start": 89621708, "end": 89622244, "cdna_coding_start": 1, "cdna_coding_end": 537, "aliases": []}]}, {"chr": "19", "start": 50193095, "end": 50193750, "name": "ENSG00000224420", "strand": "1", "aliases": ["ADM5"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000420022", "exons": [{"start": 50193095, "end": 50193168}, {"start": 50193363, "end": 50193750}], "domains": [], "start": 50193095, "end": 50193750, "cdna_coding_start": 1, "cdna_coding_end": 462, "aliases": []}]}, {"chr": "4", "start": 69056959, "end": 69083631, "name": "ENSG00000226894", "strand": "-1", "aliases": ["TMPRSS11BNL"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000432593", "exons": [{"start": 69083624, "end": 69083631}, {"start": 69078080, "end": 69078195}, {"start": 69057125, "end": 69057242}, {"start": 69056959, "end": 69057034}], "domains": [{"name": "SSF82671", "regions": [{"start": 35, "end": 87}]}], "start": 69056959, "end": 69083631, "cdna_coding_start": 1, "cdna_coding_end": 318, "aliases": []}]}, {"chr": "1", "start": 179833916, "end": 179834311, "name": "ENSG00000258664", "strand": "-1", "aliases": [], "transcripts": 
[{"is_best_transcript": true, "name": "ENST00000553856", "exons": [{"start": 179833916, "end": 179834311}], "domains": [], "start": 179833916, "end": 179834311, "cdna_coding_start": 1, "cdna_coding_end": 396, "aliases": []}]}, {"chr": "19", "start": 8959608, "end": 9091814, "name": "ENSG00000181143", "strand": "-1", "aliases": ["MUC16"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000397910", "exons": [{"start": 9082340, "end": 9091814}, {"start": 9080451, "end": 9080555}, {"start": 9056173, "end": 9077865}, {"start": 9054235, "end": 9054348}, {"start": 9045564, "end": 9050243}, {"start": 9043426, "end": 9043461}, {"start": 9038380, "end": 9038415}, {"start": 9038077, "end": 9038136}, {"start": 9033610, "end": 9033737}, {"start": 9033231, "end": 9033298}, {"start": 9028224, "end": 9028396}, {"start": 9027540, "end": 9027575}, {"start": 9027216, "end": 9027281}, {"start": 9026191, "end": 9026315}, {"start": 9025591, "end": 9025658}, {"start": 9024826, "end": 9024998}, {"start": 9024461, "end": 9024496}, {"start": 9024134, "end": 9024199}, {"start": 9021054, "end": 9021184}, {"start": 9020765, "end": 9020832}, {"start": 9019985, "end": 9020157}, {"start": 9019600, "end": 9019635}, {"start": 9019275, "end": 9019340}, {"start": 9018437, "end": 9018561}, {"start": 9018133, "end": 9018200}, {"start": 9017346, "end": 9017518}, {"start": 9016981, "end": 9017016}, {"start": 9016660, "end": 9016722}, {"start": 9015621, "end": 9015745}, {"start": 9015318, "end": 9015385}, {"start": 9014532, "end": 9014704}, {"start": 9014169, "end": 9014204}, {"start": 9013845, "end": 9013910}, {"start": 9012774, "end": 9012898}, {"start": 9012468, "end": 9012535}, {"start": 9011322, "end": 9011494}, {"start": 9010971, "end": 9011006}, {"start": 9010648, "end": 9010713}, {"start": 9009588, "end": 9009712}, {"start": 9009267, "end": 9009334}, {"start": 9008173, "end": 9008345}, {"start": 9007809, "end": 9007844}, {"start": 9007487, "end": 9007552}, {"start": 9006642, "end": 
9006766}, {"start": 9006344, "end": 9006411}, {"start": 9005559, "end": 9005731}, {"start": 9005194, "end": 9005229}, {"start": 9004869, "end": 9004934}, {"start": 9003566, "end": 9003690}, {"start": 9003293, "end": 9003360}, {"start": 9002501, "end": 9002673}, {"start": 9002153, "end": 9002188}, {"start": 9001831, "end": 9001896}, {"start": 9000442, "end": 9000566}, {"start": 9000147, "end": 9000214}, {"start": 8999392, "end": 8999564}, {"start": 8999025, "end": 8999060}, {"start": 8998698, "end": 8998763}, {"start": 8997412, "end": 8997536}, {"start": 8997118, "end": 8997185}, {"start": 8996321, "end": 8996493}, {"start": 8995954, "end": 8995989}, {"start": 8995635, "end": 8995700}, {"start": 8994417, "end": 8994538}, {"start": 8994142, "end": 8994209}, {"start": 8993373, "end": 8993545}, {"start": 8993007, "end": 8993042}, {"start": 8987210, "end": 8987334}, {"start": 8987045, "end": 8987112}, {"start": 8982157, "end": 8982329}, {"start": 8979217, "end": 8979252}, {"start": 8977640, "end": 8977690}, {"start": 8976739, "end": 8976860}, {"start": 8976581, "end": 8976648}, {"start": 8976260, "end": 8976432}, {"start": 8973972, "end": 8974102}, {"start": 8973549, "end": 8973616}, {"start": 8971676, "end": 8971824}, {"start": 8969276, "end": 8969427}, {"start": 8968880, "end": 8968947}, {"start": 8966650, "end": 8966816}, {"start": 8962354, "end": 8962395}, {"start": 8961952, "end": 8962031}, {"start": 8959608, "end": 8959706}], "domains": [{"name": "PS50324", "regions": [{"start": 466, "end": 934}, {"start": 1638, "end": 3054}, {"start": 3899, "end": 4383}, {"start": 5333, "end": 5532}, {"start": 5907, "end": 5999}, {"start": 7083, "end": 10394}]}, {"name": "SSF48726", "regions": [{"start": 3813, "end": 11805}]}, {"name": "SSF82671", "regions": [{"start": 12377, "end": 12509}, {"start": 13312, "end": 13444}, {"start": 13468, "end": 13600}, {"start": 13000, "end": 13132}, {"start": 12688, "end": 12820}, {"start": 13624, "end": 13756}, {"start": 12533, "end": 12665}, 
{"start": 13156, "end": 13288}, {"start": 12844, "end": 12976}, {"start": 12219, "end": 12351}, {"start": 13780, "end": 13911}, {"start": 12063, "end": 12195}, {"start": 14064, "end": 14195}, {"start": 13913, "end": 14045}, {"start": 14310, "end": 14440}, {"start": 14202, "end": 14308}]}, {"name": "SM00200", "regions": [{"start": 12068, "end": 12199}, {"start": 12226, "end": 12349}, {"start": 12384, "end": 12513}, {"start": 12851, "end": 12982}, {"start": 13007, "end": 13138}, {"start": 13471, "end": 13604}, {"start": 14314, "end": 14444}]}, {"name": "PS50024", "regions": [{"start": 12068, "end": 12133}, {"start": 13007, "end": 13070}, {"start": 13472, "end": 13538}, {"start": 13628, "end": 13694}, {"start": 14314, "end": 14380}]}, {"name": "PS50325", "regions": [{"start": 14, "end": 12083}]}, {"name": "PF01390", "regions": [{"start": 12074, "end": 12175}, {"start": 12231, "end": 12330}, {"start": 12388, "end": 12489}, {"start": 12544, "end": 12646}, {"start": 12699, "end": 12801}, {"start": 12855, "end": 12952}, {"start": 13011, "end": 13113}, {"start": 13167, "end": 13271}, {"start": 13324, "end": 13427}, {"start": 13479, "end": 13583}, {"start": 13635, "end": 13739}, {"start": 13793, "end": 13894}, {"start": 13922, "end": 14030}, {"start": 14076, "end": 14175}, {"start": 14202, "end": 14271}, {"start": 14319, "end": 14423}]}], "start": 8959608, "end": 9091814, "cdna_coding_start": 1, "cdna_coding_end": 43524, "aliases": []}]}, {"chr": "3", "start": 157815816, "end": 157823813, "name": "ENSG00000258518", "strand": "-1", "aliases": [], "transcripts": [{"is_best_transcript": true, "name": "ENST00000483851", "exons": [{"start": 157823468, "end": 157823813}, {"start": 157820467, "end": 157820675}, {"start": 157818043, "end": 157818100}, {"start": 157817649, "end": 157817737}, {"start": 157815816, "end": 157816073}], "domains": [{"name": "PS50071", "regions": [{"start": 138, "end": 198}]}, {"name": "PF00046", "regions": [{"start": 141, "end": 197}]}, {"name": 
"PS50310", "regions": [{"start": 244, "end": 316}]}, {"name": "PS50316", "regions": [{"start": 247, "end": 259}]}, {"name": "SSF46689", "regions": [{"start": 131, "end": 209}]}, {"name": "SM00389", "regions": [{"start": 140, "end": 202}]}, {"name": "PF03826", "regions": [{"start": 298, "end": 316}]}, {"name": "PS50315", "regions": [{"start": 60, "end": 90}]}, {"name": "PS50803", "regions": [{"start": 301, "end": 314}]}, {"name": "PR00031", "regions": [{"start": 169, "end": 178}, {"start": 178, "end": 194}]}], "start": 157815816, "end": 157823813, "cdna_coding_start": 1, "cdna_coding_end": 960, "aliases": []}]}, {"chr": "10", "start": 225953, "end": 295049, "name": "ENSG00000259741", "strand": "1", "aliases": [], "transcripts": [{"is_best_transcript": true, "name": "ENST00000558098", "exons": [{"start": 225953, "end": 226068}, {"start": 255829, "end": 255988}, {"start": 267135, "end": 267296}, {"start": 282778, "end": 282855}, {"start": 283525, "end": 283617}, {"start": 285378, "end": 285465}, {"start": 285996, "end": 286051}, {"start": 286833, "end": 286910}, {"start": 287961, "end": 288079}, {"start": 292706, "end": 292913}, {"start": 293338, "end": 293406}, {"start": 294276, "end": 294548}, {"start": 294843, "end": 295049}], "domains": [{"name": "PS50812", "regions": [{"start": 280, "end": 331}]}, {"name": "PS50016", "regions": [{"start": 100, "end": 148}]}, {"name": "SM00297", "regions": [{"start": 151, "end": 257}]}, {"name": "PF00855", "regions": [{"start": 278, "end": 342}]}, {"name": "SSF57903", "regions": [{"start": 86, "end": 151}]}, {"name": "SSF47370", "regions": [{"start": 130, "end": 268}]}, {"name": "SM00293", "regions": [{"start": 278, "end": 329}]}, {"name": "SM00249", "regions": [{"start": 102, "end": 146}]}, {"name": "PS50014", "regions": [{"start": 186, "end": 238}]}, {"name": "PF00439", "regions": [{"start": 182, "end": 242}]}, {"name": "SSF63748", "regions": [{"start": 270, "end": 396}]}], "start": 225953, "end": 295049, "cdna_coding_start": 1, 
"cdna_coding_end": 1707, "aliases": []}]}, {"chr": "11", "start": 117160282, "end": 117166263, "name": "ENSG00000265969", "strand": "-1", "aliases": [], "transcripts": [{"is_best_transcript": true, "name": "ENST00000292095", "exons": [{"start": 117166214, "end": 117166263}, {"start": 117165847, "end": 117166063}, {"start": 117164587, "end": 117164724}, {"start": 117163770, "end": 117163904}, {"start": 117162428, "end": 117162529}, {"start": 117161616, "end": 117161765}, {"start": 117161204, "end": 117161375}, {"start": 117160282, "end": 117160523}], "domains": [{"name": "SSF50630", "regions": [{"start": 18, "end": 346}]}, {"name": "PF00026", "regions": [{"start": 17, "end": 316}]}, {"name": "PR01816", "regions": [{"start": 15, "end": 25}, {"start": 112, "end": 132}, {"start": 260, "end": 274}, {"start": 315, "end": 327}, {"start": 334, "end": 353}, {"start": 355, "end": 375}]}, {"name": "PR00792", "regions": [{"start": 133, "end": 146}, {"start": 186, "end": 197}, {"start": 292, "end": 307}]}, {"name": "PR01815", "regions": [{"start": 47, "end": 70}, {"start": 168, "end": 187}, {"start": 218, "end": 241}, {"start": 256, "end": 270}, {"start": 316, "end": 339}, {"start": 352, "end": 373}]}], "start": 117160282, "end": 117166263, "cdna_coding_start": 1, "cdna_coding_end": 1206, "aliases": []}]}, {"chr": "14", "start": 55034638, "end": 55255662, "name": "ENSG00000262355", "strand": "1", "aliases": ["SAMD4A"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000305831", "exons": [{"start": 55034638, "end": 55034830}, {"start": 55168780, "end": 55169298}, {"start": 55215533, "end": 55215642}, {"start": 55218169, "end": 55218255}, {"start": 55226879, "end": 55227212}, {"start": 55231173, "end": 55231258}, {"start": 55236822, "end": 55236940}, {"start": 55241652, "end": 55241853}, {"start": 55243132, "end": 55243258}, {"start": 55251255, "end": 55251338}, {"start": 55255634, "end": 55255662}], "domains": [{"name": "PF07647", "regions": [{"start": 234, "end": 
292}]}, {"name": "SSF47769", "regions": [{"start": 232, "end": 293}]}, {"name": "PF00536", "regions": [{"start": 235, "end": 292}]}, {"name": "SM00454", "regions": [{"start": 231, "end": 294}]}], "start": 55034638, "end": 55255662, "cdna_coding_start": 1, "cdna_coding_end": 1890, "aliases": []}]}, {"chr": "MT", "start": 3307, "end": 4262, "name": "ENSG00000198888", "strand": "1", "aliases": ["MT-ND1"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361390", "exons": [{"start": 3307, "end": 4262}], "domains": [{"name": "PF00146", "regions": [{"start": 2, "end": 308}]}], "start": 3307, "end": 4262, "cdna_coding_start": 1, "cdna_coding_end": 956, "aliases": []}]}, {"chr": "MT", "start": 4470, "end": 5511, "name": "ENSG00000198763", "strand": "1", "aliases": ["MT-ND2"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361453", "exons": [{"start": 4470, "end": 5511}], "domains": [{"name": "PF06444", "regions": [{"start": 290, "end": 345}]}, {"name": "PR01436", "regions": [{"start": 159, "end": 172}, {"start": 183, "end": 196}, {"start": 202, "end": 220}, {"start": 242, "end": 254}, {"start": 274, "end": 293}]}, {"name": "PF00361", "regions": [{"start": 23, "end": 268}]}], "start": 4470, "end": 5511, "cdna_coding_start": 1, "cdna_coding_end": 1042, "aliases": []}]}, {"chr": "MT", "start": 5904, "end": 7445, "name": "ENSG00000198804", "strand": "1", "aliases": ["MT-CO1"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361624", "exons": [{"start": 5904, "end": 7445}], "domains": [{"name": "PR01165", "regions": [{"start": 5, "end": 30}, {"start": 52, "end": 75}, {"start": 80, "end": 104}, {"start": 122, "end": 134}, {"start": 154, "end": 172}, {"start": 183, "end": 202}, {"start": 234, "end": 255}, {"start": 281, "end": 296}, {"start": 305, "end": 326}, {"start": 340, "end": 358}, {"start": 368, "end": 387}, {"start": 418, "end": 439}]}, {"name": "PS50855", "regions": [{"start": 1, "end": 511}]}, {"name": "SSF81442", "regions": 
[{"start": 1, "end": 513}]}, {"name": "PF00115", "regions": [{"start": 12, "end": 460}]}], "start": 5904, "end": 7445, "cdna_coding_start": 1, "cdna_coding_end": 1542, "aliases": []}]}, {"chr": "MT", "start": 7586, "end": 8269, "name": "ENSG00000198712", "strand": "1", "aliases": ["MT-CO2"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361739", "exons": [{"start": 7586, "end": 8269}], "domains": [{"name": "SSF81464", "regions": [{"start": 1, "end": 90}]}, {"name": "PF00116", "regions": [{"start": 95, "end": 213}]}, {"name": "TIGR02866", "regions": [{"start": 13, "end": 215}]}, {"name": "PS50999", "regions": [{"start": 1, "end": 91}]}, {"name": "PS50857", "regions": [{"start": 92, "end": 225}]}, {"name": "SSF49503", "regions": [{"start": 91, "end": 220}]}, {"name": "PF02790", "regions": [{"start": 1, "end": 83}]}, {"name": "PR01166", "regions": [{"start": 57, "end": 69}, {"start": 69, "end": 89}, {"start": 91, "end": 110}, {"start": 134, "end": 155}, {"start": 158, "end": 178}, {"start": 178, "end": 195}, {"start": 196, "end": 213}]}], "start": 7586, "end": 8269, "cdna_coding_start": 1, "cdna_coding_end": 684, "aliases": []}]}, {"chr": "MT", "start": 8366, "end": 8572, "name": "ENSG00000228253", "strand": "1", "aliases": ["MT-ATP8"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361851", "exons": [{"start": 8366, "end": 8572}], "domains": [{"name": "PF00895", "regions": [{"start": 1, "end": 56}]}], "start": 8366, "end": 8572, "cdna_coding_start": 1, "cdna_coding_end": 207, "aliases": []}]}, {"chr": "MT", "start": 8527, "end": 9207, "name": "ENSG00000198899", "strand": "1", "aliases": ["MT-ATP6"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361899", "exons": [{"start": 8527, "end": 9207}], "domains": [{"name": "SSF81336", "regions": [{"start": 62, "end": 223}]}, {"name": "PF00119", "regions": [{"start": 21, "end": 224}]}, {"name": "TIGR01131", "regions": [{"start": 9, "end": 225}]}, {"name": "PR00123", "regions": 
[{"start": 71, "end": 87}, {"start": 132, "end": 147}, {"start": 152, "end": 174}, {"start": 209, "end": 224}]}], "start": 8527, "end": 9207, "cdna_coding_start": 1, "cdna_coding_end": 681, "aliases": []}]}, {"chr": "MT", "start": 9207, "end": 9990, "name": "ENSG00000198938", "strand": "1", "aliases": ["MT-CO3"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000362079", "exons": [{"start": 9207, "end": 9990}], "domains": [{"name": "PF00510", "regions": [{"start": 6, "end": 261}]}, {"name": "SSF81452", "regions": [{"start": 1, "end": 260}]}, {"name": "PS50253", "regions": [{"start": 4, "end": 261}]}], "start": 9207, "end": 9990, "cdna_coding_start": 1, "cdna_coding_end": 784, "aliases": []}]}, {"chr": "MT", "start": 10059, "end": 10404, "name": "ENSG00000198840", "strand": "1", "aliases": ["MT-ND3"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361227", "exons": [{"start": 10059, "end": 10404}], "domains": [{"name": "PF00507", "regions": [{"start": 13, "end": 113}]}], "start": 10059, "end": 10404, "cdna_coding_start": 1, "cdna_coding_end": 346, "aliases": []}]}, {"chr": "MT", "start": 10470, "end": 10766, "name": "ENSG00000212907", "strand": "1", "aliases": ["MT-ND4L"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361335", "exons": [{"start": 10470, "end": 10766}], "domains": [{"name": "PF00420", "regions": [{"start": 4, "end": 98}]}], "start": 10470, "end": 10766, "cdna_coding_start": 1, "cdna_coding_end": 297, "aliases": []}]}, {"chr": "MT", "start": 10760, "end": 12137, "name": "ENSG00000198886", "strand": "1", "aliases": ["MT-ND4"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361381", "exons": [{"start": 10760, "end": 12137}], "domains": [{"name": "TIGR01972", "regions": [{"start": 59, "end": 456}]}, {"name": "PR01437", "regions": [{"start": 118, "end": 137}, {"start": 149, "end": 173}, {"start": 221, "end": 245}, {"start": 308, "end": 327}, {"start": 360, "end": 386}]}, {"name": "PF00361", 
"regions": [{"start": 112, "end": 383}]}, {"name": "PF01059", "regions": [{"start": 1, "end": 109}]}], "start": 10760, "end": 12137, "cdna_coding_start": 1, "cdna_coding_end": 1378, "aliases": []}]}, {"chr": "MT", "start": 12337, "end": 14148, "name": "ENSG00000198786", "strand": "1", "aliases": ["MT-ND5"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361567", "exons": [{"start": 12337, "end": 14148}], "domains": [{"name": "PF00662", "regions": [{"start": 62, "end": 123}]}, {"name": "TIGR01974", "regions": [{"start": 8, "end": 501}]}, {"name": "PR01434", "regions": [{"start": 83, "end": 108}, {"start": 111, "end": 131}, {"start": 158, "end": 179}, {"start": 219, "end": 240}, {"start": 241, "end": 267}, {"start": 309, "end": 321}, {"start": 412, "end": 431}]}, {"name": "PF00361", "regions": [{"start": 134, "end": 397}]}, {"name": "PF06455", "regions": [{"start": 422, "end": 602}]}], "start": 12337, "end": 14148, "cdna_coding_start": 1, "cdna_coding_end": 1812, "aliases": []}]}, {"chr": "MT", "start": 14149, "end": 14673, "name": "ENSG00000198695", "strand": "-1", "aliases": ["MT-ND6"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361681", "exons": [{"start": 14149, "end": 14673}], "domains": [{"name": "PF00499", "regions": [{"start": 14, "end": 171}]}], "start": 14149, "end": 14673, "cdna_coding_start": 1, "cdna_coding_end": 525, "aliases": []}]}, {"chr": "MT", "start": 14747, "end": 15887, "name": "ENSG00000198727", "strand": "1", "aliases": ["MT-CYB"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361789", "exons": [{"start": 14747, "end": 15887}], "domains": [{"name": "PF00032", "regions": [{"start": 259, "end": 359}]}, {"name": "PS51003", "regions": [{"start": 210, "end": 380}]}, {"name": "PS51002", "regions": [{"start": 1, "end": 209}]}, {"name": "PF00033", "regions": [{"start": 24, "end": 200}]}, {"name": "SSF81342", "regions": [{"start": 1, "end": 260}]}, {"name": "SSF81648", "regions": [{"start": 261, 
"end": 379}]}], "start": 14747, "end": 15887, "cdna_coding_start": 1, "cdna_coding_end": 1141, "aliases": []}]}, {"chr": "HG418_PATCH", "start": 143738824, "end": 143763329, "name": "ENSG00000262023", "strand": "-1", "aliases": [], "transcripts": [{"is_best_transcript": false, "name": "ENST00000585837", "exons": [{"start": 143751293, "end": 143751331}, {"start": 143745839, "end": 143747881}, {"start": 143740183, "end": 143740305}, {"start": 143738824, "end": 143739441}], "domains": [{"name": "SM00674", "regions": [{"start": 83, "end": 149}]}, {"name": "PF03221", "regions": [{"start": 87, "end": 149}]}, {"name": "PF03184", "regions": [{"start": 179, "end": 382}]}, {"name": "SSF46689", "regions": [{"start": 79, "end": 144}, {"start": 13, "end": 76}]}, {"name": "PS50960", "regions": [{"start": 11, "end": 62}]}, {"name": "PF04218", "regions": [{"start": 14, "end": 66}]}], "start": 143738824, "end": 143751331, "cdna_coding_start": 502, "cdna_coding_end": 2172, "aliases": []}, {"is_best_transcript": true, "name": "ENST00000571961", "exons": [{"start": 143751293, "end": 143751355}, {"start": 143745839, "end": 143747881}, {"start": 143739726, "end": 143740305}], "domains": [{"name": "PF03221", "regions": [{"start": 87, "end": 149}]}, {"name": "SM00674", "regions": [{"start": 83, "end": 149}]}, {"name": "PF04218", "regions": [{"start": 14, "end": 66}]}, {"name": "PS50960", "regions": [{"start": 11, "end": 62}]}, {"name": "SSF46689", "regions": [{"start": 79, "end": 144}, {"start": 13, "end": 76}]}, {"name": "PF03184", "regions": [{"start": 179, "end": 382}]}], "start": 143739726, "end": 143751355, "cdna_coding_start": 526, "cdna_coding_end": 2196, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000592696", "exons": [{"start": 143747818, "end": 143747881}, {"start": 143739743, "end": 143747688}], "domains": [{"name": "PF03221", "regions": [{"start": 87, "end": 149}]}, {"name": "SM00674", "regions": [{"start": 83, "end": 149}]}, {"name": "PF04218", "regions": 
[{"start": 14, "end": 66}]}, {"name": "PS50960", "regions": [{"start": 11, "end": 62}]}, {"name": "SSF46689", "regions": [{"start": 79, "end": 144}, {"start": 13, "end": 76}]}, {"name": "PF03184", "regions": [{"start": 179, "end": 382}]}], "start": 143739743, "end": 143747881, "cdna_coding_start": 334, "cdna_coding_end": 2040, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000585713", "exons": [{"start": 143751302, "end": 143751349}, {"start": 143747343, "end": 143747881}], "domains": [], "start": 143747343, "end": 143751349, "cdna_coding_start": 511, "cdna_coding_end": 587, "aliases": []}]}, {"chr": "HG418_PATCH", "start": 143751669, "end": 143764085, "name": "ENSG00000262150", "strand": "1", "aliases": [], "transcripts": [{"is_best_transcript": true, "name": "ENST00000571412", "exons": [{"start": 143761817, "end": 143761924}, {"start": 143762688, "end": 143762795}, {"start": 143763282, "end": 143764085}], "domains": [{"name": "PF00021", "regions": [{"start": 14, "end": 84}]}, {"name": "SM00134", "regions": [{"start": 12, "end": 98}]}, {"name": "SSF57302", "regions": [{"start": 10, "end": 84}]}], "start": 143761817, "end": 143764085, "cdna_coding_start": 84, "cdna_coding_end": 428, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000575167", "exons": [{"start": 143761818, "end": 143761924}, {"start": 143762688, "end": 143763131}], "domains": [{"name": "PF00021", "regions": [{"start": 14, "end": 52}]}], "start": 143761818, "end": 143763131, "cdna_coding_start": 83, "cdna_coding_end": 247, "aliases": []}]}, {"chr": "HG418_PATCH", "start": 143781472, "end": 143786488, "name": "ENSG00000262378", "strand": "1", "aliases": [], "transcripts": [{"is_best_transcript": true, "name": "ENST00000572300", "exons": [{"start": 143781472, "end": 143781991}, {"start": 143782961, "end": 143783074}, {"start": 143784452, "end": 143786488}], "domains": [{"name": "SSF57302", "regions": [{"start": 42, "end": 122}]}], "start": 143781472, "end": 143786488, 
"cdna_coding_start": 418, "cdna_coding_end": 915, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000571924", "exons": [{"start": 143781474, "end": 143781991}, {"start": 143782961, "end": 143783074}, {"start": 143783861, "end": 143784083}], "domains": [], "start": 143781474, "end": 143784083, "cdna_coding_start": 416, "cdna_coding_end": 718, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000574968", "exons": [{"start": 143781475, "end": 143781991}, {"start": 143782961, "end": 143783074}, {"start": 143784484, "end": 143784815}], "domains": [], "start": 143781475, "end": 143784815, "cdna_coding_start": 415, "cdna_coding_end": 732, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000570580", "exons": [{"start": 143781833, "end": 143781991}, {"start": 143782961, "end": 143785525}], "domains": [], "start": 143781833, "end": 143785525, "cdna_coding_start": 57, "cdna_coding_end": 314, "aliases": []}]}, {"chr": "HG418_PATCH", "start": 143808564, "end": 143818288, "name": "ENSG00000263194", "strand": "1", "aliases": [], "transcripts": [{"is_best_transcript": true, "name": "ENST00000575946", "exons": [{"start": 143808564, "end": 143809220}, {"start": 143816687, "end": 143818288}], "domains": [{"name": "SSF54637", "regions": [{"start": 44, "end": 189}]}], "start": 143808564, "end": 143818288, "cdna_coding_start": 145, "cdna_coding_end": 771, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000573810", "exons": [{"start": 143809084, "end": 143809220}, {"start": 143816687, "end": 143817029}, {"start": 143818218, "end": 143818288}], "domains": [{"name": "SSF54637", "regions": [{"start": 2, "end": 64}]}], "start": 143809084, "end": 143818288, "cdna_coding_start": 1, "cdna_coding_end": 251, "aliases": []}]}, {"chr": "12", "start": 175931, "end": 287626, "name": "ENSG00000120645", "strand": "1", "aliases": ["IQSEC3"], "transcripts": [{"is_best_transcript": false, "name": "ENST00000538872", "exons": [{"start": 175931, "end": 
176602}, {"start": 208312, "end": 208380}, {"start": 234799, "end": 235078}, {"start": 247433, "end": 248520}, {"start": 250290, "end": 250451}, {"start": 266191, "end": 266313}, {"start": 266694, "end": 266860}, {"start": 271092, "end": 271231}, {"start": 272660, "end": 272785}, {"start": 274600, "end": 274699}, {"start": 274895, "end": 275056}, {"start": 278179, "end": 278271}, {"start": 280278, "end": 280327}, {"start": 283765, "end": 287626}], "domains": [{"name": "SM00222", "regions": [{"start": 648, "end": 839}]}, {"name": "SSF50729", "regions": [{"start": 845, "end": 983}]}, {"name": "PS50096", "regions": [{"start": 315, "end": 344}]}, {"name": "PS50099", "regions": [{"start": 1061, "end": 1165}]}, {"name": "SSF48425", "regions": [{"start": 647, "end": 844}]}, {"name": "PS50190", "regions": [{"start": 644, "end": 837}]}, {"name": "PF01369", "regions": [{"start": 651, "end": 839}]}], "start": 175931, "end": 287626, "cdna_coding_start": 119, "cdna_coding_end": 3667, "aliases": []}, {"is_best_transcript": true, "name": "ENST00000382841", "exons": [{"start": 186542, "end": 186878}, {"start": 208312, "end": 208380}, {"start": 247433, "end": 248520}, {"start": 250290, "end": 250451}, {"start": 266191, "end": 266313}, {"start": 266694, "end": 266860}, {"start": 271092, "end": 271231}, {"start": 272660, "end": 272785}, {"start": 274600, "end": 274699}, {"start": 274895, "end": 275056}, {"start": 278179, "end": 278271}, {"start": 280278, "end": 280327}, {"start": 280413, "end": 280496}], "domains": [{"name": "PS50190", "regions": [{"start": 341, "end": 534}]}, {"name": "PF01369", "regions": [{"start": 348, "end": 536}]}, {"name": "SM00222", "regions": [{"start": 345, "end": 536}]}, {"name": "SSF50729", "regions": [{"start": 542, "end": 680}]}, {"name": "SSF48425", "regions": [{"start": 344, "end": 541}]}, {"name": "PS50096", "regions": [{"start": 12, "end": 41}]}], "start": 186542, "end": 280496, "cdna_coding_start": 413, "cdna_coding_end": 2692, "aliases": []}, 
{"is_best_transcript": false, "name": "ENST00000326261", "exons": [{"start": 176049, "end": 176602}, {"start": 208312, "end": 208380}, {"start": 234799, "end": 235078}, {"start": 247433, "end": 248520}, {"start": 250290, "end": 250451}, {"start": 266191, "end": 266313}, {"start": 266694, "end": 266860}, {"start": 271092, "end": 271231}, {"start": 272660, "end": 272785}, {"start": 274600, "end": 274699}, {"start": 274895, "end": 275056}, {"start": 278179, "end": 278271}, {"start": 280278, "end": 280327}, {"start": 283765, "end": 286390}, {"start": 286498, "end": 287620}], "domains": [{"name": "PF01369", "regions": [{"start": 651, "end": 839}]}, {"name": "PS50190", "regions": [{"start": 644, "end": 837}]}, {"name": "SSF48425", "regions": [{"start": 647, "end": 844}]}, {"name": "PS50096", "regions": [{"start": 315, "end": 344}]}, {"name": "PS50099", "regions": [{"start": 1061, "end": 1165}]}, {"name": "SM00222", "regions": [{"start": 648, "end": 839}]}, {"name": "SSF50729", "regions": [{"start": 845, "end": 983}]}], "start": 176049, "end": 287620, "cdna_coding_start": 1, "cdna_coding_end": 3549, "aliases": []}]}, {"chr": "12", "start": 299243, "end": 323736, "name": "ENSG00000111181", "strand": "-1", "aliases": ["SLC6A12"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000359674", "exons": [{"start": 322677, "end": 322863}, {"start": 321192, "end": 321276}, {"start": 318939, "end": 319209}, {"start": 313730, "end": 313864}, {"start": 311906, "end": 312046}, {"start": 310928, "end": 311015}, {"start": 309817, "end": 309949}, {"start": 307963, "end": 308097}, {"start": 307066, "end": 307169}, {"start": 306543, "end": 306667}, {"start": 305936, "end": 306048}, {"start": 305290, "end": 305427}, {"start": 304391, "end": 304493}, {"start": 302443, "end": 302543}, {"start": 301644, "end": 301814}, {"start": 299243, "end": 300377}], "domains": [{"name": "PR00176", "regions": [{"start": 44, "end": 65}, {"start": 73, "end": 92}, {"start": 117, "end": 143}, 
{"start": 238, "end": 255}, {"start": 320, "end": 340}, {"start": 374, "end": 393}, {"start": 458, "end": 478}, {"start": 498, "end": 518}]}, {"name": "PR01198", "regions": [{"start": 4, "end": 17}, {"start": 565, "end": 582}, {"start": 599, "end": 611}]}, {"name": "PF00209", "regions": [{"start": 36, "end": 560}]}, {"name": "PS50267", "regions": [{"start": 35, "end": 563}]}], "start": 299243, "end": 322863, "cdna_coding_start": 330, "cdna_coding_end": 2174, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000424061", "exons": [{"start": 322456, "end": 322504}, {"start": 321192, "end": 321276}, {"start": 319371, "end": 319734}, {"start": 318939, "end": 319209}, {"start": 313730, "end": 313864}, {"start": 311906, "end": 312046}, {"start": 310928, "end": 311015}, {"start": 309817, "end": 309949}, {"start": 307963, "end": 308097}, {"start": 307066, "end": 307169}, {"start": 306543, "end": 306667}, {"start": 305936, "end": 306048}, {"start": 305290, "end": 305427}, {"start": 304391, "end": 304493}, {"start": 302443, "end": 302543}, {"start": 301644, "end": 301814}, {"start": 299302, "end": 300377}], "domains": [{"name": "PR00176", "regions": [{"start": 44, "end": 65}, {"start": 73, "end": 92}, {"start": 117, "end": 143}, {"start": 238, "end": 255}, {"start": 320, "end": 340}, {"start": 374, "end": 393}, {"start": 458, "end": 478}, {"start": 498, "end": 518}]}, {"name": "PR01198", "regions": [{"start": 4, "end": 17}, {"start": 565, "end": 582}, {"start": 599, "end": 611}]}, {"name": "PF00209", "regions": [{"start": 36, "end": 560}]}, {"name": "PS50267", "regions": [{"start": 35, "end": 563}]}], "start": 299302, "end": 322504, "cdna_coding_start": 556, "cdna_coding_end": 2400, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000536824", "exons": [{"start": 323088, "end": 323286}, {"start": 318939, "end": 319209}, {"start": 313730, "end": 313864}, {"start": 311906, "end": 312046}, {"start": 310928, "end": 311015}, {"start": 309817, "end": 309949}, 
{"start": 307963, "end": 308097}, {"start": 307066, "end": 307169}, {"start": 306543, "end": 306667}, {"start": 305936, "end": 306048}, {"start": 305290, "end": 305427}, {"start": 304391, "end": 304493}, {"start": 302443, "end": 302543}, {"start": 301644, "end": 301814}, {"start": 300234, "end": 300377}], "domains": [{"name": "PS50267", "regions": [{"start": 35, "end": 563}]}, {"name": "PR00176", "regions": [{"start": 44, "end": 65}, {"start": 73, "end": 92}, {"start": 117, "end": 143}, {"start": 238, "end": 255}, {"start": 320, "end": 340}, {"start": 374, "end": 393}, {"start": 458, "end": 478}, {"start": 498, "end": 518}]}, {"name": "PR01198", "regions": [{"start": 4, "end": 17}, {"start": 565, "end": 582}, {"start": 599, "end": 611}]}, {"name": "PF00209", "regions": [{"start": 36, "end": 560}]}], "start": 300234, "end": 323286, "cdna_coding_start": 257, "cdna_coding_end": 2101, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000537793", "exons": [{"start": 320301, "end": 320382}, {"start": 318939, "end": 319209}, {"start": 313730, "end": 313864}, {"start": 311958, "end": 312046}], "domains": [{"name": "PR00176", "regions": [{"start": 44, "end": 65}, {"start": 73, "end": 92}, {"start": 117, "end": 143}]}, {"name": "PF00209", "regions": [{"start": 36, "end": 146}]}, {"name": "PS50267", "regions": [{"start": 35, "end": 146}]}], "start": 311958, "end": 320382, "cdna_coding_start": 140, "cdna_coding_end": 577, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000535347", "exons": [{"start": 323088, "end": 323256}, {"start": 321192, "end": 321276}, {"start": 319057, "end": 319209}, {"start": 313730, "end": 313864}, {"start": 312035, "end": 312046}], "domains": [], "start": 312035, "end": 323256, "cdna_coding_start": 312, "cdna_coding_end": 554, "aliases": []}]}]} +{"genes": [{"chr": "15", "start": 63889592, "end": 63893885, "name": "ENSG00000259662", "strand": "+", "aliases": [], "transcripts": [{"is_best_transcript": true, "name": 
"ENST00000539570", "exons": [{"start": 63889592, "end": 63889944}, {"start": 63893495, "end": 63893885}], "domains": [{"name": "SSF81383", "regions": [{"start": 9, "end": 49}]}], "start": 63889592, "end": 63893885, "cdna_coding_start": 1, "cdna_coding_end": 744, "aliases": []}]}, {"chr": "14", "start": 102027834, "end": 102028748, "name": "ENSG00000258865", "strand": "+", "aliases": ["DIO3"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000510508", "exons": [{"start": 102027834, "end": 102028748}], "domains": [{"name": "PF00837", "regions": [{"start": 38, "end": 293}]}, {"name": "SSF52833", "regions": [{"start": 125, "end": 198}]}], "start": 102027834, "end": 102028748, "cdna_coding_start": 1, "cdna_coding_end": 915, "aliases": []}]}, {"chr": "X", "start": 49364778, "end": 49370618, "name": "ENSG00000255738", "strand": "+", "aliases": ["GAGE4"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000381700", "exons": [{"start": 49364778, "end": 49364861}, {"start": 49365327, "end": 49365447}, {"start": 49368271, "end": 49368396}, {"start": 49370596, "end": 49370618}], "domains": [{"name": "PF05831", "regions": [{"start": 1, "end": 116}]}], "start": 49364778, "end": 49370618, "cdna_coding_start": 1, "cdna_coding_end": 354, "aliases": []}]}, {"chr": "10", "start": 89621708, "end": 89622244, "name": "ENSG00000227268", "strand": "-", "aliases": ["KLLN"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000445946", "exons": [{"start": 89621708, "end": 89622244}], "domains": [], "start": 89621708, "end": 89622244, "cdna_coding_start": 1, "cdna_coding_end": 537, "aliases": []}]}, {"chr": "19", "start": 50193095, "end": 50193750, "name": "ENSG00000224420", "strand": "+", "aliases": ["ADM5"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000420022", "exons": [{"start": 50193095, "end": 50193168}, {"start": 50193363, "end": 50193750}], "domains": [], "start": 50193095, "end": 50193750, "cdna_coding_start": 1, "cdna_coding_end": 
462, "aliases": []}]}, {"chr": "4", "start": 69056959, "end": 69083631, "name": "ENSG00000226894", "strand": "-", "aliases": ["TMPRSS11BNL"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000432593", "exons": [{"start": 69083624, "end": 69083631}, {"start": 69078080, "end": 69078195}, {"start": 69057125, "end": 69057242}, {"start": 69056959, "end": 69057034}], "domains": [{"name": "SSF82671", "regions": [{"start": 35, "end": 87}]}], "start": 69056959, "end": 69083631, "cdna_coding_start": 1, "cdna_coding_end": 318, "aliases": []}]}, {"chr": "1", "start": 179833916, "end": 179834311, "name": "ENSG00000258664", "strand": "-", "aliases": [], "transcripts": [{"is_best_transcript": true, "name": "ENST00000553856", "exons": [{"start": 179833916, "end": 179834311}], "domains": [], "start": 179833916, "end": 179834311, "cdna_coding_start": 1, "cdna_coding_end": 396, "aliases": []}]}, {"chr": "19", "start": 8959608, "end": 9091814, "name": "ENSG00000181143", "strand": "-", "aliases": ["MUC16"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000397910", "exons": [{"start": 9082340, "end": 9091814}, {"start": 9080451, "end": 9080555}, {"start": 9056173, "end": 9077865}, {"start": 9054235, "end": 9054348}, {"start": 9045564, "end": 9050243}, {"start": 9043426, "end": 9043461}, {"start": 9038380, "end": 9038415}, {"start": 9038077, "end": 9038136}, {"start": 9033610, "end": 9033737}, {"start": 9033231, "end": 9033298}, {"start": 9028224, "end": 9028396}, {"start": 9027540, "end": 9027575}, {"start": 9027216, "end": 9027281}, {"start": 9026191, "end": 9026315}, {"start": 9025591, "end": 9025658}, {"start": 9024826, "end": 9024998}, {"start": 9024461, "end": 9024496}, {"start": 9024134, "end": 9024199}, {"start": 9021054, "end": 9021184}, {"start": 9020765, "end": 9020832}, {"start": 9019985, "end": 9020157}, {"start": 9019600, "end": 9019635}, {"start": 9019275, "end": 9019340}, {"start": 9018437, "end": 9018561}, {"start": 9018133, "end": 9018200}, 
{"start": 9017346, "end": 9017518}, {"start": 9016981, "end": 9017016}, {"start": 9016660, "end": 9016722}, {"start": 9015621, "end": 9015745}, {"start": 9015318, "end": 9015385}, {"start": 9014532, "end": 9014704}, {"start": 9014169, "end": 9014204}, {"start": 9013845, "end": 9013910}, {"start": 9012774, "end": 9012898}, {"start": 9012468, "end": 9012535}, {"start": 9011322, "end": 9011494}, {"start": 9010971, "end": 9011006}, {"start": 9010648, "end": 9010713}, {"start": 9009588, "end": 9009712}, {"start": 9009267, "end": 9009334}, {"start": 9008173, "end": 9008345}, {"start": 9007809, "end": 9007844}, {"start": 9007487, "end": 9007552}, {"start": 9006642, "end": 9006766}, {"start": 9006344, "end": 9006411}, {"start": 9005559, "end": 9005731}, {"start": 9005194, "end": 9005229}, {"start": 9004869, "end": 9004934}, {"start": 9003566, "end": 9003690}, {"start": 9003293, "end": 9003360}, {"start": 9002501, "end": 9002673}, {"start": 9002153, "end": 9002188}, {"start": 9001831, "end": 9001896}, {"start": 9000442, "end": 9000566}, {"start": 9000147, "end": 9000214}, {"start": 8999392, "end": 8999564}, {"start": 8999025, "end": 8999060}, {"start": 8998698, "end": 8998763}, {"start": 8997412, "end": 8997536}, {"start": 8997118, "end": 8997185}, {"start": 8996321, "end": 8996493}, {"start": 8995954, "end": 8995989}, {"start": 8995635, "end": 8995700}, {"start": 8994417, "end": 8994538}, {"start": 8994142, "end": 8994209}, {"start": 8993373, "end": 8993545}, {"start": 8993007, "end": 8993042}, {"start": 8987210, "end": 8987334}, {"start": 8987045, "end": 8987112}, {"start": 8982157, "end": 8982329}, {"start": 8979217, "end": 8979252}, {"start": 8977640, "end": 8977690}, {"start": 8976739, "end": 8976860}, {"start": 8976581, "end": 8976648}, {"start": 8976260, "end": 8976432}, {"start": 8973972, "end": 8974102}, {"start": 8973549, "end": 8973616}, {"start": 8971676, "end": 8971824}, {"start": 8969276, "end": 8969427}, {"start": 8968880, "end": 8968947}, {"start": 8966650, 
"end": 8966816}, {"start": 8962354, "end": 8962395}, {"start": 8961952, "end": 8962031}, {"start": 8959608, "end": 8959706}], "domains": [{"name": "PS50324", "regions": [{"start": 466, "end": 934}, {"start": 1638, "end": 3054}, {"start": 3899, "end": 4383}, {"start": 5333, "end": 5532}, {"start": 5907, "end": 5999}, {"start": 7083, "end": 10394}]}, {"name": "SSF48726", "regions": [{"start": 3813, "end": 11805}]}, {"name": "SSF82671", "regions": [{"start": 12377, "end": 12509}, {"start": 13312, "end": 13444}, {"start": 13468, "end": 13600}, {"start": 13000, "end": 13132}, {"start": 12688, "end": 12820}, {"start": 13624, "end": 13756}, {"start": 12533, "end": 12665}, {"start": 13156, "end": 13288}, {"start": 12844, "end": 12976}, {"start": 12219, "end": 12351}, {"start": 13780, "end": 13911}, {"start": 12063, "end": 12195}, {"start": 14064, "end": 14195}, {"start": 13913, "end": 14045}, {"start": 14310, "end": 14440}, {"start": 14202, "end": 14308}]}, {"name": "SM00200", "regions": [{"start": 12068, "end": 12199}, {"start": 12226, "end": 12349}, {"start": 12384, "end": 12513}, {"start": 12851, "end": 12982}, {"start": 13007, "end": 13138}, {"start": 13471, "end": 13604}, {"start": 14314, "end": 14444}]}, {"name": "PS50024", "regions": [{"start": 12068, "end": 12133}, {"start": 13007, "end": 13070}, {"start": 13472, "end": 13538}, {"start": 13628, "end": 13694}, {"start": 14314, "end": 14380}]}, {"name": "PS50325", "regions": [{"start": 14, "end": 12083}]}, {"name": "PF01390", "regions": [{"start": 12074, "end": 12175}, {"start": 12231, "end": 12330}, {"start": 12388, "end": 12489}, {"start": 12544, "end": 12646}, {"start": 12699, "end": 12801}, {"start": 12855, "end": 12952}, {"start": 13011, "end": 13113}, {"start": 13167, "end": 13271}, {"start": 13324, "end": 13427}, {"start": 13479, "end": 13583}, {"start": 13635, "end": 13739}, {"start": 13793, "end": 13894}, {"start": 13922, "end": 14030}, {"start": 14076, "end": 14175}, {"start": 14202, "end": 14271}, 
{"start": 14319, "end": 14423}]}], "start": 8959608, "end": 9091814, "cdna_coding_start": 1, "cdna_coding_end": 43524, "aliases": []}]}, {"chr": "3", "start": 157815816, "end": 157823813, "name": "ENSG00000258518", "strand": "-", "aliases": [], "transcripts": [{"is_best_transcript": true, "name": "ENST00000483851", "exons": [{"start": 157823468, "end": 157823813}, {"start": 157820467, "end": 157820675}, {"start": 157818043, "end": 157818100}, {"start": 157817649, "end": 157817737}, {"start": 157815816, "end": 157816073}], "domains": [{"name": "PS50071", "regions": [{"start": 138, "end": 198}]}, {"name": "PF00046", "regions": [{"start": 141, "end": 197}]}, {"name": "PS50310", "regions": [{"start": 244, "end": 316}]}, {"name": "PS50316", "regions": [{"start": 247, "end": 259}]}, {"name": "SSF46689", "regions": [{"start": 131, "end": 209}]}, {"name": "SM00389", "regions": [{"start": 140, "end": 202}]}, {"name": "PF03826", "regions": [{"start": 298, "end": 316}]}, {"name": "PS50315", "regions": [{"start": 60, "end": 90}]}, {"name": "PS50803", "regions": [{"start": 301, "end": 314}]}, {"name": "PR00031", "regions": [{"start": 169, "end": 178}, {"start": 178, "end": 194}]}], "start": 157815816, "end": 157823813, "cdna_coding_start": 1, "cdna_coding_end": 960, "aliases": []}]}, {"chr": "10", "start": 225953, "end": 295049, "name": "ENSG00000259741", "strand": "+", "aliases": [], "transcripts": [{"is_best_transcript": true, "name": "ENST00000558098", "exons": [{"start": 225953, "end": 226068}, {"start": 255829, "end": 255988}, {"start": 267135, "end": 267296}, {"start": 282778, "end": 282855}, {"start": 283525, "end": 283617}, {"start": 285378, "end": 285465}, {"start": 285996, "end": 286051}, {"start": 286833, "end": 286910}, {"start": 287961, "end": 288079}, {"start": 292706, "end": 292913}, {"start": 293338, "end": 293406}, {"start": 294276, "end": 294548}, {"start": 294843, "end": 295049}], "domains": [{"name": "PS50812", "regions": [{"start": 280, "end": 331}]}, 
{"name": "PS50016", "regions": [{"start": 100, "end": 148}]}, {"name": "SM00297", "regions": [{"start": 151, "end": 257}]}, {"name": "PF00855", "regions": [{"start": 278, "end": 342}]}, {"name": "SSF57903", "regions": [{"start": 86, "end": 151}]}, {"name": "SSF47370", "regions": [{"start": 130, "end": 268}]}, {"name": "SM00293", "regions": [{"start": 278, "end": 329}]}, {"name": "SM00249", "regions": [{"start": 102, "end": 146}]}, {"name": "PS50014", "regions": [{"start": 186, "end": 238}]}, {"name": "PF00439", "regions": [{"start": 182, "end": 242}]}, {"name": "SSF63748", "regions": [{"start": 270, "end": 396}]}], "start": 225953, "end": 295049, "cdna_coding_start": 1, "cdna_coding_end": 1707, "aliases": []}]}, {"chr": "11", "start": 117160282, "end": 117166263, "name": "ENSG00000265969", "strand": "-", "aliases": [], "transcripts": [{"is_best_transcript": true, "name": "ENST00000292095", "exons": [{"start": 117166214, "end": 117166263}, {"start": 117165847, "end": 117166063}, {"start": 117164587, "end": 117164724}, {"start": 117163770, "end": 117163904}, {"start": 117162428, "end": 117162529}, {"start": 117161616, "end": 117161765}, {"start": 117161204, "end": 117161375}, {"start": 117160282, "end": 117160523}], "domains": [{"name": "SSF50630", "regions": [{"start": 18, "end": 346}]}, {"name": "PF00026", "regions": [{"start": 17, "end": 316}]}, {"name": "PR01816", "regions": [{"start": 15, "end": 25}, {"start": 112, "end": 132}, {"start": 260, "end": 274}, {"start": 315, "end": 327}, {"start": 334, "end": 353}, {"start": 355, "end": 375}]}, {"name": "PR00792", "regions": [{"start": 133, "end": 146}, {"start": 186, "end": 197}, {"start": 292, "end": 307}]}, {"name": "PR01815", "regions": [{"start": 47, "end": 70}, {"start": 168, "end": 187}, {"start": 218, "end": 241}, {"start": 256, "end": 270}, {"start": 316, "end": 339}, {"start": 352, "end": 373}]}], "start": 117160282, "end": 117166263, "cdna_coding_start": 1, "cdna_coding_end": 1206, "aliases": []}]}, 
{"chr": "14", "start": 55034638, "end": 55255662, "name": "ENSG00000262355", "strand": "+", "aliases": ["SAMD4A"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000305831", "exons": [{"start": 55034638, "end": 55034830}, {"start": 55168780, "end": 55169298}, {"start": 55215533, "end": 55215642}, {"start": 55218169, "end": 55218255}, {"start": 55226879, "end": 55227212}, {"start": 55231173, "end": 55231258}, {"start": 55236822, "end": 55236940}, {"start": 55241652, "end": 55241853}, {"start": 55243132, "end": 55243258}, {"start": 55251255, "end": 55251338}, {"start": 55255634, "end": 55255662}], "domains": [{"name": "PF07647", "regions": [{"start": 234, "end": 292}]}, {"name": "SSF47769", "regions": [{"start": 232, "end": 293}]}, {"name": "PF00536", "regions": [{"start": 235, "end": 292}]}, {"name": "SM00454", "regions": [{"start": 231, "end": 294}]}], "start": 55034638, "end": 55255662, "cdna_coding_start": 1, "cdna_coding_end": 1890, "aliases": []}]}, {"chr": "MT", "start": 3307, "end": 4262, "name": "ENSG00000198888", "strand": "+", "aliases": ["MT-ND1"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361390", "exons": [{"start": 3307, "end": 4262}], "domains": [{"name": "PF00146", "regions": [{"start": 2, "end": 308}]}], "start": 3307, "end": 4262, "cdna_coding_start": 1, "cdna_coding_end": 956, "aliases": []}]}, {"chr": "MT", "start": 4470, "end": 5511, "name": "ENSG00000198763", "strand": "+", "aliases": ["MT-ND2"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361453", "exons": [{"start": 4470, "end": 5511}], "domains": [{"name": "PF06444", "regions": [{"start": 290, "end": 345}]}, {"name": "PR01436", "regions": [{"start": 159, "end": 172}, {"start": 183, "end": 196}, {"start": 202, "end": 220}, {"start": 242, "end": 254}, {"start": 274, "end": 293}]}, {"name": "PF00361", "regions": [{"start": 23, "end": 268}]}], "start": 4470, "end": 5511, "cdna_coding_start": 1, "cdna_coding_end": 1042, "aliases": []}]}, 
{"chr": "MT", "start": 5904, "end": 7445, "name": "ENSG00000198804", "strand": "+", "aliases": ["MT-CO1"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361624", "exons": [{"start": 5904, "end": 7445}], "domains": [{"name": "PR01165", "regions": [{"start": 5, "end": 30}, {"start": 52, "end": 75}, {"start": 80, "end": 104}, {"start": 122, "end": 134}, {"start": 154, "end": 172}, {"start": 183, "end": 202}, {"start": 234, "end": 255}, {"start": 281, "end": 296}, {"start": 305, "end": 326}, {"start": 340, "end": 358}, {"start": 368, "end": 387}, {"start": 418, "end": 439}]}, {"name": "PS50855", "regions": [{"start": 1, "end": 511}]}, {"name": "SSF81442", "regions": [{"start": 1, "end": 513}]}, {"name": "PF00115", "regions": [{"start": 12, "end": 460}]}], "start": 5904, "end": 7445, "cdna_coding_start": 1, "cdna_coding_end": 1542, "aliases": []}]}, {"chr": "MT", "start": 7586, "end": 8269, "name": "ENSG00000198712", "strand": "+", "aliases": ["MT-CO2"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361739", "exons": [{"start": 7586, "end": 8269}], "domains": [{"name": "SSF81464", "regions": [{"start": 1, "end": 90}]}, {"name": "PF00116", "regions": [{"start": 95, "end": 213}]}, {"name": "TIGR02866", "regions": [{"start": 13, "end": 215}]}, {"name": "PS50999", "regions": [{"start": 1, "end": 91}]}, {"name": "PS50857", "regions": [{"start": 92, "end": 225}]}, {"name": "SSF49503", "regions": [{"start": 91, "end": 220}]}, {"name": "PF02790", "regions": [{"start": 1, "end": 83}]}, {"name": "PR01166", "regions": [{"start": 57, "end": 69}, {"start": 69, "end": 89}, {"start": 91, "end": 110}, {"start": 134, "end": 155}, {"start": 158, "end": 178}, {"start": 178, "end": 195}, {"start": 196, "end": 213}]}], "start": 7586, "end": 8269, "cdna_coding_start": 1, "cdna_coding_end": 684, "aliases": []}]}, {"chr": "MT", "start": 8366, "end": 8572, "name": "ENSG00000228253", "strand": "+", "aliases": ["MT-ATP8"], "transcripts": [{"is_best_transcript": 
true, "name": "ENST00000361851", "exons": [{"start": 8366, "end": 8572}], "domains": [{"name": "PF00895", "regions": [{"start": 1, "end": 56}]}], "start": 8366, "end": 8572, "cdna_coding_start": 1, "cdna_coding_end": 207, "aliases": []}]}, {"chr": "MT", "start": 8527, "end": 9207, "name": "ENSG00000198899", "strand": "+", "aliases": ["MT-ATP6"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361899", "exons": [{"start": 8527, "end": 9207}], "domains": [{"name": "SSF81336", "regions": [{"start": 62, "end": 223}]}, {"name": "PF00119", "regions": [{"start": 21, "end": 224}]}, {"name": "TIGR01131", "regions": [{"start": 9, "end": 225}]}, {"name": "PR00123", "regions": [{"start": 71, "end": 87}, {"start": 132, "end": 147}, {"start": 152, "end": 174}, {"start": 209, "end": 224}]}], "start": 8527, "end": 9207, "cdna_coding_start": 1, "cdna_coding_end": 681, "aliases": []}]}, {"chr": "MT", "start": 9207, "end": 9990, "name": "ENSG00000198938", "strand": "+", "aliases": ["MT-CO3"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000362079", "exons": [{"start": 9207, "end": 9990}], "domains": [{"name": "PF00510", "regions": [{"start": 6, "end": 261}]}, {"name": "SSF81452", "regions": [{"start": 1, "end": 260}]}, {"name": "PS50253", "regions": [{"start": 4, "end": 261}]}], "start": 9207, "end": 9990, "cdna_coding_start": 1, "cdna_coding_end": 784, "aliases": []}]}, {"chr": "MT", "start": 10059, "end": 10404, "name": "ENSG00000198840", "strand": "+", "aliases": ["MT-ND3"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361227", "exons": [{"start": 10059, "end": 10404}], "domains": [{"name": "PF00507", "regions": [{"start": 13, "end": 113}]}], "start": 10059, "end": 10404, "cdna_coding_start": 1, "cdna_coding_end": 346, "aliases": []}]}, {"chr": "MT", "start": 10470, "end": 10766, "name": "ENSG00000212907", "strand": "+", "aliases": ["MT-ND4L"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361335", "exons": 
[{"start": 10470, "end": 10766}], "domains": [{"name": "PF00420", "regions": [{"start": 4, "end": 98}]}], "start": 10470, "end": 10766, "cdna_coding_start": 1, "cdna_coding_end": 297, "aliases": []}]}, {"chr": "MT", "start": 10760, "end": 12137, "name": "ENSG00000198886", "strand": "+", "aliases": ["MT-ND4"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361381", "exons": [{"start": 10760, "end": 12137}], "domains": [{"name": "TIGR01972", "regions": [{"start": 59, "end": 456}]}, {"name": "PR01437", "regions": [{"start": 118, "end": 137}, {"start": 149, "end": 173}, {"start": 221, "end": 245}, {"start": 308, "end": 327}, {"start": 360, "end": 386}]}, {"name": "PF00361", "regions": [{"start": 112, "end": 383}]}, {"name": "PF01059", "regions": [{"start": 1, "end": 109}]}], "start": 10760, "end": 12137, "cdna_coding_start": 1, "cdna_coding_end": 1378, "aliases": []}]}, {"chr": "MT", "start": 12337, "end": 14148, "name": "ENSG00000198786", "strand": "+", "aliases": ["MT-ND5"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361567", "exons": [{"start": 12337, "end": 14148}], "domains": [{"name": "PF00662", "regions": [{"start": 62, "end": 123}]}, {"name": "TIGR01974", "regions": [{"start": 8, "end": 501}]}, {"name": "PR01434", "regions": [{"start": 83, "end": 108}, {"start": 111, "end": 131}, {"start": 158, "end": 179}, {"start": 219, "end": 240}, {"start": 241, "end": 267}, {"start": 309, "end": 321}, {"start": 412, "end": 431}]}, {"name": "PF00361", "regions": [{"start": 134, "end": 397}]}, {"name": "PF06455", "regions": [{"start": 422, "end": 602}]}], "start": 12337, "end": 14148, "cdna_coding_start": 1, "cdna_coding_end": 1812, "aliases": []}]}, {"chr": "MT", "start": 14149, "end": 14673, "name": "ENSG00000198695", "strand": "-", "aliases": ["MT-ND6"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361681", "exons": [{"start": 14149, "end": 14673}], "domains": [{"name": "PF00499", "regions": [{"start": 14, "end": 
171}]}], "start": 14149, "end": 14673, "cdna_coding_start": 1, "cdna_coding_end": 525, "aliases": []}]}, {"chr": "MT", "start": 14747, "end": 15887, "name": "ENSG00000198727", "strand": "+", "aliases": ["MT-CYB"], "transcripts": [{"is_best_transcript": true, "name": "ENST00000361789", "exons": [{"start": 14747, "end": 15887}], "domains": [{"name": "PF00032", "regions": [{"start": 259, "end": 359}]}, {"name": "PS51003", "regions": [{"start": 210, "end": 380}]}, {"name": "PS51002", "regions": [{"start": 1, "end": 209}]}, {"name": "PF00033", "regions": [{"start": 24, "end": 200}]}, {"name": "SSF81342", "regions": [{"start": 1, "end": 260}]}, {"name": "SSF81648", "regions": [{"start": 261, "end": 379}]}], "start": 14747, "end": 15887, "cdna_coding_start": 1, "cdna_coding_end": 1141, "aliases": []}]}, {"chr": "HG418_PATCH", "start": 143738824, "end": 143763329, "name": "ENSG00000262023", "strand": "-", "aliases": [], "transcripts": [{"is_best_transcript": false, "name": "ENST00000585837", "exons": [{"start": 143751293, "end": 143751331}, {"start": 143745839, "end": 143747881}, {"start": 143740183, "end": 143740305}, {"start": 143738824, "end": 143739441}], "domains": [{"name": "SM00674", "regions": [{"start": 83, "end": 149}]}, {"name": "PF03221", "regions": [{"start": 87, "end": 149}]}, {"name": "PF03184", "regions": [{"start": 179, "end": 382}]}, {"name": "SSF46689", "regions": [{"start": 79, "end": 144}, {"start": 13, "end": 76}]}, {"name": "PS50960", "regions": [{"start": 11, "end": 62}]}, {"name": "PF04218", "regions": [{"start": 14, "end": 66}]}], "start": 143738824, "end": 143751331, "cdna_coding_start": 502, "cdna_coding_end": 2172, "aliases": []}, {"is_best_transcript": true, "name": "ENST00000571961", "exons": [{"start": 143751293, "end": 143751355}, {"start": 143745839, "end": 143747881}, {"start": 143739726, "end": 143740305}], "domains": [{"name": "PF03221", "regions": [{"start": 87, "end": 149}]}, {"name": "SM00674", "regions": [{"start": 83, "end": 
149}]}, {"name": "PF04218", "regions": [{"start": 14, "end": 66}]}, {"name": "PS50960", "regions": [{"start": 11, "end": 62}]}, {"name": "SSF46689", "regions": [{"start": 79, "end": 144}, {"start": 13, "end": 76}]}, {"name": "PF03184", "regions": [{"start": 179, "end": 382}]}], "start": 143739726, "end": 143751355, "cdna_coding_start": 526, "cdna_coding_end": 2196, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000592696", "exons": [{"start": 143747818, "end": 143747881}, {"start": 143739743, "end": 143747688}], "domains": [{"name": "PF03221", "regions": [{"start": 87, "end": 149}]}, {"name": "SM00674", "regions": [{"start": 83, "end": 149}]}, {"name": "PF04218", "regions": [{"start": 14, "end": 66}]}, {"name": "PS50960", "regions": [{"start": 11, "end": 62}]}, {"name": "SSF46689", "regions": [{"start": 79, "end": 144}, {"start": 13, "end": 76}]}, {"name": "PF03184", "regions": [{"start": 179, "end": 382}]}], "start": 143739743, "end": 143747881, "cdna_coding_start": 334, "cdna_coding_end": 2040, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000585713", "exons": [{"start": 143751302, "end": 143751349}, {"start": 143747343, "end": 143747881}], "domains": [], "start": 143747343, "end": 143751349, "cdna_coding_start": 511, "cdna_coding_end": 587, "aliases": []}]}, {"chr": "HG418_PATCH", "start": 143751669, "end": 143764085, "name": "ENSG00000262150", "strand": "+", "aliases": [], "transcripts": [{"is_best_transcript": true, "name": "ENST00000571412", "exons": [{"start": 143761817, "end": 143761924}, {"start": 143762688, "end": 143762795}, {"start": 143763282, "end": 143764085}], "domains": [{"name": "PF00021", "regions": [{"start": 14, "end": 84}]}, {"name": "SM00134", "regions": [{"start": 12, "end": 98}]}, {"name": "SSF57302", "regions": [{"start": 10, "end": 84}]}], "start": 143761817, "end": 143764085, "cdna_coding_start": 84, "cdna_coding_end": 428, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000575167", "exons": 
[{"start": 143761818, "end": 143761924}, {"start": 143762688, "end": 143763131}], "domains": [{"name": "PF00021", "regions": [{"start": 14, "end": 52}]}], "start": 143761818, "end": 143763131, "cdna_coding_start": 83, "cdna_coding_end": 247, "aliases": []}]}, {"chr": "HG418_PATCH", "start": 143781472, "end": 143786488, "name": "ENSG00000262378", "strand": "+", "aliases": [], "transcripts": [{"is_best_transcript": true, "name": "ENST00000572300", "exons": [{"start": 143781472, "end": 143781991}, {"start": 143782961, "end": 143783074}, {"start": 143784452, "end": 143786488}], "domains": [{"name": "SSF57302", "regions": [{"start": 42, "end": 122}]}], "start": 143781472, "end": 143786488, "cdna_coding_start": 418, "cdna_coding_end": 915, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000571924", "exons": [{"start": 143781474, "end": 143781991}, {"start": 143782961, "end": 143783074}, {"start": 143783861, "end": 143784083}], "domains": [], "start": 143781474, "end": 143784083, "cdna_coding_start": 416, "cdna_coding_end": 718, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000574968", "exons": [{"start": 143781475, "end": 143781991}, {"start": 143782961, "end": 143783074}, {"start": 143784484, "end": 143784815}], "domains": [], "start": 143781475, "end": 143784815, "cdna_coding_start": 415, "cdna_coding_end": 732, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000570580", "exons": [{"start": 143781833, "end": 143781991}, {"start": 143782961, "end": 143785525}], "domains": [], "start": 143781833, "end": 143785525, "cdna_coding_start": 57, "cdna_coding_end": 314, "aliases": []}]}, {"chr": "HG418_PATCH", "start": 143808564, "end": 143818288, "name": "ENSG00000263194", "strand": "+", "aliases": [], "transcripts": [{"is_best_transcript": true, "name": "ENST00000575946", "exons": [{"start": 143808564, "end": 143809220}, {"start": 143816687, "end": 143818288}], "domains": [{"name": "SSF54637", "regions": [{"start": 44, "end": 189}]}], 
"start": 143808564, "end": 143818288, "cdna_coding_start": 145, "cdna_coding_end": 771, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000573810", "exons": [{"start": 143809084, "end": 143809220}, {"start": 143816687, "end": 143817029}, {"start": 143818218, "end": 143818288}], "domains": [{"name": "SSF54637", "regions": [{"start": 2, "end": 64}]}], "start": 143809084, "end": 143818288, "cdna_coding_start": 1, "cdna_coding_end": 251, "aliases": []}]}, {"chr": "12", "start": 175931, "end": 287626, "name": "ENSG00000120645", "strand": "+", "aliases": ["IQSEC3"], "transcripts": [{"is_best_transcript": false, "name": "ENST00000538872", "exons": [{"start": 175931, "end": 176602}, {"start": 208312, "end": 208380}, {"start": 234799, "end": 235078}, {"start": 247433, "end": 248520}, {"start": 250290, "end": 250451}, {"start": 266191, "end": 266313}, {"start": 266694, "end": 266860}, {"start": 271092, "end": 271231}, {"start": 272660, "end": 272785}, {"start": 274600, "end": 274699}, {"start": 274895, "end": 275056}, {"start": 278179, "end": 278271}, {"start": 280278, "end": 280327}, {"start": 283765, "end": 287626}], "domains": [{"name": "SM00222", "regions": [{"start": 648, "end": 839}]}, {"name": "SSF50729", "regions": [{"start": 845, "end": 983}]}, {"name": "PS50096", "regions": [{"start": 315, "end": 344}]}, {"name": "PS50099", "regions": [{"start": 1061, "end": 1165}]}, {"name": "SSF48425", "regions": [{"start": 647, "end": 844}]}, {"name": "PS50190", "regions": [{"start": 644, "end": 837}]}, {"name": "PF01369", "regions": [{"start": 651, "end": 839}]}], "start": 175931, "end": 287626, "cdna_coding_start": 119, "cdna_coding_end": 3667, "aliases": []}, {"is_best_transcript": true, "name": "ENST00000382841", "exons": [{"start": 186542, "end": 186878}, {"start": 208312, "end": 208380}, {"start": 247433, "end": 248520}, {"start": 250290, "end": 250451}, {"start": 266191, "end": 266313}, {"start": 266694, "end": 266860}, {"start": 271092, "end": 271231}, 
{"start": 272660, "end": 272785}, {"start": 274600, "end": 274699}, {"start": 274895, "end": 275056}, {"start": 278179, "end": 278271}, {"start": 280278, "end": 280327}, {"start": 280413, "end": 280496}], "domains": [{"name": "PS50190", "regions": [{"start": 341, "end": 534}]}, {"name": "PF01369", "regions": [{"start": 348, "end": 536}]}, {"name": "SM00222", "regions": [{"start": 345, "end": 536}]}, {"name": "SSF50729", "regions": [{"start": 542, "end": 680}]}, {"name": "SSF48425", "regions": [{"start": 344, "end": 541}]}, {"name": "PS50096", "regions": [{"start": 12, "end": 41}]}], "start": 186542, "end": 280496, "cdna_coding_start": 413, "cdna_coding_end": 2692, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000326261", "exons": [{"start": 176049, "end": 176602}, {"start": 208312, "end": 208380}, {"start": 234799, "end": 235078}, {"start": 247433, "end": 248520}, {"start": 250290, "end": 250451}, {"start": 266191, "end": 266313}, {"start": 266694, "end": 266860}, {"start": 271092, "end": 271231}, {"start": 272660, "end": 272785}, {"start": 274600, "end": 274699}, {"start": 274895, "end": 275056}, {"start": 278179, "end": 278271}, {"start": 280278, "end": 280327}, {"start": 283765, "end": 286390}, {"start": 286498, "end": 287620}], "domains": [{"name": "PF01369", "regions": [{"start": 651, "end": 839}]}, {"name": "PS50190", "regions": [{"start": 644, "end": 837}]}, {"name": "SSF48425", "regions": [{"start": 647, "end": 844}]}, {"name": "PS50096", "regions": [{"start": 315, "end": 344}]}, {"name": "PS50099", "regions": [{"start": 1061, "end": 1165}]}, {"name": "SM00222", "regions": [{"start": 648, "end": 839}]}, {"name": "SSF50729", "regions": [{"start": 845, "end": 983}]}], "start": 176049, "end": 287620, "cdna_coding_start": 1, "cdna_coding_end": 3549, "aliases": []}]}, {"chr": "12", "start": 299243, "end": 323736, "name": "ENSG00000111181", "strand": "-", "aliases": ["SLC6A12"], "transcripts": [{"is_best_transcript": true, "name": 
"ENST00000359674", "exons": [{"start": 322677, "end": 322863}, {"start": 321192, "end": 321276}, {"start": 318939, "end": 319209}, {"start": 313730, "end": 313864}, {"start": 311906, "end": 312046}, {"start": 310928, "end": 311015}, {"start": 309817, "end": 309949}, {"start": 307963, "end": 308097}, {"start": 307066, "end": 307169}, {"start": 306543, "end": 306667}, {"start": 305936, "end": 306048}, {"start": 305290, "end": 305427}, {"start": 304391, "end": 304493}, {"start": 302443, "end": 302543}, {"start": 301644, "end": 301814}, {"start": 299243, "end": 300377}], "domains": [{"name": "PR00176", "regions": [{"start": 44, "end": 65}, {"start": 73, "end": 92}, {"start": 117, "end": 143}, {"start": 238, "end": 255}, {"start": 320, "end": 340}, {"start": 374, "end": 393}, {"start": 458, "end": 478}, {"start": 498, "end": 518}]}, {"name": "PR01198", "regions": [{"start": 4, "end": 17}, {"start": 565, "end": 582}, {"start": 599, "end": 611}]}, {"name": "PF00209", "regions": [{"start": 36, "end": 560}]}, {"name": "PS50267", "regions": [{"start": 35, "end": 563}]}], "start": 299243, "end": 322863, "cdna_coding_start": 330, "cdna_coding_end": 2174, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000424061", "exons": [{"start": 322456, "end": 322504}, {"start": 321192, "end": 321276}, {"start": 319371, "end": 319734}, {"start": 318939, "end": 319209}, {"start": 313730, "end": 313864}, {"start": 311906, "end": 312046}, {"start": 310928, "end": 311015}, {"start": 309817, "end": 309949}, {"start": 307963, "end": 308097}, {"start": 307066, "end": 307169}, {"start": 306543, "end": 306667}, {"start": 305936, "end": 306048}, {"start": 305290, "end": 305427}, {"start": 304391, "end": 304493}, {"start": 302443, "end": 302543}, {"start": 301644, "end": 301814}, {"start": 299302, "end": 300377}], "domains": [{"name": "PR00176", "regions": [{"start": 44, "end": 65}, {"start": 73, "end": 92}, {"start": 117, "end": 143}, {"start": 238, "end": 255}, {"start": 320, "end": 
340}, {"start": 374, "end": 393}, {"start": 458, "end": 478}, {"start": 498, "end": 518}]}, {"name": "PR01198", "regions": [{"start": 4, "end": 17}, {"start": 565, "end": 582}, {"start": 599, "end": 611}]}, {"name": "PF00209", "regions": [{"start": 36, "end": 560}]}, {"name": "PS50267", "regions": [{"start": 35, "end": 563}]}], "start": 299302, "end": 322504, "cdna_coding_start": 556, "cdna_coding_end": 2400, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000536824", "exons": [{"start": 323088, "end": 323286}, {"start": 318939, "end": 319209}, {"start": 313730, "end": 313864}, {"start": 311906, "end": 312046}, {"start": 310928, "end": 311015}, {"start": 309817, "end": 309949}, {"start": 307963, "end": 308097}, {"start": 307066, "end": 307169}, {"start": 306543, "end": 306667}, {"start": 305936, "end": 306048}, {"start": 305290, "end": 305427}, {"start": 304391, "end": 304493}, {"start": 302443, "end": 302543}, {"start": 301644, "end": 301814}, {"start": 300234, "end": 300377}], "domains": [{"name": "PS50267", "regions": [{"start": 35, "end": 563}]}, {"name": "PR00176", "regions": [{"start": 44, "end": 65}, {"start": 73, "end": 92}, {"start": 117, "end": 143}, {"start": 238, "end": 255}, {"start": 320, "end": 340}, {"start": 374, "end": 393}, {"start": 458, "end": 478}, {"start": 498, "end": 518}]}, {"name": "PR01198", "regions": [{"start": 4, "end": 17}, {"start": 565, "end": 582}, {"start": 599, "end": 611}]}, {"name": "PF00209", "regions": [{"start": 36, "end": 560}]}], "start": 300234, "end": 323286, "cdna_coding_start": 257, "cdna_coding_end": 2101, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000537793", "exons": [{"start": 320301, "end": 320382}, {"start": 318939, "end": 319209}, {"start": 313730, "end": 313864}, {"start": 311958, "end": 312046}], "domains": [{"name": "PR00176", "regions": [{"start": 44, "end": 65}, {"start": 73, "end": 92}, {"start": 117, "end": 143}]}, {"name": "PF00209", "regions": [{"start": 36, "end": 
146}]}, {"name": "PS50267", "regions": [{"start": 35, "end": 146}]}], "start": 311958, "end": 320382, "cdna_coding_start": 140, "cdna_coding_end": 577, "aliases": []}, {"is_best_transcript": false, "name": "ENST00000535347", "exons": [{"start": 323088, "end": 323256}, {"start": 321192, "end": 321276}, {"start": 319057, "end": 319209}, {"start": 313730, "end": 313864}, {"start": 312035, "end": 312046}], "domains": [], "start": 312035, "end": 323256, "cdna_coding_start": 312, "cdna_coding_end": 554, "aliases": []}]}]} diff --git a/tests/data/example_genes.json b/tests/data/example_genes.json index 51e5d477..f1a6cf8e 100644 --- a/tests/data/example_genes.json +++ b/tests/data/example_genes.json @@ -7991,6 +7991,8 @@ "transcripts": [{ "name": "ENST00000346085", "is_best_transcript": true, + "start": 157099063, + "end": 157529495, "cdna_coding_end": 6751, "cdna_coding_start": 2, "exons": [ diff --git a/tests/data/mock_annotations.json b/tests/data/mock_annotations.json index 4d4bda28..7e331c50 100644 --- a/tests/data/mock_annotations.json +++ b/tests/data/mock_annotations.json @@ -1 +1 @@ -{"ensembl_version": 69, "genes": [{"name": "ENSG00000128891", "start": "1", "strand": "-1", "end": "36375", "transcripts": [{"is_best_transcript": "false", "exons": [{"end": "36329", "name": "ENSE00002560148", "start": "36294"}, {"end": "34345", "start": "34090", "name": "ENSE00002761197"}, {"end": "28690", "name": "ENSE00002865842", "start": "28534"}, {"name": "ENSE00002539417", "start": "1", "end": "108"}], "start": "1", "name": "ENST00000560305", "end": "36329", "cdna_coding_start": 49, "aliases": [], "domains": [], "cdna_coding_end": 498}, {"start": "2649", "exons": [{"end": "34345", "name": "ENSE00002761197", "start": "34090"}, {"start": "28534", "name": "ENSE00002865842", "end": "28690"}, {"name": "ENSE00002540983", "start": "24913", "end": "25445"}, {"name": "ENSE00002563185", "start": "2649", "end": "3620"}], "name": "ENST00000559153", "is_best_transcript": "false", "domains": 
[], "cdna_coding_end": 570, "end": "34345", "cdna_coding_start": 13, "aliases": []}, {"is_best_transcript": "false", "exons": [{"end": "36310", "name": "ENSE00002537918", "start": "36109"}, {"start": "34090", "name": "ENSE00002761197", "end": "34345"}, {"end": "28690", "start": "28534", "name": "ENSE00002865842"}, {"start": "2659", "name": "ENSE00002565021", "end": "3620"}], "start": "2659", "name": "ENST00000561011", "end": "36310", "cdna_coding_start": 215, "aliases": [], "domains": [], "cdna_coding_end": 619}, {"is_best_transcript": "false", "exons": [{"end": "36307", "name": "ENSE00002562556", "start": "36294"}, {"end": "34345", "name": "ENSE00002761197", "start": "34090"}, {"name": "ENSE00002865842", "start": "28534", "end": "28690"}, {"end": "25445", "name": "ENSE00002555814", "start": "24755"}, {"name": "ENSE00002547244", "start": "5661", "end": "10673"}], "start": "5661", "name": "ENST00000559291", "cdna_coding_start": 27, "end": "36307", "aliases": [], "cdna_coding_end": 584, "domains": []}, {"is_best_transcript": "false", "name": "ENST00000559911", "start": "6218", "exons": [{"end": "36329", "name": "ENSE00002560148", "start": "36294"}, {"end": "34345", "start": "34090", "name": "ENSE00002761197"}, {"start": "28534", "name": "ENSE00002865842", "end": "28690"}, {"name": "ENSE00002551447", "start": "10557", "end": "10673"}, {"name": "ENSE00002544927", "start": "6218", "end": "6328"}], "aliases": [], "end": "36329", "cdna_coding_start": 49, "domains": [], "cdna_coding_end": 660}, {"exons": [{"name": "ENSE00002560148", "start": "36294", "end": "36329"}, {"name": "ENSE00002761197", "start": "34090", "end": "34345"}, {"name": "ENSE00002865842", "start": "28534", "end": "28690"}, {"start": "10547", "name": "ENSE00002549191", "end": "10673"}], "start": "10547", "name": "ENST00000558113", "is_best_transcript": "false", "domains": [], "cdna_coding_end": 570, "aliases": [], "end": "36329", "cdna_coding_start": 49}, {"cdna_coding_end": 832, "domains": [], "aliases": 
["NP_443081.1", "NM_052849.2"], "cdna_coding_start": 275, "end": "36370", "start": "24417", "exons": [{"name": "ENSE00001528957", "start": "36109", "end": "36370"}, {"start": "34090", "name": "ENSE00002761197", "end": "34345"}, {"start": "28534", "name": "ENSE00002865842", "end": "28690"}, {"end": "25445", "start": "24417", "name": "ENSE00001933860"}], "name": "ENST00000358005", "is_best_transcript": "true"}, {"name": "ENST00000416810", "start": "24422", "exons": [{"end": "36375", "start": "36294", "name": "ENSE00002565648"}, {"end": "34345", "start": "34090", "name": "ENSE00002761197"}, {"start": "28534", "name": "ENSE00002865842", "end": "28690"}, {"end": "25445", "start": "24422", "name": "ENSE00001427910"}], "is_best_transcript": "false", "domains": [], "cdna_coding_end": 652, "aliases": ["NP_001074261.1", "NM_001080792.1"], "end": "36375", "cdna_coding_start": 95}, {"name": "ENST00000558750", "start": "25260", "exons": [{"start": "36084", "name": "ENSE00001759032", "end": "36298"}, {"name": "ENSE00002815778", "start": "34090", "end": "34345"}, {"name": "ENSE00002865842", "start": "28534", "end": "28690"}, {"start": "25260", "name": "ENSE00002570697", "end": "25445"}], "is_best_transcript": "false", "domains": [], "cdna_coding_end": 785, "aliases": ["NP_001074260.1", "NM_001080791.1"], "end": "36298", "cdna_coding_start": 201}, {"cdna_coding_start": 98, "end": "36337", "aliases": [], "cdna_coding_end": 349, "domains": [], "is_best_transcript": "false", "exons": [{"end": "36337", "start": "36294", "name": "ENSE00002551707"}, {"end": "28690", "start": "28534", "name": "ENSE00002785665"}, {"end": "25445", "start": "25298", "name": "ENSE00002565968"}], "start": "25298", "name": "ENST00000558918"}, {"cdna_coding_end": 557, "domains": [], "aliases": [], "cdna_coding_start": 47, "end": "36332", "start": "25336", "exons": [{"end": "36332", "start": "36294", "name": "ENSE00002549749"}, {"end": "34340", "name": "ENSE00002571615", "start": "34090"}, {"end": "28690", 
"name": "ENSE00002865842", "start": "28534"}, {"name": "ENSE00002556846", "start": "25336", "end": "25445"}], "name": "ENST00000559103", "is_best_transcript": "false"}, {"cdna_coding_end": 484, "domains": [], "aliases": [], "cdna_coding_start": 80, "end": "36360", "name": "ENST00000558871", "start": "27939", "exons": [{"end": "36360", "name": "ENSE00002572819", "start": "36294"}, {"start": "34090", "name": "ENSE00002761197", "end": "34345"}, {"end": "28690", "start": "27939", "name": "ENSE00002536768"}], "is_best_transcript": "false"}], "aliases": ["C15orf57"], "chr": "gene1"}, {"name": "ENSG00000104147", "start": "1", "transcripts": [{"cdna_coding_end": 750, "domains": [], "cdna_coding_start": 61, "end": "23354", "aliases": ["NP_009211.1", "NM_007280.1"], "exons": [{"end": "23354", "name": "ENSE00000930990", "start": "22973"}, {"end": "22714", "start": "22648", "name": "ENSE00000930989"}, {"start": "10391", "name": "ENSE00000942405", "end": "10513"}, {"start": "4006", "name": "ENSE00000942406", "end": "4087"}, {"end": "582", "name": "ENSE00000884003", "start": "1"}], "start": "1", "name": "ENST00000220514", "is_best_transcript": "true"}, {"is_best_transcript": "false", "name": "ENST00000560640", "start": "495", "exons": [{"end": "23220", "start": "22973", "name": "ENSE00002541746"}, {"name": "ENSE00000930989", "start": "22648", "end": "22714"}, {"end": "4087", "name": "ENSE00000942406", "start": "4006"}, {"start": "495", "name": "ENSE00002548921", "end": "582"}], "aliases": [], "end": "23220", "cdna_coding_start": 1, "domains": [], "cdna_coding_end": 485}], "end": "23354", "strand": "-1", "chr": "gene2", "aliases": ["OIP5"]}, {"chr": "gene2", "aliases": ["NUSAP1"], "transcripts": [{"aliases": ["NP_001230072.1", "NM_001243143.1"], "end": "71783", "cdna_coding_start": 265, "domains": [], "cdna_coding_end": 1545, "is_best_transcript": "false", "start": "23427", "exons": [{"end": "23783", "start": "23427", "name": "ENSE00002541618"}, {"end": "33187", "start": "33119", 
"name": "ENSE00002921169"}, {"end": "39974", "start": "39831", "name": "ENSE00002741972"}, {"end": "41862", "name": "ENSE00002865492", "start": "41766"}, {"end": "46873", "name": "ENSE00002852607", "start": "46772"}, {"name": "ENSE00002899380", "start": "48882", "end": "48991"}, {"end": "56322", "name": "ENSE00002874525", "start": "56135"}, {"end": "62417", "start": "62260", "name": "ENSE00002766779"}, {"name": "ENSE00002819484", "start": "66445", "end": "66561"}, {"end": "68037", "name": "ENSE00002924785", "start": "67929"}, {"end": "71783", "name": "ENSE00002542797", "start": "70826"}], "name": "ENST00000260359"}, {"end": "41819", "cdna_coding_start": 104, "aliases": [], "domains": [], "cdna_coding_end": 334, "is_best_transcript": "false", "start": "23588", "name": "ENST00000559046", "exons": [{"end": "23783", "name": "ENSE00002574418", "start": "23588"}, {"end": "33187", "start": "33119", "name": "ENSE00002921169"}, {"end": "36335", "start": "36219", "name": "ENSE00002559031"}, {"end": "39974", "name": "ENSE00002847051", "start": "39831"}, {"name": "ENSE00002564510", "start": "41766", "end": "41819"}]}, {"aliases": ["NP_001230073.1"], "end": "70996", "cdna_coding_start": 88, "domains": [], "cdna_coding_end": 1224, "is_best_transcript": "false", "start": "23604", "exons": [{"end": "23783", "start": "23604", "name": "ENSE00002312096"}, {"name": "ENSE00002741972", "start": "39831", "end": "39974"}, {"start": "41721", "name": "ENSE00002920477", "end": "41862"}, {"end": "46873", "start": "46772", "name": "ENSE00002852607"}, {"name": "ENSE00002899380", "start": "48882", "end": "48991"}, {"end": "56322", "start": "56138", "name": "ENSE00001668934"}, {"name": "ENSE00002766779", "start": "62260", "end": "62417"}, {"start": "67929", "name": "ENSE00002924785", "end": "68037"}, {"end": "70996", "start": "70826", "name": "ENSE00002254025"}], "name": "ENST00000450592"}, {"cdna_coding_start": 88, "end": "71135", "aliases": ["NP_057443.2", "NM_016359.4"], "cdna_coding_end": 
1413, "domains": [], "is_best_transcript": "false", "start": "23604", "exons": [{"start": "23604", "name": "ENSE00002312096", "end": "23783"}, {"end": "33187", "start": "33119", "name": "ENSE00002921169"}, {"name": "ENSE00002741972", "start": "39831", "end": "39974"}, {"end": "41862", "start": "41721", "name": "ENSE00002920477"}, {"name": "ENSE00002852607", "start": "46772", "end": "46873"}, {"end": "48991", "name": "ENSE00002899380", "start": "48882"}, {"end": "56322", "start": "56135", "name": "ENSE00002874525"}, {"name": "ENSE00002766779", "start": "62260", "end": "62417"}, {"start": "66445", "name": "ENSE00002819484", "end": "66561"}, {"start": "67929", "name": "ENSE00002924785", "end": "68037"}, {"name": "ENSE00002563931", "start": "70826", "end": "71135"}], "name": "ENST00000559596"}, {"exons": [{"end": "23783", "start": "23604", "name": "ENSE00002312096"}, {"start": "33119", "name": "ENSE00002921169", "end": "33187"}, {"name": "ENSE00002741972", "start": "39831", "end": "39974"}, {"end": "41862", "name": "ENSE00002920477", "start": "41721"}, {"name": "ENSE00002852607", "start": "46772", "end": "46873"}, {"start": "48882", "name": "ENSE00002899380", "end": "48991"}, {"end": "56322", "name": "ENSE00001668934", "start": "56138"}, {"end": "62417", "start": "62260", "name": "ENSE00002766779"}, {"start": "66445", "name": "ENSE00002819484", "end": "66561"}, {"start": "67929", "name": "ENSE00002924785", "end": "68037"}, {"end": "71783", "start": "70826", "name": "ENSE00002542797"}], "start": "23604", "name": "ENST00000414849", "is_best_transcript": "true", "domains": [], "cdna_coding_end": 1410, "aliases": ["NP_001230071.1", "NP_060924.4", "NM_001243142.1", "NM_018454.7"], "end": "71783", "cdna_coding_start": 88}, {"start": "23619", "name": "ENST00000560747", "exons": [{"end": "23783", "start": "23619", "name": "ENSE00002560678"}, {"start": "33119", "name": "ENSE00002921169", "end": "33187"}, {"end": "39974", "name": "ENSE00002741972", "start": "39831"}, {"end": 
"41862", "start": "41724", "name": "ENSE00002569544"}, {"end": "46873", "name": "ENSE00002852607", "start": "46772"}, {"end": "48991", "name": "ENSE00002899380", "start": "48882"}, {"end": "56322", "start": "56138", "name": "ENSE00001668934"}, {"start": "62260", "name": "ENSE00002766779", "end": "62417"}, {"end": "66561", "start": "66445", "name": "ENSE00002819484"}, {"name": "ENSE00002924785", "start": "67929", "end": "68037"}, {"end": "71783", "start": "70826", "name": "ENSE00002542797"}], "is_best_transcript": "false", "domains": [], "cdna_coding_end": 1392, "aliases": [], "end": "71783", "cdna_coding_start": 73}, {"is_best_transcript": "false", "exons": [{"end": "23783", "name": "ENSE00002572945", "start": "23646"}, {"end": "33187", "name": "ENSE00002921169", "start": "33119"}, {"start": "39831", "name": "ENSE00002741972", "end": "39974"}, {"end": "41862", "start": "41724", "name": "ENSE00002569544"}, {"end": "46873", "name": "ENSE00002852607", "start": "46772"}, {"start": "48882", "name": "ENSE00002899380", "end": "48991"}, {"end": "56322", "name": "ENSE00002874525", "start": "56135"}, {"start": "62260", "name": "ENSE00002766779", "end": "62417"}, {"name": "ENSE00002819484", "start": "66445", "end": "66561"}, {"start": "67929", "name": "ENSE00002924785", "end": "68037"}, {"start": "70826", "name": "ENSE00002542797", "end": "71783"}], "start": "23646", "name": "ENST00000560177", "end": "71783", "cdna_coding_start": 46, "aliases": [], "domains": [], "cdna_coding_end": 1368}, {"name": "ENST00000557840", "start": "33129", "exons": [{"end": "33187", "name": "ENSE00002562906", "start": "33129"}, {"end": "56322", "start": "56164", "name": "ENSE00002552359"}, {"name": "ENSE00002874978", "start": "62260", "end": "62417"}, {"end": "66484", "start": "66445", "name": "ENSE00002572000"}], "is_best_transcript": "false", "domains": [], "cdna_coding_end": 206, "aliases": [], "end": "66484", "cdna_coding_start": 1}, {"aliases": [], "end": "68005", "cdna_coding_start": 1, 
"domains": [], "cdna_coding_end": 435, "is_best_transcript": "false", "start": "41835", "exons": [{"end": "41862", "name": "ENSE00002545491", "start": "41835"}, {"start": "46772", "name": "ENSE00002852607", "end": "46873"}, {"name": "ENSE00002899380", "start": "48882", "end": "48991"}, {"start": "56138", "name": "ENSE00001668934", "end": "56322"}, {"name": "ENSE00002538934", "start": "56549", "end": "56596"}, {"end": "62417", "start": "62260", "name": "ENSE00002874978"}, {"end": "68005", "start": "67929", "name": "ENSE00002570378"}], "name": "ENST00000560898"}, {"is_best_transcript": "false", "start": "23461", "exons": [{"end": "23783", "start": "23461", "name": "ENSE00001206443"}, {"end": "33187", "start": "33119", "name": "ENSE00002921169"}, {"start": "39831", "name": "ENSE00002741972", "end": "39974"}, {"end": "41862", "name": "ENSE00002920477", "start": "41721"}, {"name": "ENSE00002852607", "start": "46772", "end": "46873"}, {"end": "48991", "start": "48882", "name": "ENSE00002899380"}, {"start": "56135", "name": "ENSE00002874525", "end": "56322"}, {"end": "62417", "start": "62260", "name": "ENSE00002766779"}, {"start": "67929", "name": "ENSE00002924785", "end": "68037"}, {"start": "70826", "name": "ENSE00000931000", "end": "71780"}], "name": "ENST00000450318", "aliases": [], "end": "71780", "cdna_coding_start": 231, "domains": [], "cdna_coding_end": 1439}], "end": "71783", "strand": "1", "start": "23427", "name": "ENSG00000137804"}, {"aliases": ["PFKFB2"], "chr": "gene3", "end": "31569", "strand": "1", "transcripts": [{"start": "3805", "exons": [{"end": "3911", "name": "ENSE00001872821", "start": "3805"}, {"name": "ENSE00002901181", "start": "5246", "end": "5347"}, {"start": "12498", "name": "ENSE00002934688", "end": "12623"}, {"end": "13261", "start": "13165", "name": "ENSE00002915251"}, {"name": "ENSE00002265243", "start": "13688", "end": "13754"}, {"start": "13892", "name": "ENSE00002223873", "end": "13966"}, {"name": "ENSE00002234935", "start": "14318", 
"end": "14374"}, {"start": "15581", "name": "ENSE00002233337", "end": "15705"}, {"end": "18251", "start": "18044", "name": "ENSE00000842580"}, {"name": "ENSE00000842581", "start": "18708", "end": "18854"}, {"start": "19969", "name": "ENSE00002876346", "end": "20073"}, {"end": "20954", "start": "20825", "name": "ENSE00001595535"}, {"name": "ENSE00002889740", "start": "21733", "end": "21795"}, {"name": "ENSE00002760808", "start": "22054", "end": "22118"}, {"start": "22749", "name": "ENSE00001185299", "end": "28368"}], "name": "ENST00000367080", "is_best_transcript": "true", "domains": [{"regions": [{"start": "40", "end": "193"}], "desc": "Chromatin_KTI12", "name": "PF08433"}, {"name": "SSF53254", "desc": "", "regions": [{"end": "469", "start": "251"}]}, {"desc": "Bifunct_6PFK/fruc_bisP_Ptase", "name": "PIRSF000709", "regions": [{"start": "1", "end": "488"}]}, {"regions": [{"end": "250", "start": "34"}], "name": "SSF52540", "desc": ""}, {"regions": [{"end": "249", "start": "27"}], "name": "PF01591", "desc": "6Phosfructo_kin"}, {"desc": "6Pfruct_kin", "name": "PR00991", "regions": [{"start": "124", "end": "138"}, {"start": "150", "end": "164"}, {"end": "190", "start": "176"}, {"start": "230", "end": "251"}, {"start": "252", "end": "274"}, {"start": "329", "end": "345"}]}, {"regions": [{"end": "398", "start": "251"}], "desc": "His_Pase_superF_clade-1", "name": "PF00300"}, {"desc": "His_Pase_superF_clade-1", "name": "SM00855", "regions": [{"start": "251", "end": "398"}]}], "cdna_coding_end": 1642, "end": "28368", "cdna_coding_start": 125, "aliases": ["NP_006203.2", "NM_006212.2"]}, {"aliases": ["NP_001018063.1", "NM_001018053.1"], "end": "31569", "cdna_coding_start": 74, "domains": [{"name": "SSF52540", "desc": "", "regions": [{"end": "250", "start": "34"}]}, {"regions": [{"start": "251", "end": "398"}], "name": "SM00855", "desc": "His_Pase_superF_clade-1"}, {"regions": [{"start": "124", "end": "138"}, {"start": "150", "end": "164"}, {"start": "176", "end": "190"}, 
{"start": "230", "end": "251"}, {"start": "252", "end": "274"}, {"start": "329", "end": "345"}], "desc": "6Pfruct_kin", "name": "PR00991"}, {"desc": "His_Pase_superF_clade-1", "name": "PF00300", "regions": [{"start": "251", "end": "398"}]}, {"regions": [{"end": "249", "start": "27"}], "name": "PF01591", "desc": "6Phosfructo_kin"}, {"name": "PIRSF000709", "desc": "Bifunct_6PFK/fruc_bisP_Ptase", "regions": [{"end": "468", "start": "1"}]}, {"name": "SSF53254", "desc": "", "regions": [{"end": "469", "start": "251"}]}, {"name": "PF08433", "desc": "Chromatin_KTI12", "regions": [{"start": "40", "end": "193"}]}], "cdna_coding_end": 1489, "is_best_transcript": "false", "start": "3856", "name": "ENST00000367079", "exons": [{"start": "3856", "name": "ENSE00001818425", "end": "3911"}, {"end": "5347", "name": "ENSE00002901181", "start": "5246"}, {"end": "12623", "start": "12498", "name": "ENSE00002934688"}, {"end": "13261", "name": "ENSE00002915251", "start": "13165"}, {"end": "13754", "start": "13688", "name": "ENSE00002265243"}, {"end": "13966", "name": "ENSE00002223873", "start": "13892"}, {"end": "14374", "name": "ENSE00002234935", "start": "14318"}, {"start": "15581", "name": "ENSE00002233337", "end": "15705"}, {"name": "ENSE00000842580", "start": "18044", "end": "18251"}, {"name": "ENSE00000842581", "start": "18708", "end": "18854"}, {"end": "20073", "start": "19969", "name": "ENSE00002876346"}, {"name": "ENSE00001595535", "start": "20825", "end": "20954"}, {"end": "21795", "start": "21733", "name": "ENSE00002889740"}, {"name": "ENSE00002760808", "start": "22054", "end": "22118"}, {"start": "29499", "name": "ENSE00001443434", "end": "31569"}]}, {"aliases": [], "end": "22950", "cdna_coding_start": 227, "domains": [{"regions": [{"end": "160", "start": "7"}], "name": "PF08433", "desc": "Chromatin_KTI12"}, {"regions": [{"start": "1", "end": "455"}], "desc": "Bifunct_6PFK/fruc_bisP_Ptase", "name": "PIRSF000709"}, {"regions": [{"end": "436", "start": "218"}], "desc": "", 
"name": "SSF53254"}, {"regions": [{"end": "105", "start": "91"}, {"end": "131", "start": "117"}, {"start": "143", "end": "157"}, {"start": "197", "end": "218"}, {"end": "241", "start": "219"}, {"start": "296", "end": "312"}], "name": "PR00991", "desc": "6Pfruct_kin"}, {"name": "PF01591", "desc": "6Phosfructo_kin", "regions": [{"end": "216", "start": "2"}]}, {"name": "PF00300", "desc": "His_Pase_superF_clade-1", "regions": [{"start": "218", "end": "365"}]}, {"name": "SM00855", "desc": "His_Pase_superF_clade-1", "regions": [{"start": "218", "end": "365"}]}, {"regions": [{"start": "1", "end": "217"}], "name": "SSF52540", "desc": ""}], "cdna_coding_end": 1645, "is_best_transcript": "false", "start": "5245", "exons": [{"start": "5245", "name": "ENSE00002308738", "end": "5347"}, {"name": "ENSE00002226272", "start": "11367", "end": "11475"}, {"end": "12623", "name": "ENSE00002905114", "start": "12498"}, {"end": "13261", "start": "13165", "name": "ENSE00002915251"}, {"name": "ENSE00002265243", "start": "13688", "end": "13754"}, {"end": "13966", "start": "13892", "name": "ENSE00002223873"}, {"end": "14374", "name": "ENSE00002234935", "start": "14318"}, {"end": "15705", "start": "15581", "name": "ENSE00002233337"}, {"end": "18251", "start": "18044", "name": "ENSE00000842580"}, {"end": "18854", "start": "18708", "name": "ENSE00000842581"}, {"end": "20073", "name": "ENSE00002876346", "start": "19969"}, {"name": "ENSE00001595535", "start": "20825", "end": "20954"}, {"name": "ENSE00002889740", "start": "21733", "end": "21795"}, {"name": "ENSE00002760808", "start": "22054", "end": "22118"}, {"name": "ENSE00002308130", "start": "22749", "end": "22950"}], "name": "ENST00000545806"}, {"exons": [{"end": "162", "start": "1", "name": "ENSE00002231494"}, {"name": "ENSE00002892386", "start": "5246", "end": "5347"}, {"name": "ENSE00002797095", "start": "12498", "end": "12623"}, {"name": "ENSE00002308344", "start": "13092", "end": "13261"}, {"name": "ENSE00002265243", "start": "13688", 
"end": "13754"}, {"end": "13966", "name": "ENSE00002223873", "start": "13892"}, {"start": "14318", "name": "ENSE00002234935", "end": "14374"}, {"start": "15581", "name": "ENSE00002233337", "end": "15705"}, {"end": "18251", "start": "18044", "name": "ENSE00000842580"}, {"end": "18854", "start": "18708", "name": "ENSE00000842581"}, {"end": "20073", "start": "19969", "name": "ENSE00002876346"}, {"name": "ENSE00001595535", "start": "20825", "end": "20954"}, {"end": "21795", "start": "21733", "name": "ENSE00002889740"}, {"start": "22054", "name": "ENSE00002760808", "end": "22118"}, {"end": "29679", "start": "29499", "name": "ENSE00001791500"}], "start": "1", "name": "ENST00000411990", "is_best_transcript": "false", "cdna_coding_end": 1668, "domains": [{"desc": "Bifunct_6PFK/fruc_bisP_Ptase", "name": "PIRSF000709", "regions": [{"start": "1", "end": "370"}]}, {"desc": "", "name": "SSF53254", "regions": [{"end": "371", "start": "153"}]}, {"desc": "", "name": "SSF52540", "regions": [{"start": "4", "end": "152"}]}, {"desc": "6Pfruct_kin", "name": "PR00991", "regions": [{"start": "26", "end": "40"}, {"start": "52", "end": "66"}, {"start": "78", "end": "92"}, {"start": "132", "end": "153"}, {"end": "176", "start": "154"}, {"start": "231", "end": "247"}]}, {"name": "PF01591", "desc": "6Phosfructo_kin", "regions": [{"end": "151", "start": "1"}]}, {"regions": [{"end": "300", "start": "153"}], "name": "PF00300", "desc": "His_Pase_superF_clade-1"}, {"regions": [{"start": "153", "end": "300"}], "name": "SM00855", "desc": "His_Pase_superF_clade-1"}], "aliases": [], "cdna_coding_start": 547, "end": "29679"}, {"cdna_coding_start": 84, "end": "29931", "aliases": [], "cdna_coding_end": 878, "domains": [{"regions": [{"start": "1", "end": "64"}], "name": "SSF52540", "desc": ""}, {"desc": "", "name": "SSF53254", "regions": [{"end": "262", "start": "65"}]}, {"desc": "Bifunct_6PFK/fruc_bisP_Ptase", "name": "PIRSF000709", "regions": [{"start": "1", "end": "261"}]}, {"regions": [{"start": "65", 
"end": "212"}], "name": "SM00855", "desc": "His_Pase_superF_clade-1"}, {"name": "PF01591", "desc": "6Phosfructo_kin", "regions": [{"end": "63", "start": "1"}]}, {"desc": "6Pfruct_kin", "name": "PR00991", "regions": [{"start": "44", "end": "65"}, {"start": "66", "end": "88"}, {"start": "143", "end": "159"}]}, {"regions": [{"end": "212", "start": "65"}], "name": "PF00300", "desc": "His_Pase_superF_clade-1"}], "is_best_transcript": "false", "start": "15549", "name": "ENST00000541914", "exons": [{"name": "ENSE00002294124", "start": "15549", "end": "15705"}, {"name": "ENSE00000842580", "start": "18044", "end": "18251"}, {"start": "18708", "name": "ENSE00000842581", "end": "18854"}, {"end": "20073", "start": "19969", "name": "ENSE00002876346"}, {"start": "20825", "name": "ENSE00001595535", "end": "20954"}, {"name": "ENSE00002760808", "start": "22054", "end": "22118"}, {"name": "ENSE00002284395", "start": "29499", "end": "29931"}]}], "start": "1", "name": "ENSG00000123836"}, {"start": "1", "name": "ENSG00000187416", "chr": "gene4", "aliases": ["LHFPL3"], "transcripts": [{"domains": [{"regions": [{"start": "22", "end": "199"}], "desc": "Lipome_HGMIC_fus_partner-like", "name": "PF10242"}], "cdna_coding_end": 835, "aliases": [], "end": "578576", "cdna_coding_start": 167, "exons": [{"end": "569", "name": "ENSE00001713788", "start": "1"}, {"end": "408255", "start": "408019", "name": "ENSE00001349915"}, {"end": "578576", "start": "577531", "name": "ENSE00001723245"}], "start": "1", "name": "ENST00000424859", "is_best_transcript": "false"}, {"start": "45", "name": "ENST00000401970", "exons": [{"end": "569", "name": "ENSE00001554382", "start": "45"}, {"end": "408255", "start": "408019", "name": "ENSE00001349915"}, {"end": "516773", "name": "ENSE00001746769", "start": "516726"}, {"end": "578098", "start": "577531", "name": "ENSE00001593689"}], "is_best_transcript": "false", "domains": [{"desc": "Lipome_HGMIC_fus_partner-like", "name": "PF10242", "regions": [{"end": "199", "start": 
"22"}]}], "cdna_coding_end": 779, "aliases": [], "end": "578098", "cdna_coding_start": 123}, {"end": "579898", "cdna_coding_start": 125, "aliases": ["NP_945351.1", "NM_199000.2"], "domains": [{"desc": "Lipome_HGMIC_fus_partner-like", "name": "PF10242", "regions": [{"end": "213", "start": "36"}]}, {"regions": [{"end": "22", "start": "4"}], "desc": "", "name": "PS50310"}], "cdna_coding_end": 835, "is_best_transcript": "true", "start": "1", "name": "ENST00000535008", "exons": [{"start": "1", "name": "ENSE00002286305", "end": "573"}, {"end": "141399", "name": "ENSE00002499039", "start": "141389"}, {"name": "ENSE00002324321", "start": "294348", "end": "294368"}, {"end": "408255", "name": "ENSE00002278044", "start": "408055"}, {"name": "ENSE00002230996", "start": "577531", "end": "579898"}]}, {"aliases": [], "cdna_coding_start": 81, "end": "578099", "cdna_coding_end": 779, "domains": [{"name": "PS50310", "desc": "", "regions": [{"start": "4", "end": "22"}]}, {"desc": "Lipome_HGMIC_fus_partner-like", "name": "PF10242", "regions": [{"end": "213", "start": "36"}]}], "is_best_transcript": "false", "name": "ENST00000543266", "start": "45", "exons": [{"end": "573", "name": "ENSE00002312340", "start": "45"}, {"name": "ENSE00002499039", "start": "141389", "end": "141399"}, {"start": "294348", "name": "ENSE00002324321", "end": "294368"}, {"end": "408255", "start": "408055", "name": "ENSE00002278044"}, {"end": "516773", "start": "516726", "name": "ENSE00001746769"}, {"end": "578099", "start": "577531", "name": "ENSE00002270291"}]}], "strand": "1", "end": "579898"}, {"start": "1", "name": "ENSG00000122565", "strand": "1", "end": "12195", "transcripts": [{"cdna_coding_start": 429, "end": "12195", "aliases": ["NP_009207.2", "NM_007276.4"], "cdna_coding_end": 980, "domains": [{"name": "PR00504", "desc": "Chromo_dom_subgr", "regions": [{"end": "35", "start": "27"}, {"end": "54", "start": "40"}, {"end": "67", "start": "55"}]}, {"name": "PF01393", "desc": "Chromo_shadow_dom", "regions": 
[{"start": "119", "end": "176"}]}, {"regions": [{"end": "175", "start": "106"}, {"start": "15", "end": "79"}], "name": "SSF54160", "desc": "Chromodomain-like"}, {"regions": [{"end": "78", "start": "30"}], "name": "PF00385", "desc": "Chromo_domain"}, {"regions": [{"start": "29", "end": "81"}, {"start": "120", "end": "172"}], "name": "SM00298", "desc": "Chromo_domain/shadow"}, {"regions": [{"end": "177", "start": "115"}], "name": "SM00300", "desc": "Chromo_shadow_dom"}, {"desc": "Chromo_domain/shadow", "name": "PS50013", "regions": [{"end": "88", "start": "30"}, {"end": "179", "start": "121"}]}], "is_best_transcript": "true", "exons": [{"end": "400", "start": "1", "name": "ENSE00001189828"}, {"end": "1861", "name": "ENSE00001433792", "start": "1810"}, {"start": "5207", "name": "ENSE00002858776", "end": "5349"}, {"start": "7232", "name": "ENSE00002882596", "end": "7394"}, {"end": "10595", "start": "10501", "name": "ENSE00002768648"}, {"start": "10921", "name": "ENSE00002778428", "end": "12195"}], "start": "1", "name": "ENST00000337620"}, {"start": "532", "name": "ENST00000396386", "exons": [{"start": "532", "name": "ENSE00001524790", "end": "665"}, {"start": "1810", "name": "ENSE00001433792", "end": "1861"}, {"end": "5349", "name": "ENSE00002858776", "start": "5207"}, {"name": "ENSE00002882596", "start": "7232", "end": "7394"}, {"start": "10501", "name": "ENSE00002768648", "end": "10595"}, {"name": "ENSE00001553667", "start": "10921", "end": "12190"}], "is_best_transcript": "false", "cdna_coding_end": 714, "domains": [{"regions": [{"start": "115", "end": "177"}], "name": "SM00300", "desc": "Chromo_shadow_dom"}, {"regions": [{"end": "88", "start": "30"}, {"end": "179", "start": "121"}], "desc": "Chromo_domain/shadow", "name": "PS50013"}, {"name": "SM00298", "desc": "Chromo_domain/shadow", "regions": [{"start": "29", "end": "81"}, {"start": "120", "end": "172"}]}, {"desc": "Chromo_dom_subgr", "name": "PR00504", "regions": [{"end": "35", "start": "27"}, {"end": "54", 
"start": "40"}, {"start": "55", "end": "67"}]}, {"regions": [{"start": "106", "end": "175"}, {"start": "15", "end": "79"}], "name": "SSF54160", "desc": "Chromodomain-like"}, {"name": "PF01393", "desc": "Chromo_shadow_dom", "regions": [{"start": "119", "end": "176"}]}, {"regions": [{"end": "78", "start": "30"}], "name": "PF00385", "desc": "Chromo_domain"}], "cdna_coding_start": 163, "end": "12190", "aliases": ["NP_057671.2", "NM_016587.3"]}, {"cdna_coding_end": 1068, "domains": [{"name": "PF00385", "desc": "Chromo_domain", "regions": [{"end": "62", "start": "30"}]}, {"name": "SSF54160", "desc": "Chromodomain-like", "regions": [{"start": "22", "end": "62"}]}, {"desc": "Chromo_dom_subgr", "name": "PR00504", "regions": [{"end": "35", "start": "27"}, {"start": "40", "end": "54"}, {"start": "55", "end": "62"}]}, {"regions": [{"start": "30", "end": "62"}], "name": "PS50013", "desc": "Chromo_domain/shadow"}], "cdna_coding_start": 882, "end": "7251", "aliases": [], "start": "596", "name": "ENST00000456948", "exons": [{"end": "1448", "start": "596", "name": "ENSE00001696375"}, {"start": "1810", "name": "ENSE00001433792", "end": "1861"}, {"end": "5349", "name": "ENSE00002858776", "start": "5207"}, {"start": "7232", "name": "ENSE00001778411", "end": "7251"}], "is_best_transcript": "false"}, {"is_best_transcript": "false", "start": "604", "name": "ENST00000409747", "exons": [{"end": "665", "start": "604", "name": "ENSE00001588865"}, {"end": "1861", "start": "1810", "name": "ENSE00001433792"}, {"start": "5207", "name": "ENSE00002858776", "end": "5349"}, {"name": "ENSE00001577751", "start": "7305", "end": "7394"}, {"start": "10501", "name": "ENSE00002765595", "end": "10595"}, {"start": "10921", "name": "ENSE00002880768", "end": "12195"}], "cdna_coding_start": 91, "end": "12195", "aliases": [], "cdna_coding_end": 396, "domains": [{"regions": [{"start": "15", "end": "56"}], "desc": "Chromodomain-like", "name": "SSF54160"}, {"regions": [{"end": "57", "start": "30"}], "desc": 
"Chromo_domain", "name": "PF00385"}, {"desc": "Chromo_domain/shadow", "name": "PS50013", "regions": [{"start": "30", "end": "91"}]}]}], "aliases": ["CBX3"], "chr": "gene5"}, {"name": "ENSG00000171862", "start": "1", "end": "108818", "strand": "1", "transcripts": [{"aliases": ["NP_000305.3", "NM_000314.4"], "cdna_coding_start": 1358, "end": "108818", "cdna_coding_end": 2569, "domains": [{"desc": "C2_Ca/lipid-bd_dom_CaLB", "name": "SSF49562", "regions": [{"start": "188", "end": "351"}]}, {"regions": [{"end": "159", "start": "80"}], "name": "PF00782", "desc": "Dual-sp_phosphatase_cat-dom"}, {"desc": "Bifunc_PIno_P3_Pase/Pase_PTEN", "name": "PIRSF038025", "regions": [{"start": "1", "end": "403"}]}, {"regions": [{"start": "59", "end": "181"}], "name": "PF00102", "desc": "Tyr_Pase_rcpt/non-rcpt"}, {"desc": "Tensin_phosphatase_C2-dom", "name": "PF10409", "regions": [{"end": "349", "start": "188"}]}, {"regions": [{"end": "350", "start": "190"}], "desc": "Tensin_phosphatase_C2-dom", "name": "PS51182"}, {"name": "SM00404", "desc": "Tyr_Pase_cat", "regions": [{"end": "183", "start": "23"}]}, {"regions": [{"end": "187", "start": "14"}], "desc": "", "name": "SSF52799"}, {"regions": [{"start": "102", "end": "173"}], "desc": "Tyr/Dual-specificity_Pase", "name": "PS50056"}, {"regions": [{"end": "185", "start": "14"}], "name": "PS51181", "desc": "Phosphatase_tensin-typ"}], "is_best_transcript": "true", "start": "1", "name": "ENST00000371953", "exons": [{"end": "1436", "start": "1", "name": "ENSE00001456562"}, {"end": "30997", "name": "ENSE00001156351", "start": "30913"}, {"end": "62445", "start": "62401", "name": "ENSE00001156344"}, {"end": "67977", "name": "ENSE00002779611", "start": "67934"}, {"start": "69901", "name": "ENSE00001156330", "end": "70139"}, {"end": "89147", "start": "89006", "name": "ENSE00001156327"}, {"end": "94907", "start": "94741", "name": "ENSE00002737042"}, {"start": "97782", "name": "ENSE00001156315", "end": "98006"}, {"end": "108818", "name": 
"ENSE00001456541", "start": "102175"}]}], "aliases": ["PTEN"], "chr": "gene6"}], "best_transcript_file": "/home/creisle/svn/ensembl_flatfiles/ens69_best_transcript.txt", "script": "generate_ensembl_json_temp.pl", "hugo_mapping_file": "/projects/tumour_char/analysis_scripts/databases/processed_files/drug_target_tables/compiled_gene_drug_pathway.v1_2_4.tsv", "generation_time": "Tue Feb 28 11:58:00 2017", "script_version": "2.1.4"} \ No newline at end of file +{"ensembl_version": 69, "genes": [{"name": "ENSG00000128891", "start": 1, "strand": "-", "end": 36375, "transcripts": [{"is_best_transcript": false, "exons": [{"end": 36329, "name": "ENSE00002560148", "start": 36294}, {"end": 34345, "start": 34090, "name": "ENSE00002761197"}, {"end": 28690, "name": "ENSE00002865842", "start": 28534}, {"name": "ENSE00002539417", "start": 1, "end": 108}], "start": 1, "name": "ENST00000560305", "end": 36329, "cdna_coding_start": 49, "aliases": [], "domains": [], "cdna_coding_end": 498}, {"start": 2649, "exons": [{"end": 34345, "name": "ENSE00002761197", "start": 34090}, {"start": 28534, "name": "ENSE00002865842", "end": 28690}, {"name": "ENSE00002540983", "start": 24913, "end": 25445}, {"name": "ENSE00002563185", "start": 2649, "end": 3620}], "name": "ENST00000559153", "is_best_transcript": false, "domains": [], "cdna_coding_end": 570, "end": 34345, "cdna_coding_start": 13, "aliases": []}, {"is_best_transcript": false, "exons": [{"end": 36310, "name": "ENSE00002537918", "start": 36109}, {"start": 34090, "name": "ENSE00002761197", "end": 34345}, {"end": 28690, "start": 28534, "name": "ENSE00002865842"}, {"start": 2659, "name": "ENSE00002565021", "end": 3620}], "start": 2659, "name": "ENST00000561011", "end": 36310, "cdna_coding_start": 215, "aliases": [], "domains": [], "cdna_coding_end": 619}, {"is_best_transcript": false, "exons": [{"end": 36307, "name": "ENSE00002562556", "start": 36294}, {"end": 34345, "name": "ENSE00002761197", "start": 34090}, {"name": "ENSE00002865842", 
"start": 28534, "end": 28690}, {"end": 25445, "name": "ENSE00002555814", "start": 24755}, {"name": "ENSE00002547244", "start": 5661, "end": 10673}], "start": 5661, "name": "ENST00000559291", "cdna_coding_start": 27, "end": 36307, "aliases": [], "cdna_coding_end": 584, "domains": []}, {"is_best_transcript": false, "name": "ENST00000559911", "start": 6218, "exons": [{"end": 36329, "name": "ENSE00002560148", "start": 36294}, {"end": 34345, "start": 34090, "name": "ENSE00002761197"}, {"start": 28534, "name": "ENSE00002865842", "end": 28690}, {"name": "ENSE00002551447", "start": 10557, "end": 10673}, {"name": "ENSE00002544927", "start": 6218, "end": 6328}], "aliases": [], "end": 36329, "cdna_coding_start": 49, "domains": [], "cdna_coding_end": 660}, {"exons": [{"name": "ENSE00002560148", "start": 36294, "end": 36329}, {"name": "ENSE00002761197", "start": 34090, "end": 34345}, {"name": "ENSE00002865842", "start": 28534, "end": 28690}, {"start": 10547, "name": "ENSE00002549191", "end": 10673}], "start": 10547, "name": "ENST00000558113", "is_best_transcript": false, "domains": [], "cdna_coding_end": 570, "aliases": [], "end": 36329, "cdna_coding_start": 49}, {"cdna_coding_end": 832, "domains": [], "aliases": ["NP_443081.1", "NM_052849.2"], "cdna_coding_start": 275, "end": 36370, "start": 24417, "exons": [{"name": "ENSE00001528957", "start": 36109, "end": 36370}, {"start": 34090, "name": "ENSE00002761197", "end": 34345}, {"start": 28534, "name": "ENSE00002865842", "end": 28690}, {"end": 25445, "start": 24417, "name": "ENSE00001933860"}], "name": "ENST00000358005", "is_best_transcript": true}, {"name": "ENST00000416810", "start": 24422, "exons": [{"end": 36375, "start": 36294, "name": "ENSE00002565648"}, {"end": 34345, "start": 34090, "name": "ENSE00002761197"}, {"start": 28534, "name": "ENSE00002865842", "end": 28690}, {"end": 25445, "start": 24422, "name": "ENSE00001427910"}], "is_best_transcript": false, "domains": [], "cdna_coding_end": 652, "aliases": ["NP_001074261.1", 
"NM_001080792.1"], "end": 36375, "cdna_coding_start": 95}, {"name": "ENST00000558750", "start": 25260, "exons": [{"start": 36084, "name": "ENSE00001759032", "end": 36298}, {"name": "ENSE00002815778", "start": 34090, "end": 34345}, {"name": "ENSE00002865842", "start": 28534, "end": 28690}, {"start": 25260, "name": "ENSE00002570697", "end": 25445}], "is_best_transcript": false, "domains": [], "cdna_coding_end": 785, "aliases": ["NP_001074260.1", "NM_001080791.1"], "end": 36298, "cdna_coding_start": 201}, {"cdna_coding_start": 98, "end": 36337, "aliases": [], "cdna_coding_end": 349, "domains": [], "is_best_transcript": false, "exons": [{"end": 36337, "start": 36294, "name": "ENSE00002551707"}, {"end": 28690, "start": 28534, "name": "ENSE00002785665"}, {"end": 25445, "start": 25298, "name": "ENSE00002565968"}], "start": 25298, "name": "ENST00000558918"}, {"cdna_coding_end": 557, "domains": [], "aliases": [], "cdna_coding_start": 47, "end": 36332, "start": 25336, "exons": [{"end": 36332, "start": 36294, "name": "ENSE00002549749"}, {"end": 34340, "name": "ENSE00002571615", "start": 34090}, {"end": 28690, "name": "ENSE00002865842", "start": 28534}, {"name": "ENSE00002556846", "start": 25336, "end": 25445}], "name": "ENST00000559103", "is_best_transcript": false}, {"cdna_coding_end": 484, "domains": [], "aliases": [], "cdna_coding_start": 80, "end": 36360, "name": "ENST00000558871", "start": 27939, "exons": [{"end": 36360, "name": "ENSE00002572819", "start": 36294}, {"start": 34090, "name": "ENSE00002761197", "end": 34345}, {"end": 28690, "start": 27939, "name": "ENSE00002536768"}], "is_best_transcript": false}], "aliases": ["C15orf57"], "chr": "gene1"}, {"name": "ENSG00000104147", "start": 1, "transcripts": [{"cdna_coding_end": 750, "domains": [], "cdna_coding_start": 61, "end": 23354, "aliases": ["NP_009211.1", "NM_007280.1"], "exons": [{"end": 23354, "name": "ENSE00000930990", "start": 22973}, {"end": 22714, "start": 22648, "name": "ENSE00000930989"}, {"start": 10391, 
"name": "ENSE00000942405", "end": 10513}, {"start": 4006, "name": "ENSE00000942406", "end": 4087}, {"end": 582, "name": "ENSE00000884003", "start": 1}], "start": 1, "name": "ENST00000220514", "is_best_transcript": true}, {"is_best_transcript": false, "name": "ENST00000560640", "start": 495, "exons": [{"end": 23220, "start": 22973, "name": "ENSE00002541746"}, {"name": "ENSE00000930989", "start": 22648, "end": 22714}, {"end": 4087, "name": "ENSE00000942406", "start": 4006}, {"start": 495, "name": "ENSE00002548921", "end": 582}], "aliases": [], "end": 23220, "cdna_coding_start": 1, "domains": [], "cdna_coding_end": 485}], "end": 23354, "strand": "-", "chr": "gene2", "aliases": ["OIP5"]}, {"chr": "gene2", "aliases": ["NUSAP1"], "transcripts": [{"aliases": ["NP_001230072.1", "NM_001243143.1"], "end": 71783, "cdna_coding_start": 265, "domains": [], "cdna_coding_end": 1545, "is_best_transcript": false, "start": 23427, "exons": [{"end": 23783, "start": 23427, "name": "ENSE00002541618"}, {"end": 33187, "start": 33119, "name": "ENSE00002921169"}, {"end": 39974, "start": 39831, "name": "ENSE00002741972"}, {"end": 41862, "name": "ENSE00002865492", "start": 41766}, {"end": 46873, "name": "ENSE00002852607", "start": 46772}, {"name": "ENSE00002899380", "start": 48882, "end": 48991}, {"end": 56322, "name": "ENSE00002874525", "start": 56135}, {"end": 62417, "start": 62260, "name": "ENSE00002766779"}, {"name": "ENSE00002819484", "start": 66445, "end": 66561}, {"end": 68037, "name": "ENSE00002924785", "start": 67929}, {"end": 71783, "name": "ENSE00002542797", "start": 70826}], "name": "ENST00000260359"}, {"end": 41819, "cdna_coding_start": 104, "aliases": [], "domains": [], "cdna_coding_end": 334, "is_best_transcript": false, "start": 23588, "name": "ENST00000559046", "exons": [{"end": 23783, "name": "ENSE00002574418", "start": 23588}, {"end": 33187, "start": 33119, "name": "ENSE00002921169"}, {"end": 36335, "start": 36219, "name": "ENSE00002559031"}, {"end": 39974, "name": 
"ENSE00002847051", "start": 39831}, {"name": "ENSE00002564510", "start": 41766, "end": 41819}]}, {"aliases": ["NP_001230073.1"], "end": 70996, "cdna_coding_start": 88, "domains": [], "cdna_coding_end": 1224, "is_best_transcript": false, "start": 23604, "exons": [{"end": 23783, "start": 23604, "name": "ENSE00002312096"}, {"name": "ENSE00002741972", "start": 39831, "end": 39974}, {"start": 41721, "name": "ENSE00002920477", "end": 41862}, {"end": 46873, "start": 46772, "name": "ENSE00002852607"}, {"name": "ENSE00002899380", "start": 48882, "end": 48991}, {"end": 56322, "start": 56138, "name": "ENSE00001668934"}, {"name": "ENSE00002766779", "start": 62260, "end": 62417}, {"start": 67929, "name": "ENSE00002924785", "end": 68037}, {"end": 70996, "start": 70826, "name": "ENSE00002254025"}], "name": "ENST00000450592"}, {"cdna_coding_start": 88, "end": 71135, "aliases": ["NP_057443.2", "NM_016359.4"], "cdna_coding_end": 1413, "domains": [], "is_best_transcript": false, "start": 23604, "exons": [{"start": 23604, "name": "ENSE00002312096", "end": 23783}, {"end": 33187, "start": 33119, "name": "ENSE00002921169"}, {"name": "ENSE00002741972", "start": 39831, "end": 39974}, {"end": 41862, "start": 41721, "name": "ENSE00002920477"}, {"name": "ENSE00002852607", "start": 46772, "end": 46873}, {"end": 48991, "name": "ENSE00002899380", "start": 48882}, {"end": 56322, "start": 56135, "name": "ENSE00002874525"}, {"name": "ENSE00002766779", "start": 62260, "end": 62417}, {"start": 66445, "name": "ENSE00002819484", "end": 66561}, {"start": 67929, "name": "ENSE00002924785", "end": 68037}, {"name": "ENSE00002563931", "start": 70826, "end": 71135}], "name": "ENST00000559596"}, {"exons": [{"end": 23783, "start": 23604, "name": "ENSE00002312096"}, {"start": 33119, "name": "ENSE00002921169", "end": 33187}, {"name": "ENSE00002741972", "start": 39831, "end": 39974}, {"end": 41862, "name": "ENSE00002920477", "start": 41721}, {"name": "ENSE00002852607", "start": 46772, "end": 46873}, {"start": 
48882, "name": "ENSE00002899380", "end": 48991}, {"end": 56322, "name": "ENSE00001668934", "start": 56138}, {"end": 62417, "start": 62260, "name": "ENSE00002766779"}, {"start": 66445, "name": "ENSE00002819484", "end": 66561}, {"start": 67929, "name": "ENSE00002924785", "end": 68037}, {"end": 71783, "start": 70826, "name": "ENSE00002542797"}], "start": 23604, "name": "ENST00000414849", "is_best_transcript": true, "domains": [], "cdna_coding_end": 1410, "aliases": ["NP_001230071.1", "NP_060924.4", "NM_001243142.1", "NM_018454.7"], "end": 71783, "cdna_coding_start": 88}, {"start": 23619, "name": "ENST00000560747", "exons": [{"end": 23783, "start": 23619, "name": "ENSE00002560678"}, {"start": 33119, "name": "ENSE00002921169", "end": 33187}, {"end": 39974, "name": "ENSE00002741972", "start": 39831}, {"end": 41862, "start": 41724, "name": "ENSE00002569544"}, {"end": 46873, "name": "ENSE00002852607", "start": 46772}, {"end": 48991, "name": "ENSE00002899380", "start": 48882}, {"end": 56322, "start": 56138, "name": "ENSE00001668934"}, {"start": 62260, "name": "ENSE00002766779", "end": 62417}, {"end": 66561, "start": 66445, "name": "ENSE00002819484"}, {"name": "ENSE00002924785", "start": 67929, "end": 68037}, {"end": 71783, "start": 70826, "name": "ENSE00002542797"}], "is_best_transcript": false, "domains": [], "cdna_coding_end": 1392, "aliases": [], "end": 71783, "cdna_coding_start": 73}, {"is_best_transcript": false, "exons": [{"end": 23783, "name": "ENSE00002572945", "start": 23646}, {"end": 33187, "name": "ENSE00002921169", "start": 33119}, {"start": 39831, "name": "ENSE00002741972", "end": 39974}, {"end": 41862, "start": 41724, "name": "ENSE00002569544"}, {"end": 46873, "name": "ENSE00002852607", "start": 46772}, {"start": 48882, "name": "ENSE00002899380", "end": 48991}, {"end": 56322, "name": "ENSE00002874525", "start": 56135}, {"start": 62260, "name": "ENSE00002766779", "end": 62417}, {"name": "ENSE00002819484", "start": 66445, "end": 66561}, {"start": 67929, "name": 
"ENSE00002924785", "end": 68037}, {"start": 70826, "name": "ENSE00002542797", "end": 71783}], "start": 23646, "name": "ENST00000560177", "end": 71783, "cdna_coding_start": 46, "aliases": [], "domains": [], "cdna_coding_end": 1368}, {"name": "ENST00000557840", "start": 33129, "exons": [{"end": 33187, "name": "ENSE00002562906", "start": 33129}, {"end": 56322, "start": 56164, "name": "ENSE00002552359"}, {"name": "ENSE00002874978", "start": 62260, "end": 62417}, {"end": 66484, "start": 66445, "name": "ENSE00002572000"}], "is_best_transcript": false, "domains": [], "cdna_coding_end": 206, "aliases": [], "end": 66484, "cdna_coding_start": 1}, {"aliases": [], "end": 68005, "cdna_coding_start": 1, "domains": [], "cdna_coding_end": 435, "is_best_transcript": false, "start": 41835, "exons": [{"end": 41862, "name": "ENSE00002545491", "start": 41835}, {"start": 46772, "name": "ENSE00002852607", "end": 46873}, {"name": "ENSE00002899380", "start": 48882, "end": 48991}, {"start": 56138, "name": "ENSE00001668934", "end": 56322}, {"name": "ENSE00002538934", "start": 56549, "end": 56596}, {"end": 62417, "start": 62260, "name": "ENSE00002874978"}, {"end": 68005, "start": 67929, "name": "ENSE00002570378"}], "name": "ENST00000560898"}, {"is_best_transcript": false, "start": 23461, "exons": [{"end": 23783, "start": 23461, "name": "ENSE00001206443"}, {"end": 33187, "start": 33119, "name": "ENSE00002921169"}, {"start": 39831, "name": "ENSE00002741972", "end": 39974}, {"end": 41862, "name": "ENSE00002920477", "start": 41721}, {"name": "ENSE00002852607", "start": 46772, "end": 46873}, {"end": 48991, "start": 48882, "name": "ENSE00002899380"}, {"start": 56135, "name": "ENSE00002874525", "end": 56322}, {"end": 62417, "start": 62260, "name": "ENSE00002766779"}, {"start": 67929, "name": "ENSE00002924785", "end": 68037}, {"start": 70826, "name": "ENSE00000931000", "end": 71780}], "name": "ENST00000450318", "aliases": [], "end": 71780, "cdna_coding_start": 231, "domains": [], "cdna_coding_end": 
1439}], "end": 71783, "strand": "+", "start": 23427, "name": "ENSG00000137804"}, {"aliases": ["PFKFB2"], "chr": "gene3", "end": 31569, "strand": "+", "transcripts": [{"start": 3805, "exons": [{"end": 3911, "name": "ENSE00001872821", "start": 3805}, {"name": "ENSE00002901181", "start": 5246, "end": 5347}, {"start": 12498, "name": "ENSE00002934688", "end": 12623}, {"end": 13261, "start": 13165, "name": "ENSE00002915251"}, {"name": "ENSE00002265243", "start": 13688, "end": 13754}, {"start": 13892, "name": "ENSE00002223873", "end": 13966}, {"name": "ENSE00002234935", "start": 14318, "end": 14374}, {"start": 15581, "name": "ENSE00002233337", "end": 15705}, {"end": 18251, "start": 18044, "name": "ENSE00000842580"}, {"name": "ENSE00000842581", "start": 18708, "end": 18854}, {"start": 19969, "name": "ENSE00002876346", "end": 20073}, {"end": 20954, "start": 20825, "name": "ENSE00001595535"}, {"name": "ENSE00002889740", "start": 21733, "end": 21795}, {"name": "ENSE00002760808", "start": 22054, "end": 22118}, {"start": 22749, "name": "ENSE00001185299", "end": 28368}], "name": "ENST00000367080", "is_best_transcript": true, "domains": [{"regions": [{"start": 40, "end": 193}], "desc": "Chromatin_KTI12", "name": "PF08433"}, {"name": "SSF53254", "desc": "", "regions": [{"end": 469, "start": 251}]}, {"desc": "Bifunct_6PFK/fruc_bisP_Ptase", "name": "PIRSF000709", "regions": [{"start": 1, "end": 488}]}, {"regions": [{"end": 250, "start": 34}], "name": "SSF52540", "desc": ""}, {"regions": [{"end": 249, "start": 27}], "name": "PF01591", "desc": "6Phosfructo_kin"}, {"desc": "6Pfruct_kin", "name": "PR00991", "regions": [{"start": 124, "end": 138}, {"start": 150, "end": 164}, {"end": 190, "start": 176}, {"start": 230, "end": 251}, {"start": 252, "end": 274}, {"start": 329, "end": 345}]}, {"regions": [{"end": 398, "start": 251}], "desc": "His_Pase_superF_clade-1", "name": "PF00300"}, {"desc": "His_Pase_superF_clade-1", "name": "SM00855", "regions": [{"start": 251, "end": 398}]}], 
"cdna_coding_end": 1642, "end": 28368, "cdna_coding_start": 125, "aliases": ["NP_006203.2", "NM_006212.2"]}, {"aliases": ["NP_001018063.1", "NM_001018053.1"], "end": 31569, "cdna_coding_start": 74, "domains": [{"name": "SSF52540", "desc": "", "regions": [{"end": 250, "start": 34}]}, {"regions": [{"start": 251, "end": 398}], "name": "SM00855", "desc": "His_Pase_superF_clade-1"}, {"regions": [{"start": 124, "end": 138}, {"start": 150, "end": 164}, {"start": 176, "end": 190}, {"start": 230, "end": 251}, {"start": 252, "end": 274}, {"start": 329, "end": 345}], "desc": "6Pfruct_kin", "name": "PR00991"}, {"desc": "His_Pase_superF_clade-1", "name": "PF00300", "regions": [{"start": 251, "end": 398}]}, {"regions": [{"end": 249, "start": 27}], "name": "PF01591", "desc": "6Phosfructo_kin"}, {"name": "PIRSF000709", "desc": "Bifunct_6PFK/fruc_bisP_Ptase", "regions": [{"end": 468, "start": 1}]}, {"name": "SSF53254", "desc": "", "regions": [{"end": 469, "start": 251}]}, {"name": "PF08433", "desc": "Chromatin_KTI12", "regions": [{"start": 40, "end": 193}]}], "cdna_coding_end": 1489, "is_best_transcript": false, "start": 3856, "name": "ENST00000367079", "exons": [{"start": 3856, "name": "ENSE00001818425", "end": 3911}, {"end": 5347, "name": "ENSE00002901181", "start": 5246}, {"end": 12623, "start": 12498, "name": "ENSE00002934688"}, {"end": 13261, "name": "ENSE00002915251", "start": 13165}, {"end": 13754, "start": 13688, "name": "ENSE00002265243"}, {"end": 13966, "name": "ENSE00002223873", "start": 13892}, {"end": 14374, "name": "ENSE00002234935", "start": 14318}, {"start": 15581, "name": "ENSE00002233337", "end": 15705}, {"name": "ENSE00000842580", "start": 18044, "end": 18251}, {"name": "ENSE00000842581", "start": 18708, "end": 18854}, {"end": 20073, "start": 19969, "name": "ENSE00002876346"}, {"name": "ENSE00001595535", "start": 20825, "end": 20954}, {"end": 21795, "start": 21733, "name": "ENSE00002889740"}, {"name": "ENSE00002760808", "start": 22054, "end": 22118}, {"start": 
29499, "name": "ENSE00001443434", "end": 31569}]}, {"aliases": [], "end": 22950, "cdna_coding_start": 227, "domains": [{"regions": [{"end": 160, "start": 7}], "name": "PF08433", "desc": "Chromatin_KTI12"}, {"regions": [{"start": 1, "end": 455}], "desc": "Bifunct_6PFK/fruc_bisP_Ptase", "name": "PIRSF000709"}, {"regions": [{"end": 436, "start": 218}], "desc": "", "name": "SSF53254"}, {"regions": [{"end": 105, "start": 91}, {"end": 131, "start": 117}, {"start": 143, "end": 157}, {"start": 197, "end": 218}, {"end": 241, "start": 219}, {"start": 296, "end": 312}], "name": "PR00991", "desc": "6Pfruct_kin"}, {"name": "PF01591", "desc": "6Phosfructo_kin", "regions": [{"end": 216, "start": 2}]}, {"name": "PF00300", "desc": "His_Pase_superF_clade-1", "regions": [{"start": 218, "end": 365}]}, {"name": "SM00855", "desc": "His_Pase_superF_clade-1", "regions": [{"start": 218, "end": 365}]}, {"regions": [{"start": 1, "end": 217}], "name": "SSF52540", "desc": ""}], "cdna_coding_end": 1645, "is_best_transcript": false, "start": 5245, "exons": [{"start": 5245, "name": "ENSE00002308738", "end": 5347}, {"name": "ENSE00002226272", "start": 11367, "end": 11475}, {"end": 12623, "name": "ENSE00002905114", "start": 12498}, {"end": 13261, "start": 13165, "name": "ENSE00002915251"}, {"name": "ENSE00002265243", "start": 13688, "end": 13754}, {"end": 13966, "start": 13892, "name": "ENSE00002223873"}, {"end": 14374, "name": "ENSE00002234935", "start": 14318}, {"end": 15705, "start": 15581, "name": "ENSE00002233337"}, {"end": 18251, "start": 18044, "name": "ENSE00000842580"}, {"end": 18854, "start": 18708, "name": "ENSE00000842581"}, {"end": 20073, "name": "ENSE00002876346", "start": 19969}, {"name": "ENSE00001595535", "start": 20825, "end": 20954}, {"name": "ENSE00002889740", "start": 21733, "end": 21795}, {"name": "ENSE00002760808", "start": 22054, "end": 22118}, {"name": "ENSE00002308130", "start": 22749, "end": 22950}], "name": "ENST00000545806"}, {"exons": [{"end": 162, "start": 1, "name": 
"ENSE00002231494"}, {"name": "ENSE00002892386", "start": 5246, "end": 5347}, {"name": "ENSE00002797095", "start": 12498, "end": 12623}, {"name": "ENSE00002308344", "start": 13092, "end": 13261}, {"name": "ENSE00002265243", "start": 13688, "end": 13754}, {"end": 13966, "name": "ENSE00002223873", "start": 13892}, {"start": 14318, "name": "ENSE00002234935", "end": 14374}, {"start": 15581, "name": "ENSE00002233337", "end": 15705}, {"end": 18251, "start": 18044, "name": "ENSE00000842580"}, {"end": 18854, "start": 18708, "name": "ENSE00000842581"}, {"end": 20073, "start": 19969, "name": "ENSE00002876346"}, {"name": "ENSE00001595535", "start": 20825, "end": 20954}, {"end": 21795, "start": 21733, "name": "ENSE00002889740"}, {"start": 22054, "name": "ENSE00002760808", "end": 22118}, {"end": 29679, "start": 29499, "name": "ENSE00001791500"}], "start": 1, "name": "ENST00000411990", "is_best_transcript": false, "cdna_coding_end": 1668, "domains": [{"desc": "Bifunct_6PFK/fruc_bisP_Ptase", "name": "PIRSF000709", "regions": [{"start": 1, "end": 370}]}, {"desc": "", "name": "SSF53254", "regions": [{"end": 371, "start": 153}]}, {"desc": "", "name": "SSF52540", "regions": [{"start": 4, "end": 152}]}, {"desc": "6Pfruct_kin", "name": "PR00991", "regions": [{"start": 26, "end": 40}, {"start": 52, "end": 66}, {"start": 78, "end": 92}, {"start": 132, "end": 153}, {"end": 176, "start": 154}, {"start": 231, "end": 247}]}, {"name": "PF01591", "desc": "6Phosfructo_kin", "regions": [{"end": 151, "start": 1}]}, {"regions": [{"end": 300, "start": 153}], "name": "PF00300", "desc": "His_Pase_superF_clade-1"}, {"regions": [{"start": 153, "end": 300}], "name": "SM00855", "desc": "His_Pase_superF_clade-1"}], "aliases": [], "cdna_coding_start": 547, "end": 29679}, {"cdna_coding_start": 84, "end": 29931, "aliases": [], "cdna_coding_end": 878, "domains": [{"regions": [{"start": 1, "end": 64}], "name": "SSF52540", "desc": ""}, {"desc": "", "name": "SSF53254", "regions": [{"end": 262, "start": 65}]}, 
{"desc": "Bifunct_6PFK/fruc_bisP_Ptase", "name": "PIRSF000709", "regions": [{"start": 1, "end": 261}]}, {"regions": [{"start": 65, "end": 212}], "name": "SM00855", "desc": "His_Pase_superF_clade-1"}, {"name": "PF01591", "desc": "6Phosfructo_kin", "regions": [{"end": 63, "start": 1}]}, {"desc": "6Pfruct_kin", "name": "PR00991", "regions": [{"start": 44, "end": 65}, {"start": 66, "end": 88}, {"start": 143, "end": 159}]}, {"regions": [{"end": 212, "start": 65}], "name": "PF00300", "desc": "His_Pase_superF_clade-1"}], "is_best_transcript": false, "start": 15549, "name": "ENST00000541914", "exons": [{"name": "ENSE00002294124", "start": 15549, "end": 15705}, {"name": "ENSE00000842580", "start": 18044, "end": 18251}, {"start": 18708, "name": "ENSE00000842581", "end": 18854}, {"end": 20073, "start": 19969, "name": "ENSE00002876346"}, {"start": 20825, "name": "ENSE00001595535", "end": 20954}, {"name": "ENSE00002760808", "start": 22054, "end": 22118}, {"name": "ENSE00002284395", "start": 29499, "end": 29931}]}], "start": 1, "name": "ENSG00000123836"}, {"start": 1, "name": "ENSG00000187416", "chr": "gene4", "aliases": ["LHFPL3"], "transcripts": [{"domains": [{"regions": [{"start": 22, "end": 199}], "desc": "Lipome_HGMIC_fus_partner-like", "name": "PF10242"}], "cdna_coding_end": 835, "aliases": [], "end": 578576, "cdna_coding_start": 167, "exons": [{"end": 569, "name": "ENSE00001713788", "start": 1}, {"end": 408255, "start": 408019, "name": "ENSE00001349915"}, {"end": 578576, "start": 577531, "name": "ENSE00001723245"}], "start": 1, "name": "ENST00000424859", "is_best_transcript": false}, {"start": 45, "name": "ENST00000401970", "exons": [{"end": 569, "name": "ENSE00001554382", "start": 45}, {"end": 408255, "start": 408019, "name": "ENSE00001349915"}, {"end": 516773, "name": "ENSE00001746769", "start": 516726}, {"end": 578098, "start": 577531, "name": "ENSE00001593689"}], "is_best_transcript": false, "domains": [{"desc": "Lipome_HGMIC_fus_partner-like", "name": "PF10242", 
"regions": [{"end": 199, "start": 22}]}], "cdna_coding_end": 779, "aliases": [], "end": 578098, "cdna_coding_start": 123}, {"end": 579898, "cdna_coding_start": 125, "aliases": ["NP_945351.1", "NM_199000.2"], "domains": [{"desc": "Lipome_HGMIC_fus_partner-like", "name": "PF10242", "regions": [{"end": 213, "start": 36}]}, {"regions": [{"end": 22, "start": 4}], "desc": "", "name": "PS50310"}], "cdna_coding_end": 835, "is_best_transcript": true, "start": 1, "name": "ENST00000535008", "exons": [{"start": 1, "name": "ENSE00002286305", "end": 573}, {"end": 141399, "name": "ENSE00002499039", "start": 141389}, {"name": "ENSE00002324321", "start": 294348, "end": 294368}, {"end": 408255, "name": "ENSE00002278044", "start": 408055}, {"name": "ENSE00002230996", "start": 577531, "end": 579898}]}, {"aliases": [], "cdna_coding_start": 81, "end": 578099, "cdna_coding_end": 779, "domains": [{"name": "PS50310", "desc": "", "regions": [{"start": 4, "end": 22}]}, {"desc": "Lipome_HGMIC_fus_partner-like", "name": "PF10242", "regions": [{"end": 213, "start": 36}]}], "is_best_transcript": false, "name": "ENST00000543266", "start": 45, "exons": [{"end": 573, "name": "ENSE00002312340", "start": 45}, {"name": "ENSE00002499039", "start": 141389, "end": 141399}, {"start": 294348, "name": "ENSE00002324321", "end": 294368}, {"end": 408255, "start": 408055, "name": "ENSE00002278044"}, {"end": 516773, "start": 516726, "name": "ENSE00001746769"}, {"end": 578099, "start": 577531, "name": "ENSE00002270291"}]}], "strand": "+", "end": 579898}, {"start": 1, "name": "ENSG00000122565", "strand": "+", "end": 12195, "transcripts": [{"cdna_coding_start": 429, "end": 12195, "aliases": ["NP_009207.2", "NM_007276.4"], "cdna_coding_end": 980, "domains": [{"name": "PR00504", "desc": "Chromo_dom_subgr", "regions": [{"end": 35, "start": 27}, {"end": 54, "start": 40}, {"end": 67, "start": 55}]}, {"name": "PF01393", "desc": "Chromo_shadow_dom", "regions": [{"start": 119, "end": 176}]}, {"regions": [{"end": 175, 
"start": 106}, {"start": 15, "end": 79}], "name": "SSF54160", "desc": "Chromodomain-like"}, {"regions": [{"end": 78, "start": 30}], "name": "PF00385", "desc": "Chromo_domain"}, {"regions": [{"start": 29, "end": 81}, {"start": 120, "end": 172}], "name": "SM00298", "desc": "Chromo_domain/shadow"}, {"regions": [{"end": 177, "start": 115}], "name": "SM00300", "desc": "Chromo_shadow_dom"}, {"desc": "Chromo_domain/shadow", "name": "PS50013", "regions": [{"end": 88, "start": 30}, {"end": 179, "start": 121}]}], "is_best_transcript": true, "exons": [{"end": 400, "start": 1, "name": "ENSE00001189828"}, {"end": 1861, "name": "ENSE00001433792", "start": 1810}, {"start": 5207, "name": "ENSE00002858776", "end": 5349}, {"start": 7232, "name": "ENSE00002882596", "end": 7394}, {"end": 10595, "start": 10501, "name": "ENSE00002768648"}, {"start": 10921, "name": "ENSE00002778428", "end": 12195}], "start": 1, "name": "ENST00000337620"}, {"start": 532, "name": "ENST00000396386", "exons": [{"start": 532, "name": "ENSE00001524790", "end": 665}, {"start": 1810, "name": "ENSE00001433792", "end": 1861}, {"end": 5349, "name": "ENSE00002858776", "start": 5207}, {"name": "ENSE00002882596", "start": 7232, "end": 7394}, {"start": 10501, "name": "ENSE00002768648", "end": 10595}, {"name": "ENSE00001553667", "start": 10921, "end": 12190}], "is_best_transcript": false, "cdna_coding_end": 714, "domains": [{"regions": [{"start": 115, "end": 177}], "name": "SM00300", "desc": "Chromo_shadow_dom"}, {"regions": [{"end": 88, "start": 30}, {"end": 179, "start": 121}], "desc": "Chromo_domain/shadow", "name": "PS50013"}, {"name": "SM00298", "desc": "Chromo_domain/shadow", "regions": [{"start": 29, "end": 81}, {"start": 120, "end": 172}]}, {"desc": "Chromo_dom_subgr", "name": "PR00504", "regions": [{"end": 35, "start": 27}, {"end": 54, "start": 40}, {"start": 55, "end": 67}]}, {"regions": [{"start": 106, "end": 175}, {"start": 15, "end": 79}], "name": "SSF54160", "desc": "Chromodomain-like"}, {"name": 
"PF01393", "desc": "Chromo_shadow_dom", "regions": [{"start": 119, "end": 176}]}, {"regions": [{"end": 78, "start": 30}], "name": "PF00385", "desc": "Chromo_domain"}], "cdna_coding_start": 163, "end": 12190, "aliases": ["NP_057671.2", "NM_016587.3"]}, {"cdna_coding_end": 1068, "domains": [{"name": "PF00385", "desc": "Chromo_domain", "regions": [{"end": 62, "start": 30}]}, {"name": "SSF54160", "desc": "Chromodomain-like", "regions": [{"start": 22, "end": 62}]}, {"desc": "Chromo_dom_subgr", "name": "PR00504", "regions": [{"end": 35, "start": 27}, {"start": 40, "end": 54}, {"start": 55, "end": 62}]}, {"regions": [{"start": 30, "end": 62}], "name": "PS50013", "desc": "Chromo_domain/shadow"}], "cdna_coding_start": 882, "end": 7251, "aliases": [], "start": 596, "name": "ENST00000456948", "exons": [{"end": 1448, "start": 596, "name": "ENSE00001696375"}, {"start": 1810, "name": "ENSE00001433792", "end": 1861}, {"end": 5349, "name": "ENSE00002858776", "start": 5207}, {"start": 7232, "name": "ENSE00001778411", "end": 7251}], "is_best_transcript": false}, {"is_best_transcript": false, "start": 604, "name": "ENST00000409747", "exons": [{"end": 665, "start": 604, "name": "ENSE00001588865"}, {"end": 1861, "start": 1810, "name": "ENSE00001433792"}, {"start": 5207, "name": "ENSE00002858776", "end": 5349}, {"name": "ENSE00001577751", "start": 7305, "end": 7394}, {"start": 10501, "name": "ENSE00002765595", "end": 10595}, {"start": 10921, "name": "ENSE00002880768", "end": 12195}], "cdna_coding_start": 91, "end": 12195, "aliases": [], "cdna_coding_end": 396, "domains": [{"regions": [{"start": 15, "end": 56}], "desc": "Chromodomain-like", "name": "SSF54160"}, {"regions": [{"end": 57, "start": 30}], "desc": "Chromo_domain", "name": "PF00385"}, {"desc": "Chromo_domain/shadow", "name": "PS50013", "regions": [{"start": 30, "end": 91}]}]}], "aliases": ["CBX3"], "chr": "gene5"}, {"name": "ENSG00000171862", "start": 1, "end": 108818, "strand": "+", "transcripts": [{"aliases": ["NP_000305.3", 
"NM_000314.4"], "cdna_coding_start": 1358, "end": 108818, "cdna_coding_end": 2569, "domains": [{"desc": "C2_Ca/lipid-bd_dom_CaLB", "name": "SSF49562", "regions": [{"start": 188, "end": 351}]}, {"regions": [{"end": 159, "start": 80}], "name": "PF00782", "desc": "Dual-sp_phosphatase_cat-dom"}, {"desc": "Bifunc_PIno_P3_Pase/Pase_PTEN", "name": "PIRSF038025", "regions": [{"start": 1, "end": 403}]}, {"regions": [{"start": 59, "end": 181}], "name": "PF00102", "desc": "Tyr_Pase_rcpt/non-rcpt"}, {"desc": "Tensin_phosphatase_C2-dom", "name": "PF10409", "regions": [{"end": 349, "start": 188}]}, {"regions": [{"end": 350, "start": 190}], "desc": "Tensin_phosphatase_C2-dom", "name": "PS51182"}, {"name": "SM00404", "desc": "Tyr_Pase_cat", "regions": [{"end": 183, "start": 23}]}, {"regions": [{"end": 187, "start": 14}], "desc": "", "name": "SSF52799"}, {"regions": [{"start": 102, "end": 173}], "desc": "Tyr/Dual-specificity_Pase", "name": "PS50056"}, {"regions": [{"end": 185, "start": 14}], "name": "PS51181", "desc": "Phosphatase_tensin-typ"}], "is_best_transcript": true, "start": 1, "name": "ENST00000371953", "exons": [{"end": 1436, "start": 1, "name": "ENSE00001456562"}, {"end": 30997, "name": "ENSE00001156351", "start": 30913}, {"end": 62445, "start": 62401, "name": "ENSE00001156344"}, {"end": 67977, "name": "ENSE00002779611", "start": 67934}, {"start": 69901, "name": "ENSE00001156330", "end": 70139}, {"end": 89147, "start": 89006, "name": "ENSE00001156327"}, {"end": 94907, "start": 94741, "name": "ENSE00002737042"}, {"start": 97782, "name": "ENSE00001156315", "end": 98006}, {"end": 108818, "name": "ENSE00001456541", "start": 102175}]}], "aliases": ["PTEN"], "chr": "gene6"}], "best_transcript_file": "/home/creisle/svn/ensembl_flatfiles/ens69_best_transcript.txt", "script": "generate_ensembl_json_temp.pl", "hugo_mapping_file": "/projects/tumour_char/analysis_scripts/databases/processed_files/drug_target_tables/compiled_gene_drug_pathway.v1_2_4.tsv", "generation_time": "Tue Feb 
28 11:58:00 2017", "script_version": "2.1.4"} diff --git a/tests/data/mock_reference_annotations.full.json b/tests/data/mock_reference_annotations.full.json new file mode 100644 index 00000000..836273b0 --- /dev/null +++ b/tests/data/mock_reference_annotations.full.json @@ -0,0 +1 @@ +{"genes": [{"aliases": ["C9orf47"], "chr": "fakereference9", "end": 5278, "name": "ENSG00000186354", "start": 1, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": 685, "cdna_coding_start": 134, "domains": [], "end": 5278, "exons": [{"end": 322, "start": 1}, {"end": 833, "start": 608}, {"end": 5278, "start": 990}], "is_best_transcript": true, "name": "ENST00000375851", "start": 1}, {"aliases": [], "cdna_coding_end": 783, "cdna_coding_start": 76, "domains": [], "end": 1202, "exons": [{"end": 322, "start": 59}, {"end": 1202, "start": 608}], "is_best_transcript": false, "name": "ENST00000375850", "start": 59}, {"aliases": [], "cdna_coding_end": 677, "cdna_coding_start": 69, "domains": [], "end": 5278, "exons": [{"end": 379, "start": 66}, {"end": 833, "start": 608}, {"end": 5278, "start": 990}], "is_best_transcript": false, "name": "ENST00000334490", "start": 66}]}, {"aliases": ["S1PR3"], "chr": "fakereference9", "end": 14148, "name": "ENSG00000213694", "start": 585, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": 1533, "cdna_coding_start": 397, "domains": [{"name": "PR00362", "regions": [{"end": 62, "start": 49}, {"end": 200, "start": 185}, {"end": 308, "start": 298}]}, {"name": "PR00642", "regions": [{"end": 75, "start": 63}, {"end": 112, "start": 102}, {"end": 155, "start": 139}, {"end": 345, "start": 329}]}, {"name": "PS50262", "regions": [{"end": 298, "start": 56}]}, {"name": "PF10320", "regions": [{"end": 312, "start": 55}]}, {"name": "SSF81321", "regions": [{"end": 340, "start": 1}]}, {"name": "PR00237", "regions": [{"end": 65, "start": 41}, {"end": 95, "start": 74}, {"end": 140, "start": 118}, {"end": 174, "start": 153}, {"end": 219, "start": 196}, 
{"end": 265, "start": 241}, {"end": 306, "start": 280}]}, {"name": "PR01523", "regions": [{"end": 25, "start": 13}, {"end": 101, "start": 92}, {"end": 123, "start": 112}, {"end": 204, "start": 194}, {"end": 224, "start": 215}, {"end": 283, "start": 272}, {"end": 311, "start": 301}]}, {"name": "PF00001", "regions": [{"end": 298, "start": 56}]}, {"name": "PR01524", "regions": [{"end": 40, "start": 24}, {"end": 155, "start": 139}, {"end": 233, "start": 223}, {"end": 323, "start": 314}, {"end": 340, "start": 326}]}], "end": 14148, "exons": [{"end": 833, "start": 585}, {"end": 14148, "start": 10192}], "is_best_transcript": false, "name": "ENST00000358157", "start": 585}, {"aliases": [], "cdna_coding_end": 5832, "cdna_coding_start": 4696, "domains": [{"name": "PF10320", "regions": [{"end": 312, "start": 55}]}, {"name": "PR00362", "regions": [{"end": 62, "start": 49}, {"end": 200, "start": 185}, {"end": 308, "start": 298}]}, {"name": "PS50262", "regions": [{"end": 298, "start": 56}]}, {"name": "PR00642", "regions": [{"end": 75, "start": 63}, {"end": 112, "start": 102}, {"end": 155, "start": 139}, {"end": 345, "start": 329}]}, {"name": "PR00237", "regions": [{"end": 65, "start": 41}, {"end": 95, "start": 74}, {"end": 140, "start": 118}, {"end": 174, "start": 153}, {"end": 219, "start": 196}, {"end": 265, "start": 241}, {"end": 306, "start": 280}]}, {"name": "PR01523", "regions": [{"end": 25, "start": 13}, {"end": 101, "start": 92}, {"end": 123, "start": 112}, {"end": 204, "start": 194}, {"end": 224, "start": 215}, {"end": 283, "start": 272}, {"end": 311, "start": 301}]}, {"name": "PR01524", "regions": [{"end": 40, "start": 24}, {"end": 155, "start": 139}, {"end": 233, "start": 223}, {"end": 323, "start": 314}, {"end": 340, "start": 326}]}, {"name": "PF00001", "regions": [{"end": 298, "start": 56}]}, {"name": "SSF81321", "regions": [{"end": 340, "start": 1}]}], "end": 14148, "exons": [{"end": 14148, "start": 5644}], "is_best_transcript": true, "name": "ENST00000375846", 
"start": 5644}]}]} \ No newline at end of file diff --git a/tests/data/mock_reference_annotations.full.tsv b/tests/data/mock_reference_annotations.full.tsv deleted file mode 100644 index 7ead95e3..00000000 --- a/tests/data/mock_reference_annotations.full.tsv +++ /dev/null @@ -1,6 +0,0 @@ -ensembl_gene_id hugo_names chr strand gene_start gene_end best_ensembl_transcript_id ensembl_transcript_id refseq_equivalents transcript_genomic_start transcript_genomic_end cdna_coding_start cdna_coding_end genomic_exon_ranges AA_domain_ranges -ENSG00000186354 C9orf47 fakereference9 1 1 5278 ENST00000375851 ENST00000375851 NP_001135885.1;NM_001142413.1 1 5278 134 685 1-322;608-833;990-5278 -ENSG00000186354 C9orf47 fakereference9 1 1 5278 ENST00000375851 ENST00000375850 59 1202 76 783 59-322;608-1202 -ENSG00000186354 C9orf47 fakereference9 1 1 5278 ENST00000375851 ENST00000334490 NP_001001938.1;NM_001001938.3 66 5278 69 677 66-379;608-833;990-5278 -ENSG00000213694 S1PR3 fakereference9 1 585 14148 ENST00000375846 ENST00000358157 NP_005217.2;NM_005226.3 585 14148 397 1533 585-833;10192-14148 PR00362:49-62,185-200,298-308;PR00642:63-75,102-112,139-155,329-345;PS50262:56-298;PF10320:55-312;SSF81321:1-340;PR00237:41-65,74-95,118-140,153-174,196-219,241-265,280-306;PR01523:13-25,92-101,112-123,194-204,215-224,272-283,301-311;PF00001:56-298;PR01524:24-40,139-155,223-233,314-323,326-340 -ENSG00000213694 S1PR3 fakereference9 1 585 14148 ENST00000375846 ENST00000375846 5644 14148 4696 5832 5644-14148 PF10320:55-312;PR00362:49-62,185-200,298-308;PS50262:56-298;PR00642:63-75,102-112,139-155,329-345;PR00237:41-65,74-95,118-140,153-174,196-219,241-265,280-306;PR01523:13-25,92-101,112-123,194-204,215-224,272-283,301-311;PR01524:24-40,139-155,223-233,314-323,326-340;PF00001:56-298;SSF81321:1-340 diff --git a/tests/data/mock_reference_annotations.json b/tests/data/mock_reference_annotations.json index 28b06e57..44c96d4e 100644 --- a/tests/data/mock_reference_annotations.json +++ 
b/tests/data/mock_reference_annotations.json @@ -4,7 +4,7 @@ "chr": "fake", "start": 1, "end": 1000, - "strand": "1", + "strand": "+", "name": "ENSG0001", "aliases": [], "transcripts": [ diff --git a/tests/data/mock_reference_annotations.tsv b/tests/data/mock_reference_annotations.tsv deleted file mode 100644 index 14391a69..00000000 --- a/tests/data/mock_reference_annotations.tsv +++ /dev/null @@ -1,7 +0,0 @@ -ensembl_gene_id chr strand gene_start gene_end ensembl_transcript_id transcript_genomic_start transcript_genomic_end -GENE-A fake + 100 200 TRANSCRIPT-A 100 200 -GENE-B fake - 250 350 TRANSCRIPT-B 250 350 -GENE-C fake + 300 400 TRANSCRIPT-C 300 400 -GENE-D fake - 450 550 TRANSCRIPT-D 450 550 -GENE-E fake + 500 600 TRANSCRIPT-E 500 600 -GENE-F fake + 550 650 TRANSCRIPT-E 550 650 diff --git a/tests/data/mock_reference_annotations2.json b/tests/data/mock_reference_annotations2.json new file mode 100644 index 00000000..f1ef1c50 --- /dev/null +++ b/tests/data/mock_reference_annotations2.json @@ -0,0 +1 @@ +{"genes": [{"aliases": [], "chr": "fake", "end": 200, "name": "GENE-A", "start": 100, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": null, "cdna_coding_start": null, "domains": [], "end": 200, "exons": [], "is_best_transcript": true, "name": "TRANSCRIPT-A", "start": 100}]}, {"aliases": [], "chr": "fake", "end": 350, "name": "GENE-B", "start": 250, "strand": "-", "transcripts": [{"aliases": [], "cdna_coding_end": null, "cdna_coding_start": null, "domains": [], "end": 350, "exons": [], "is_best_transcript": true, "name": "TRANSCRIPT-B", "start": 250}]}, {"aliases": [], "chr": "fake", "end": 400, "name": "GENE-C", "start": 300, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": null, "cdna_coding_start": null, "domains": [], "end": 400, "exons": [], "is_best_transcript": true, "name": "TRANSCRIPT-C", "start": 300}]}, {"aliases": [], "chr": "fake", "end": 550, "name": "GENE-D", "start": 450, "strand": "-", "transcripts": 
[{"aliases": [], "cdna_coding_end": null, "cdna_coding_start": null, "domains": [], "end": 550, "exons": [], "is_best_transcript": true, "name": "TRANSCRIPT-D", "start": 450}]}, {"aliases": [], "chr": "fake", "end": 600, "name": "GENE-E", "start": 500, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": null, "cdna_coding_start": null, "domains": [], "end": 600, "exons": [], "is_best_transcript": true, "name": "TRANSCRIPT-E", "start": 500}]}, {"aliases": [], "chr": "fake", "end": 650, "name": "GENE-F", "start": 550, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": null, "cdna_coding_start": null, "domains": [], "end": 650, "exons": [], "is_best_transcript": true, "name": "TRANSCRIPT-E", "start": 550}]}]} \ No newline at end of file diff --git a/tests/integration/test_annotate.py b/tests/integration/test_annotate.py index bf816b70..d6e5e2ed 100644 --- a/tests/integration/test_annotate.py +++ b/tests/integration/test_annotate.py @@ -31,7 +31,7 @@ def setUpModule(): global REFERENCE_ANNOTATIONS, REFERENCE_GENOME, REF_CHR, EXAMPLE_GENES EXAMPLE_GENES = get_example_genes() - REFERENCE_ANNOTATIONS = load_annotations(get_data('mock_reference_annotations.tsv')) + REFERENCE_ANNOTATIONS = load_annotations(get_data('mock_reference_annotations2.json')) count = sum([len(genes) for genes in REFERENCE_ANNOTATIONS.values()]) print('loaded annotations', count) assert count >= 6 # make sure this is the file we expect @@ -1737,7 +1737,7 @@ def test_calculate_orf_nested(self): class TestAnnotateEvents: def test_annotate_events(self): - reference_annotations = load_annotations(get_data('mock_reference_annotations.full.tsv')) + reference_annotations = load_annotations(get_data('mock_reference_annotations.full.json')) b1 = Breakpoint('fakereference9', 658, orient=ORIENT.RIGHT, strand=STRAND.POS) b2 = Breakpoint('fakereference9', 10237, orient=ORIENT.RIGHT, strand=STRAND.NEG) bpp = BreakpointPair( diff --git a/tests/integration/test_annotate_fileio.py 
b/tests/integration/test_annotate_fileio.py index 926928aa..7b1a09de 100644 --- a/tests/integration/test_annotate_fileio.py +++ b/tests/integration/test_annotate_fileio.py @@ -1,40 +1,11 @@ -from mavis.annotate.file_io import convert_tab_to_json, load_annotations +from mavis.annotate.file_io import load_annotations from ..util import get_data -TAB = get_data('annotations_subsample.tab') JSON = get_data('annotations_subsample.json') class TestAnnotationLoading: - def test_convert_tab_to_json(self): - json = convert_tab_to_json(TAB, warn=print) - assert len(json['genes']) == 32 - - def test_tab_equivalent_to_json(self): - tab_result = load_annotations(TAB, warn=print) - json_result = load_annotations(JSON, warn=print) - assert sorted(json_result.keys()) == sorted(tab_result.keys()) - - def test_load_tab(self): - result = load_annotations(TAB, warn=print) - assert len(result.keys()) == 12 - domains = [] - for gene in result['12']: - for t in gene.spliced_transcripts: - print(t) - if t.unspliced_transcript.name == 'ENST00000550458': - tl = t.translations[0] - domains = tl.domains - break - if domains: - break - for d in domains: - print(d.name, d.regions) - assert len(domains) == 2 - result = load_annotations(get_data('mock_reference_annotations.tsv'), warn=print) - assert len(result.keys()) == 1 - def test_load_json(self): result = load_annotations(JSON, warn=print) assert len(result.keys()) == 12 From 2c9bca0a6c1a8a05437bcd1ddaf6e442339cffb9 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 20 Jan 2022 23:59:47 -0800 Subject: [PATCH 085/137] Add conversion script to migration guide --- docs/migrating.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/migrating.md b/docs/migrating.md index d56e17c3..91fb0d4f 100644 --- a/docs/migrating.md +++ b/docs/migrating.md @@ -19,3 +19,13 @@ config file ### Scheduling MAVIS is now integrated with snakemake instead of handling its own scheduling + +## Reference Annotation Files + +MAVIS no longer 
supports the previously deprecated tab-delimited format of the annotations file. If you are still using these files in your project we have provided a script to automatically convert them to the newer format in the tools directory + +```bash +python src/tools/migrate_mavis_annotations_to_jsonl.py \ + /path/to/tab/file.tab \ + /path/to/new/json/file.json +``` From 02c108f779341f8604e8c1586c78e3c57692fe33 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Sat, 22 Jan 2022 12:16:21 -0800 Subject: [PATCH 086/137] Add some schema failure unit tests for annotations.jon --- tests/end_to_end/test_convert.py | 4 ++-- tests/unit/test_annotate_fileio.py | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 tests/unit/test_annotate_fileio.py diff --git a/tests/end_to_end/test_convert.py b/tests/end_to_end/test_convert.py index e0b29e8e..671e4d10 100644 --- a/tests/end_to_end/test_convert.py +++ b/tests/end_to_end/test_convert.py @@ -114,14 +114,14 @@ def test_sniffle(self): record = results['vcf-35777'][0] print(record, record.data) assert record.data['event_type'] == 'translocation' - + def test_cuteSV(self): results = self.run_main(get_data('cuteSV.vcf'), SUPPORTED_TOOL.VCF, False) print(results.keys()) record = results['vcf-cuteSV.BND.0'][0] print(record, record.data) assert record.data['event_type'] == 'inverted translocation' - + def test_breakseq2(self): self.run_main(get_data('breakseq.vcf'), SUPPORTED_TOOL.BREAKSEQ, False) diff --git a/tests/unit/test_annotate_fileio.py b/tests/unit/test_annotate_fileio.py new file mode 100644 index 00000000..43823519 --- /dev/null +++ b/tests/unit/test_annotate_fileio.py @@ -0,0 +1,22 @@ +import json + +import pytest +from mavis.annotate.file_io import load_annotations + + +@pytest.mark.parametrize( + 'annotations,error_message_include', + [ + [{'genes': []}, "schema['properties']['genes']"], + [ + {'genes': [{'start': '1'}]}, + 
"schema['properties']['genes']['items']['properties']['start']", + ], + ], +) +def test_min_genes_error(annotations, error_message_include, tmp_path): + filename = tmp_path / "annotations.json" + filename.write_text(json.dumps(annotations)) + with pytest.raises(AssertionError) as exc: + load_annotations(str(filename)) + assert error_message_include in str(exc.value) From 7e9b9091583182878fff157eeed146ba6d360f07 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 26 Jan 2022 12:43:40 -0800 Subject: [PATCH 087/137] Standardize logging --- src/mavis/align.py | 35 ++--- src/mavis/annotate/file_io.py | 20 ++- src/mavis/annotate/main.py | 41 +++--- src/mavis/annotate/variant.py | 11 +- src/mavis/assemble.py | 31 ++--- src/mavis/bam/cache.py | 18 ++- src/mavis/bam/stats.py | 12 +- src/mavis/blat.py | 11 +- src/mavis/cluster/cluster.py | 8 +- src/mavis/cluster/main.py | 39 +++--- src/mavis/config.py | 10 +- src/mavis/constants.py | 3 +- src/mavis/illustrate/diagram.py | 14 +- src/mavis/illustrate/scatter.py | 13 +- src/mavis/interval.py | 2 +- src/mavis/main.py | 14 +- src/mavis/overlay.py | 11 +- src/mavis/pairing/main.py | 16 +-- src/mavis/pairing/pairing.py | 18 +-- src/mavis/summary/main.py | 12 +- src/mavis/tools/__init__.py | 20 ++- src/mavis/tools/vcf.py | 12 +- src/mavis/util.py | 98 ++++---------- src/mavis/validate/base.py | 29 ++-- src/mavis/validate/main.py | 158 ++++++++-------------- src/tools/calculate_ref_alt_counts.py | 19 ++- tests/integration/test_annotate_fileio.py | 2 +- tests/integration/test_args.py | 20 +-- tests/integration/test_assemble.py | 5 +- tests/integration/test_bam.py | 12 +- tests/snakemake/test_mini_workflow.py | 16 +++ 31 files changed, 280 insertions(+), 450 deletions(-) diff --git a/src/mavis/align.py b/src/mavis/align.py index 20984ba6..df74feb7 100644 --- a/src/mavis/align.py +++ b/src/mavis/align.py @@ -5,27 +5,15 @@ import os import re import subprocess -import warnings -from copy import copy import pysam from .bam import 
cigar as _cigar from .bam import read as _read from .breakpoint import Breakpoint, BreakpointPair -from .constants import ( - CIGAR, - COLUMNS, - NA_MAPPING_QUALITY, - ORIENT, - STRAND, - SVTYPE, - MavisNamespace, - reverse_complement, -) -from .error import InvalidRearrangement +from .constants import CIGAR, ORIENT, STRAND, SVTYPE, MavisNamespace, reverse_complement from .interval import Interval -from .util import DEVNULL +from .util import logger class SUPPORTED_ALIGNER(MavisNamespace): @@ -234,7 +222,7 @@ def convert_to_duplication(alignment, reference_genome): opposing_strands=alignment.opposing_strands, read1=alignment.read1, read2=alignment.read2, - **alignment.data + **alignment.data, ) return result return alignment @@ -405,8 +393,7 @@ def align_sequences( blat_limit_top_aln=25, blat_min_identity=0.7, clean_files=True, - log=DEVNULL, - **kwargs + **kwargs, ): """ calls the alignment tool and parses the return output for a set of sequences @@ -428,7 +415,7 @@ def align_sequences( if not sequences: return [] - log('will use', aligner, 'to align', len(sequences), 'unique sequences', time_stamp=False) + logger.debug(f'will use {aligner} to align {len(sequences)} unique sequences') # call the aligner using subprocess if aligner == SUPPORTED_ALIGNER.BLAT: @@ -454,7 +441,7 @@ def align_sequences( blat_options, ] ) - log('writing aligner logging to:', aligner_output_log, time_stamp=False) + logger.debug(f'writing aligner logging to: {aligner_output_log}') with open(aligner_output_log, 'w') as log_fh: log_fh.write('>>> {}\n'.format(command)) subprocess.check_call(command, shell=True, stdout=log_fh, stderr=log_fh) @@ -471,7 +458,7 @@ def align_sequences( command = '{} -Y {} {} {}'.format( aligner, align_options, aligner_reference, aligner_fa_input_file ) - log('writing aligner logging to:', aligner_output_log, time_stamp=False) + logger.debug(f'writing aligner logging to: {aligner_output_log}') with open(aligner_output_log, 'w') as log_fh, open( aligner_output_file, 
'w' ) as aligner_output_fh: @@ -489,10 +476,8 @@ def align_sequences( try: read.reference_id = input_bam_cache.reference_id(read.reference_name) except KeyError: - log( - 'dropping alignment (unknown reference)', - read.reference_name, - time_stamp=False, + logger.warning( + f'dropping alignment (unknown reference): {read.reference_name}' ) else: if read.is_paired: @@ -523,7 +508,7 @@ def align_sequences( try: os.remove(outputfile) except OSError as err: - warnings.warn(repr(err)) + logger.warning(repr(err)) def select_contig_alignments(evidence, reads_by_query): diff --git a/src/mavis/annotate/file_io.py b/src/mavis/annotate/file_io.py index 6b1a16f2..9e6c92c2 100644 --- a/src/mavis/annotate/file_io.py +++ b/src/mavis/annotate/file_io.py @@ -13,7 +13,7 @@ from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, translate from ..interval import Interval -from ..util import DEVNULL, LOG +from ..util import logger from .base import BioInterval, ReferenceName from .genomic import Exon, Gene, PreTranscript, Template, Transcript from .protein import Domain, Translation @@ -60,7 +60,6 @@ def load_masking_regions(*filepaths: str) -> Dict[str, List[BioInterval]]: def load_annotations( *filepaths: str, - warn: Callable = DEVNULL, reference_genome: Optional[Dict[str, SeqRecord]] = None, best_transcripts_only: bool = False, ) -> Dict[str, List[Gene]]: @@ -86,7 +85,6 @@ def load_annotations( data, reference_genome=reference_genome, best_transcripts_only=best_transcripts_only, - warn=warn, ) for chrom in current_annotations: @@ -99,7 +97,6 @@ def parse_annotations_json( data, reference_genome: Optional[Dict[str, SeqRecord]] = None, best_transcripts_only=False, - warn=DEVNULL, ) -> Dict[str, List[Gene]]: """ parses a json of annotation information into annotation objects @@ -159,7 +156,9 @@ def parse_annotations_json( tx_length = transcript['cdna_coding_end'] - transcript['cdna_coding_start'] + 1 # check that the translation makes sense before including it if 
tx_length % CODON_SIZE != 0: - warn('Ignoring translation. The translated region is not a multiple of three') + logger.warning( + 'Ignoring translation. The translated region is not a multiple of three' + ) continue tx_length = tx_length // CODON_SIZE domains = [] @@ -180,7 +179,7 @@ def parse_annotations_json( ) ) except AssertionError as err: - warn(repr(err)) + logger.warning(repr(err)) translation = Translation( transcript['cdna_coding_start'], transcript['cdna_coding_end'], @@ -193,9 +192,8 @@ def parse_annotations_json( met = seq[translation.start - 1 : translation.start + 2] stop = seq[translation.end - CODON_SIZE : translation.end] if translate(met) != START_AA or translate(stop) != STOP_AA: - warn( - 'Sequence error. The sequence computed from the reference does look like ' - 'a valid translation' + logger.warning( + 'Sequence error. The sequence computed from the reference does look like a valid translation' ) continue spl_tx.translations.append(translation) @@ -375,12 +373,12 @@ def load(self, ignore_cache=False, verbose=True): return self if self.key in ReferenceFile.CACHE and not ignore_cache: if verbose: - LOG('cached content:', self.name) + logger.info(f'cached content: {self.name}') self.content = ReferenceFile.CACHE[self.key].content return self self.files_exist() try: - LOG('loading:', self.name, time_stamp=True) + logger.info(f'loading: {self.name}') self.content = self.loader(*self.name, **self.opt) ReferenceFile.CACHE[self.key] = self except Exception as err: diff --git a/src/mavis/annotate/main.py b/src/mavis/annotate/main.py index e1df8e68..9b08f34e 100644 --- a/src/mavis/annotate/main.py +++ b/src/mavis/annotate/main.py @@ -10,7 +10,7 @@ from ..error import DrawingFitError, NotSpecifiedError from ..illustrate.constants import DiagramSettings from ..illustrate.diagram import draw_sv_summary_diagram -from ..util import LOG, generate_complete_stamp, mkdirp, read_inputs +from ..util import generate_complete_stamp, logger, mkdirp, read_inputs 
from .constants import PASS_FILENAME from .file_io import ReferenceFile from .fusion import determine_prime @@ -54,12 +54,8 @@ def draw(drawing_config, ann, reference_genome, template_metadata, drawings_dire ) for i, (curr_width, other_settings) in enumerate(drawing_attempts): - LOG( - 'drawing attempt:', - i + 1, - str(curr_width) + 'px', - other_settings if other_settings else '', - time_stamp=False, + logger.info( + f'drawing attempt: {i + 1} {curr_width}px {other_settings if other_settings else ""}' ) try: drawing_config.width = curr_width @@ -68,7 +64,7 @@ def draw(drawing_config, ann, reference_genome, template_metadata, drawings_dire ann, reference_genome=reference_genome, templates=template_metadata, - **other_settings + **other_settings, ) gene_aliases1 = 'NA' @@ -99,10 +95,10 @@ def draw(drawing_config, ann, reference_genome, template_metadata, drawings_dire drawing = os.path.join(drawings_directory, name + '.svg') legend = os.path.join(drawings_directory, name + '.legend.json') - LOG('generating svg:', drawing, time_stamp=False) + logger.info(f'generating svg: {drawing}') canvas.saveas(drawing) - LOG('generating legend:', legend, time_stamp=False) + logger.info(f'generating legend: {legend}') with open(legend, 'w') as fh: json.dump(legend_json, fh) break @@ -118,7 +114,7 @@ def main( library: str, config: Dict, start_time=int(time.time()), - **kwargs + **kwargs, ): """ Args: @@ -148,7 +144,7 @@ def main( expand_orient=True, expand_svtype=True, ) - LOG('read {} breakpoint pairs'.format(len(bpps))) + logger.info(f'read {len(bpps)} breakpoint pairs') annotations.load() reference_genome.load() @@ -161,7 +157,6 @@ def main( min_domain_mapping_match=config['annotate.min_domain_mapping_match'], max_proximity=config['cluster.max_proximity'], max_orf_cap=config['annotate.max_orf_cap'], - log=LOG, filters=annotation_filters, ) @@ -187,9 +182,9 @@ def main( COLUMNS.protein_synon, } header = None - LOG('opening for write:', tabbed_output_file) + 
logger.info(f'opening for write: {tabbed_output_file}') tabbed_fh = open(tabbed_output_file, 'w') - LOG('opening for write:', fa_output_file) + logger.info(f'opening for write: {fa_output_file}') fasta_fh = open(fa_output_file, 'w') try: @@ -201,14 +196,10 @@ def main( header_req.update(ann_row.keys()) header = sort_columns(header_req) tabbed_fh.write('\t'.join([str(c) for c in header]) + '\n') - LOG( - '({} of {}) current annotation'.format(i + 1, total), - ann.annotation_id, - ann.transcript1, - ann.transcript2, - ann.event_type, + logger.info( + f'({i + 1} of {total}) current annotation {ann.annotation_id} {ann.transcript1} {ann.transcript2} {ann.event_type}' ) - LOG(ann, time_stamp=False) + logger.info(str(ann)) # get the reference sequences for either transcript ref_cdna_seq = {} ref_protein_seq = {} @@ -289,9 +280,9 @@ def main( rows = [ann_row] for row in rows: tabbed_fh.write('\t'.join([str(row.get(k, None)) for k in header]) + '\n') - generate_complete_stamp(output, LOG, start_time=start_time) + generate_complete_stamp(output, start_time=start_time) finally: - LOG('closing:', tabbed_output_file) + logger.info(f'closing: {tabbed_output_file}') tabbed_fh.close() - LOG('closing:', fa_output_file) + logger.info(f'closing: {fa_output_file}') fasta_fh.close() diff --git a/src/mavis/annotate/variant.py b/src/mavis/annotate/variant.py index 0a81f34b..7eaf2149 100644 --- a/src/mavis/annotate/variant.py +++ b/src/mavis/annotate/variant.py @@ -8,7 +8,7 @@ from ..constants import COLUMNS, GENE_PRODUCT_TYPE, PROTOCOL, STOP_AA, STRAND, SVTYPE from ..error import NotSpecifiedError from ..interval import Interval -from ..util import DEVNULL +from ..util import logger from .fusion import FusionTranscript, determine_prime from .genomic import Gene, IntergenicRegion, PreTranscript, Transcript @@ -62,7 +62,7 @@ def __init__( stranded=bpp.stranded, untemplated_seq=bpp.untemplated_seq, **bpp.data, - **kwargs + **kwargs, ) # match transcript to breakpoint if reveresed @@ -850,7 
+850,6 @@ def annotate_events( min_orf_size: int = 200, min_domain_mapping_match: float = 0.95, max_orf_cap: int = 3, - log: Callable = DEVNULL, filters: List[Callable] = None, ) -> List[Annotation]: """ @@ -873,7 +872,7 @@ def annotate_events( results = [] total = len(bpps) for i, bpp in enumerate(bpps): - log('({} of {}) gathering annotations for'.format(i + 1, total), bpp) + logger.info(f'({i + 1} of {total}) gathering annotations for {repr(bpp)}') bpp.data[COLUMNS.validation_id] = bpp.data.get(COLUMNS.validation_id, str(uuid())) ann_list = _gather_annotations(annotations, bpp, proximity=max_proximity) for f in filters: @@ -904,6 +903,6 @@ def annotate_events( except NotImplementedError: pass # anti-sense fusions will throw this error except KeyError as e: - log('warning. could not build fusion product', repr(e)) - log('generated', len(ann_list), 'annotations', time_stamp=False) + logger.warning(f'warning. could not build fusion product: {repr(e)}') + logger.info(f'generated {len(ann_list)} annotations') return results diff --git a/src/mavis/assemble.py b/src/mavis/assemble.py index 0683dbad..c5654924 100644 --- a/src/mavis/assemble.py +++ b/src/mavis/assemble.py @@ -1,5 +1,4 @@ import itertools -import warnings import distance import networkx as nx @@ -8,7 +7,7 @@ from .bam.read import calculate_alignment_score, nsb_align, sequence_complexity from .constants import reverse_complement from .interval import Interval -from .util import DEVNULL +from .util import logger class Contig: @@ -247,9 +246,7 @@ def digraph_connected_components(graph, subgraph=None): return nx.connected_components(g) -def pull_contigs_from_component( - assembly, component, min_edge_trim_weight, assembly_max_paths, log=DEVNULL -): +def pull_contigs_from_component(assembly, component, min_edge_trim_weight, assembly_max_paths): """ builds contigs from the a connected component of the assembly DeBruijn graph @@ -258,7 +255,6 @@ def pull_contigs_from_component( component (list): list of nodes 
which make up the connected component min_edge_trim_weight (int): the minimum weight to not remove a non cutting edge/path assembly_max_paths (int): the maximum number of paths allowed before the graph is further simplified - log (Callable): the log function Returns: Dict[str,int]: the paths/contigs and their scores @@ -287,12 +283,8 @@ def pull_contigs_from_component( if w > edge_weights[-1]: continue - log( - 'reducing estimated paths. Current estimate is {}+ from'.format(paths_est), - len(component), - 'nodes', - 'filter increase', - w, + logger.debug( + f'reducing estimated paths. Current estimate is {paths_est}+ from {len(component)} nodes filter increase {w}', ) assembly.trim_forks_by_freq(w) assembly.trim_noncutting_paths_by_freq(w) @@ -353,8 +345,7 @@ def assemble( assembly_max_paths=20, assembly_min_uniq=0.01, min_complexity=0, - log=lambda *pos, **kwargs: None, - **kwargs + **kwargs, ): """ for a set of sequences creates a DeBruijnGraph @@ -372,7 +363,6 @@ def assemble( min_contig_length: Minimum length of contigs assemble to attempt remapping reads to. 
Shorter contigs will be ignored remap_min_exact_match: see [assembly_min_exact_match_to_remap](/configuration/settings/#assembly_min_exact_match_to_remap) assembly_max_paths: see [assembly_max_paths](/configuration/settings/#assembly_max_paths) - log (Callable): the log function Returns: List[Contig]: a list of putative contigs @@ -404,7 +394,7 @@ def assemble( for component in digraph_connected_components(assembly): subgraph = assembly.subgraph(component) if not nx.is_directed_acyclic_graph(subgraph): - log('dropping cyclic component', time_stamp=False) + logger.debug('dropping cyclic component') for node in subgraph.get_nodes(): assembly.remove_node(node) # initial data cleaning @@ -422,12 +412,11 @@ def assemble( component, min_edge_trim_weight=min_edge_trim_weight, assembly_max_paths=assembly_max_paths, - log=log, ) ) # now map the contigs to the possible input sequences - log('filtering contigs by size and complexity', len(path_scores), time_stamp=False) + logger.debug(f'filtering contigs by size and complexity {len(path_scores)}') contigs = [] for seq, score in list(path_scores.items()): contig = Contig(seq, score) @@ -435,10 +424,10 @@ def assemble( not min_complexity or contig.complexity() >= min_complexity ): contigs.append(contig) - log('filtering similar contigs', len(contigs)) + logger.debug(f'filtering similar contigs {len(contigs)}') # remap the input reads contigs = filter_contigs(contigs, assembly_min_uniq) - log('remapping reads to {} contigs'.format(len(contigs))) + logger.debug(f'remapping reads to {len(contigs)} contigs') for input_seq in sequences: maps_to = {} # contig, score @@ -470,7 +459,7 @@ def assemble( assert len(best_alignments) >= 1 for contig, read in best_alignments: contig.add_mapped_sequence(read, len(best_alignments)) - log('assemblies complete') + logger.debug('assemblies complete') return contigs diff --git a/src/mavis/bam/cache.py b/src/mavis/bam/cache.py index dcd0c0f4..866d4003 100644 --- a/src/mavis/bam/cache.py +++ 
b/src/mavis/bam/cache.py @@ -1,14 +1,12 @@ import atexit -import logging import re -import warnings import pysam -from .read import SamRead +from .. import util as _util from ..annotate.base import ReferenceName from ..interval import Interval -from .. import util as _util +from .read import SamRead class BamCache: @@ -50,7 +48,7 @@ def add_read(self, read): read (pysam.AlignedSegment): the read to add to the cache """ if not read.is_unmapped and read.reference_start == read.reference_end: - _util.LOG('ignoring invalid read', read.query_name, level=logging.DEBUG) + _util.logger.debug(f'ignoring invalid read: {read.query_name}') return if not isinstance(read, SamRead): read = SamRead.copy(read) @@ -168,7 +166,7 @@ def fetch( if stop_on_cached_read and self.has_read(read): break if not read.is_unmapped and read.reference_start == read.reference_end: - _util.LOG('ignoring invalid read', read.query_name, level=logging.DEBUG) + _util.logger.debug(f'ignoring invalid read {read.query_name}') continue read = SamRead.copy(read) if not filter_if(read): @@ -231,7 +229,7 @@ def fetch_from_bins( if bin_limit is not None and count >= running_surplus: break if not read.is_unmapped and read.reference_start == read.reference_end: - _util.LOG('ignoring invalid read', read.query_name, level=logging.DEBUG) + _util.logger.debug(f'ignoring invalid read {read.query_name}') continue read = SamRead.copy(read) if not filter_if(read): @@ -273,10 +271,10 @@ def get_mate(self, read, primary_only=True, allow_file_access=False): if not allow_file_access or read.mate_is_unmapped: raise KeyError('mate is not found in the cache') else: - warnings.warn( - 'looking for uncached mate of {0}. This requires file access and' + _util.logger.warning( + f'looking for uncached mate of {read.query_name}. This requires file access and' ' requests may be slow. 
This should also not be using in a loop iterating using the file pointer ' - ' as it will change the file pointer position'.format(read.query_name) + ' as it will change the file pointer position' ) m = self.fh.mate(read) m = SamRead.copy(m) diff --git a/src/mavis/bam/stats.py b/src/mavis/bam/stats.py index e161227c..7901c044 100644 --- a/src/mavis/bam/stats.py +++ b/src/mavis/bam/stats.py @@ -1,12 +1,12 @@ #!/projects/tumour_char/analysis_scripts/python/centos06/anaconda3_v2.3.0/bin/python import math +import os import statistics as stats import warnings -import os - -from .read import sequenced_strand from ..constants import STRAND +from ..util import logger +from .read import sequenced_strand os.environ["OMP_NUM_THREADS"] = "4" # export OMP_NUM_THREADS=4 os.environ["OPENBLAS_NUM_THREADS"] = "4" # export OPENBLAS_NUM_THREADS=4 @@ -140,10 +140,8 @@ def compute_transcriptome_bam_stats( randoms = [int(n * len(total_annotations)) for n in np.random.rand(sample_size)] genes = [total_annotations[r] for r in randoms] else: - warnings.warn( - 'insufficient annotations to match requested sample size. requested {}, but only {} annotations'.format( - sample_size, len(total_annotations) - ) + logger.warning( + f'insufficient annotations to match requested sample size. 
requested {sample_size}, but only {len(total_annotations)} annotations' ) fragment_hist = Histogram() diff --git a/src/mavis/blat.py b/src/mavis/blat.py index 6efd0fb2..de95d5cb 100644 --- a/src/mavis/blat.py +++ b/src/mavis/blat.py @@ -9,7 +9,6 @@ -- http://wiki.bits.vib.be/index.php/Blat """ -import logging import math import re @@ -28,7 +27,7 @@ reverse_complement, ) from .interval import Interval -from .util import LOG +from .util import logger class Blat: @@ -202,7 +201,7 @@ def split_csv_trailing_ints(x): ) final_rows.append(row) except AssertionError as err: - LOG(type(err), ':', str(err), level=logging.DEBUG) + logger.debug(f'{type(err)}:{err}') return header, final_rows @staticmethod @@ -391,11 +390,9 @@ def process_blat_output( try: read = Blat.pslx_row_to_pysam(row, input_bam_cache, reference_genome) except KeyError as err: - LOG( - 'warning: reference template name not recognized', str(err), level=logging.DEBUG - ) + logger.debug(f'warning: reference template name not recognized: {err}') except AssertionError as err: - LOG('warning: invalid blat alignment', repr(err), level=logging.DEBUG) + logger.warning(f'invalid blat alignment {repr(err)}') else: reads.append((row, read)) diff --git a/src/mavis/cluster/cluster.py b/src/mavis/cluster/cluster.py index 71fa39bd..36941d57 100644 --- a/src/mavis/cluster/cluster.py +++ b/src/mavis/cluster/cluster.py @@ -1,13 +1,13 @@ from __future__ import division +import itertools from collections import namedtuple from copy import copy -import itertools from ..breakpoint import Breakpoint, BreakpointPair from ..constants import ORIENT, STRAND from ..interval import Interval -from ..util import LOG +from ..util import logger class BreakpointPairGroupKey( @@ -285,7 +285,7 @@ def pair_center_distance(pair1, pair2): for group_key in sorted(set(list(groups) + list(phase2_groups))): count = len(groups.get(group_key, [])) + len(phase2_groups.get(group_key, [])) if verbose: - LOG(group_key, 'pairs:', count) + 
logger.info(f'{group_key} pairs: {count}') nodes = merge_by_union( groups.get(group_key, []), group_key, @@ -375,7 +375,7 @@ def pair_center_distance(pair1, pair2): ) nodes.setdefault(new_bpp, []).append(pair) if verbose: - LOG('merged', count, 'down to', len(nodes)) + logger.info(f'merged {count} down to {len(nodes)}') for node, pairs in nodes.items(): if node in mapping: raise KeyError('duplicate merge node', str(node), node, pair_key(node)) diff --git a/src/mavis/cluster/main.py b/src/mavis/cluster/main.py index ae07c755..aec884ba 100644 --- a/src/mavis/cluster/main.py +++ b/src/mavis/cluster/main.py @@ -9,10 +9,10 @@ from ..breakpoint import BreakpointPair from ..constants import COLUMNS, SUBCOMMAND from ..util import ( - LOG, filter_on_overlap, filter_uninformative, generate_complete_stamp, + logger, mkdirp, output_tabbed_file, read_inputs, @@ -27,12 +27,11 @@ def split_clusters( clusters: List[BreakpointPair], outputdir: str, total_batches: int, - min_clusters_per_file: int = 0, write_bed_summary: bool = True, ): """ For a set of clusters creates a bed file representation of all clusters. 
- Also splits the clusters evenly into multiple files based on the user parameters (min_clusters_per_file, max_files) + Also splits the clusters evenly into multiple files based on the user parameters (max_files) Returns: list: of output file names (not including the bed file) @@ -119,7 +118,7 @@ def main( other_chr = set() unfiltered_breakpoint_pairs = [] filtered_pairs = [] - LOG('filtering by library and chr name') + logger.info('filtering by library and chr name') for bpp in breakpoint_pairs: if bpp.library is None: bpp.library = library @@ -140,41 +139,38 @@ def main( other_chr -= set(config[f'{SECTION}.limit_to_chr']) breakpoint_pairs = unfiltered_breakpoint_pairs if other_libs: - LOG( - 'warning: ignoring breakpoints found for other libraries:', - sorted([lib for lib in other_libs]), + logger.info( + f'warning: ignoring breakpoints found for other libraries: {sorted([lib for lib in other_libs])}', ) if other_chr: - LOG('warning: filtered events on chromosomes', other_chr) + logger.info(f'warning: filtered events on chromosomes {other_chr}') # filter by masking file breakpoint_pairs, masked_pairs = filter_on_overlap(breakpoint_pairs, masking.content) for bpp in masked_pairs: filtered_pairs.append(bpp) # filter by informative if config[f'{SECTION}.uninformative_filter']: - LOG('filtering from', len(breakpoint_pairs), 'breakpoint pairs using informative filter') + logger.info( + f'filtering from {len(breakpoint_pairs)} breakpoint pairs using informative filter' + ) pass_clusters, uninformative_clusters = filter_uninformative( annotations.content, breakpoint_pairs, max_proximity=config[f'{SECTION}.max_proximity'] ) - LOG( - 'filtered from', - len(breakpoint_pairs), - 'down to', - len(pass_clusters), - '(removed {})'.format(len(uninformative_clusters)), + logger.info( + f'filtered from {len(breakpoint_pairs)} down to {len(pass_clusters)} (removed {uninformative_clusters})' ) breakpoint_pairs = pass_clusters for bpp in uninformative_clusters: 
bpp.data[COLUMNS.filter_comment] = 'Uninformative' filtered_pairs.append(bpp) else: - LOG('did not apply uninformative filter') + logger.info('did not apply uninformative filter') mkdirp(output) output_tabbed_file(filtered_pairs, filtered_output) if not config[f'{SECTION}.split_only']: - LOG('computing clusters') + logger.info('computing clusters') clusters = merge_breakpoint_pairs( breakpoint_pairs, cluster_radius=config[f'{SECTION}.cluster_radius'], @@ -211,9 +207,9 @@ def main( common_data = set(common_data) if len(common_data) == 1: cluster.data[item] = list(common_data)[0] - LOG('computed', len(clusters), 'clusters', time_stamp=False) - LOG('cluster input pairs distribution', sorted(hist.items()), time_stamp=False) - LOG('cluster intervals lengths', sorted(length_hist.items()), time_stamp=False) + logger.info(f'computed {len(clusters)} clusters') + logger.info(f'cluster input pairs distribution {sorted(hist.items())}') + logger.info(f'cluster intervals lengths {sorted(length_hist.items())}') # map input pairs to cluster ids # now create the mapping from the original input files to the cluster(s) @@ -234,9 +230,8 @@ def main( breakpoint_pairs, output, total_batches=lib_config['total_batches'], - min_clusters_per_file=config[f'{SECTION}.min_clusters_per_file'], write_bed_summary=True, ) - generate_complete_stamp(output, LOG, start_time=start_time) + generate_complete_stamp(output, start_time=start_time) return output_files diff --git a/src/mavis/config.py b/src/mavis/config.py index a0b4341a..f043263e 100644 --- a/src/mavis/config.py +++ b/src/mavis/config.py @@ -1,17 +1,11 @@ import argparse -import os from copy import copy as _copy -from typing import Dict, Optional - -import snakemake -from mavis_config import bash_expands -from snakemake.exceptions import WorkflowError -from snakemake.utils import validate as snakemake_validate +from typing import Dict from .annotate.file_io import ReferenceFile from .bam import stats from .bam.cache import BamCache -from 
.constants import INTEGER_COLUMNS, PROTOCOL, SUBCOMMAND, float_fraction +from .constants import PROTOCOL, float_fraction from .util import cast_boolean, filepath diff --git a/src/mavis/constants.py b/src/mavis/constants.py index fc27c03d..35a4731b 100644 --- a/src/mavis/constants.py +++ b/src/mavis/constants.py @@ -2,7 +2,6 @@ module responsible for small utility functions and constants used throughout the structural_variant package """ import argparse -import os import re from typing import List @@ -10,7 +9,7 @@ from Bio.Alphabet.IUPAC import ambiguous_dna from Bio.Data.IUPACData import ambiguous_dna_values from Bio.Seq import Seq -from mavis_config.constants import SUBCOMMAND, MavisNamespace +from mavis_config.constants import MavisNamespace PROGNAME: str = 'mavis' EXIT_OK: int = 0 diff --git a/src/mavis/illustrate/diagram.py b/src/mavis/illustrate/diagram.py index c9f757fd..f9e6fabc 100644 --- a/src/mavis/illustrate/diagram.py +++ b/src/mavis/illustrate/diagram.py @@ -4,13 +4,11 @@ """ from svgwrite import Drawing -from .elements import draw_exon_track, draw_genes, draw_template, draw_ustranscript, draw_vmarker -from .scatter import draw_scatter -from .util import generate_interval_mapping, LabelMapping - from ..annotate.genomic import IntergenicRegion from ..interval import Interval -from ..util import DEVNULL +from .elements import draw_exon_track, draw_genes, draw_template, draw_ustranscript, draw_vmarker +from .scatter import draw_scatter +from .util import LabelMapping, generate_interval_mapping # draw gene level view # draw gene box @@ -382,9 +380,7 @@ def draw_sv_summary_diagram( return canvas, legend -def draw_multi_transcript_overlay( - config, gene, vmarkers=None, window_buffer=0, plots=None, log=DEVNULL -): +def draw_multi_transcript_overlay(config, gene, vmarkers=None, window_buffer=0, plots=None): vmarkers = [] if vmarkers is None else vmarkers plots = [] if plots is None else plots @@ -434,7 +430,7 @@ def draw_multi_transcript_overlay( for plot in 
plots: if plot.points: - plot_group = draw_scatter(config, canvas, plot, mapping, log=log) + plot_group = draw_scatter(config, canvas, plot, mapping) main_group.add(plot_group) plot_group.translate(x, y) y += plot.height + config.padding * 2 diff --git a/src/mavis/illustrate/scatter.py b/src/mavis/illustrate/scatter.py index 976a6a85..fd78a66f 100644 --- a/src/mavis/illustrate/scatter.py +++ b/src/mavis/illustrate/scatter.py @@ -2,7 +2,7 @@ from ..bam.read import pileup, sequenced_strand from ..interval import Interval -from ..util import DEVNULL, LOG +from ..util import logger def bam_to_scatter( @@ -40,7 +40,7 @@ def bam_to_scatter( if not axis_name: axis_name = os.path.basename(bam_file) # one plot per bam - LOG('reading:', bam_file) + logger.info(f'reading: {bam_file}') plot = None samfile = pysam.AlignmentFile(bam_file, 'rb') @@ -63,7 +63,7 @@ def read_filter(read): except ValueError: # chrom not in bam pass - LOG('scatter plot {} has {} points'.format(axis_name, len(points))) + logger.info(f'scatter plot {axis_name} has {len(points)} points') plot = ScatterPlot( points, axis_name, @@ -123,7 +123,7 @@ def __init__( self.density = density -def draw_scatter(ds, canvas, plot, xmapping, log=DEVNULL): +def draw_scatter(ds, canvas, plot, xmapping): """ given a xmapping, draw the scatter plot svg group @@ -163,10 +163,7 @@ def draw_scatter(ds, canvas, plot, xmapping, log=DEVNULL): ) except IndexError: pass - log( - 'drew {} of {} points (density={})'.format(len(circles), len(plot.points), plot.density), - time_stamp=False, - ) + logger.info(f'drew {len(circles)} of {len(plot.points)} points (density={plot.density})') for x_px, y_px, color in px_points: if x_px.length() > ds.scatter_marker_radius: diff --git a/src/mavis/interval.py b/src/mavis/interval.py index c78e3aa8..3c73caf9 100644 --- a/src/mavis/interval.py +++ b/src/mavis/interval.py @@ -1,4 +1,4 @@ -from typing import Callable, Optional +from typing import Optional class Interval: diff --git 
a/src/mavis/main.py b/src/mavis/main.py index ea2f93aa..d417d7d5 100644 --- a/src/mavis/main.py +++ b/src/mavis/main.py @@ -31,7 +31,6 @@ def convert_main(inputs, outputfile, file_type, strand_specific=False, assume_no inputs, file_type, strand_specific, - _util.LOG, True, assume_no_untemplated=assume_no_untemplated, ) @@ -186,8 +185,8 @@ def main(argv=None): log_conf['filename'] = args.log logging.basicConfig(**log_conf) - _util.LOG('MAVIS: {}'.format(__version__)) - _util.LOG('hostname:', platform.node(), time_stamp=False) + _util.logger.info(f'MAVIS: {__version__}') + _util.logger.info(f'hostname: {platform.node()}') _util.log_arguments(args) config: Dict = dict() @@ -267,7 +266,7 @@ def main(argv=None): # add bam stats to the config if missing if not config.get('skip_stage.validate'): _config.add_bamstats_to_config(config) - _util.LOG(f'writing: {args.outputfile}') + _util.logger.info(f'writing: {args.outputfile}') with open(args.outputfile, 'w') as fh: fh.write(json.dumps(config, sort_keys=True, indent=' ')) else: @@ -284,11 +283,10 @@ def main(argv=None): hours = duration - duration % 3600 minutes = duration - hours - (duration - hours) % 60 seconds = duration - hours - minutes - _util.LOG( - 'run time (hh/mm/ss): {}:{:02d}:{:02d}'.format(hours // 3600, minutes // 60, seconds), - time_stamp=False, + _util.logger.info( + 'run time (hh/mm/ss): {}:{:02d}:{:02d}'.format(hours // 3600, minutes // 60, seconds) ) - _util.LOG('run time (s): {}'.format(duration), time_stamp=False) + _util.logger.info(f'run time (s): {duration}') except Exception as err: raise err finally: diff --git a/src/mavis/overlay.py b/src/mavis/overlay.py index 9543ff84..fccd5e34 100644 --- a/src/mavis/overlay.py +++ b/src/mavis/overlay.py @@ -92,10 +92,8 @@ def main( for gene in annotations.content[chrom]: if gene_name in gene.aliases or gene_name == gene.name: gene_to_draw = gene - _util.LOG( - 'Found target gene: {}(aka. 
{}) {}:{}-{}'.format( - gene.name, gene.aliases, gene.chr, gene.start, gene.end - ) + _util.logger.info( + f'Found target gene: {gene.name}(aka. {gene.aliases}) {gene.chr}:{gene.start}-{gene.end}' ) break if gene_to_draw is None: @@ -141,17 +139,16 @@ def main( vmarkers=vmarkers, plots=plots, window_buffer=buffer_length, - log=_util.LOG, ) break except DrawingFitError as err: if attempts > max_drawing_retries: raise err - _util.LOG('Drawing fit: extending window', drawing_width_iter_increase) + _util.logger.info(f'Drawing fit: extending window {drawing_width_iter_increase}') settings.width += drawing_width_iter_increase attempts += 1 svg_output_file = os.path.join(output, '{}_{}_overlay.svg'.format(gene_to_draw.name, gene_name)) - _util.LOG('writing:', svg_output_file) + _util.logger.info(f'writing: {svg_output_file}') canvas.saveas(svg_output_file) diff --git a/src/mavis/pairing/main.py b/src/mavis/pairing/main.py index 850d0173..8b92586c 100644 --- a/src/mavis/pairing/main.py +++ b/src/mavis/pairing/main.py @@ -8,7 +8,7 @@ from ..annotate.file_io import ReferenceFile from ..breakpoint import BreakpointPair from ..constants import CALL_METHOD, COLUMNS, PROTOCOL, SPLICE_TYPE, SVTYPE -from ..util import LOG, generate_complete_stamp, output_tabbed_file, read_inputs +from ..util import generate_complete_stamp, logger, output_tabbed_file, read_inputs from .pairing import inferred_equivalent, pair_by_distance, product_key @@ -55,7 +55,7 @@ def main( expand_svtype=False, ) ) - LOG('read {} breakpoint pairs'.format(len(bpps))) + logger.info(f'read {len(bpps)} breakpoint pairs') # load all transcripts reference_transcripts = dict() @@ -99,20 +99,16 @@ def main( distance_pairings: Dict[str, Set[str]] = {} product_pairings: Dict[str, Set[str]] = {} - LOG('computing distance based pairings') + logger.info('computing distance based pairings') # pairwise comparison of breakpoints between all libraries for set_num, (category, calls) in enumerate( sorted(calls_by_cat.items(), 
key=lambda x: (len(x[1]), x[0]), reverse=True) ): - LOG( - 'comparing set {} of {} with {} items'.format( - set_num + 1, len(calls_by_cat), len(calls) - ) - ) + logger.info(f'comparing set {set_num + 1} of {len(calls_by_cat)} with {len(calls)} items') for node, adj_list in pair_by_distance(calls, distances, against_self=False).items(): distance_pairings.setdefault(node, set()).update(adj_list) - LOG('computing inferred (by product) pairings') + logger.info('computing inferred (by product) pairings') for calls in calls_by_ann.values(): calls_by_lib: Dict[str, List[BreakpointPair]] = {} for call in calls: @@ -140,4 +136,4 @@ def main( fname = os.path.join(output, 'mavis_paired.tab') output_tabbed_file(bpps, fname) - generate_complete_stamp(output, LOG) + generate_complete_stamp(output) diff --git a/src/mavis/pairing/pairing.py b/src/mavis/pairing/pairing.py index f1fb52b5..9d5d87f2 100644 --- a/src/mavis/pairing/pairing.py +++ b/src/mavis/pairing/pairing.py @@ -1,11 +1,11 @@ -from typing import Callable, Dict, List, Optional, Set +from typing import Dict, List, Optional, Set from ..annotate.variant import determine_prime from ..breakpoint import Breakpoint, BreakpointPair from ..constants import CALL_METHOD, COLUMNS, ORIENT, PRIME, PROTOCOL, STRAND from ..error import NotSpecifiedError from ..interval import Interval -from ..util import DEVNULL +from ..util import logger from .constants import PAIRING_DISTANCES @@ -183,7 +183,7 @@ def equivalent(event1: BreakpointPair, event2: BreakpointPair, distances=None) - def pair_by_distance( - calls: List[BreakpointPair], distances, log: Callable = DEVNULL, against_self: bool = False + calls: List[BreakpointPair], distances, against_self: bool = False ) -> Dict[str, Set[str]]: """ for a set of input calls, pair by distance @@ -195,14 +195,8 @@ def pair_by_distance( max_distance = max(distances.values()) max_useq = max([len(c.untemplated_seq) if c.untemplated_seq else 0 for c in calls] + [0]) max_distance += max_useq * 2 - log( 
- 'lowest_resolution', - lowest_resolution, - 'max_distance', - max_distance, - 'possible comparisons', - len(break1_sorted) * len(break1_sorted), - time_stamp=False, + logger.debug( + f'lowest_resolution:{lowest_resolution} max_distance:{max_distance} possible comparisons:{len(break1_sorted) * len(break1_sorted)}' ) comparisons = 0 @@ -241,7 +235,7 @@ def pair_by_distance( if equivalent(current, other, distances=distances): distance_pairings.setdefault(product_key(current), set()).add(product_key(other)) distance_pairings.setdefault(product_key(other), set()).add(product_key(current)) - log('computed {} comparisons'.format(comparisons), time_stamp=False) + logger.debug(f'computed {comparisons} comparisons') return distance_pairings diff --git a/src/mavis/summary/main.py b/src/mavis/summary/main.py index 94f79ddb..309da1b8 100644 --- a/src/mavis/summary/main.py +++ b/src/mavis/summary/main.py @@ -9,7 +9,7 @@ from ..annotate.file_io import ReferenceFile from ..breakpoint import BreakpointPair from ..constants import CALL_METHOD, COLUMNS, PROTOCOL, SPLICE_TYPE, SVTYPE -from ..util import LOG, generate_complete_stamp, output_tabbed_file, read_inputs, soft_cast +from ..util import generate_complete_stamp, logger, output_tabbed_file, read_inputs, soft_cast from .constants import HOMOPOLYMER_MIN_LENGTH from .summary import ( annotate_dgv, @@ -282,12 +282,12 @@ def main(inputs: List[str], output: str, config: Dict, start_time=int(time.time( rows = [] for lib in bpps_by_library: - LOG('annotating dgv for', lib) + logger.info(f'annotating dgv for {lib}') if not dgv_annotation.is_empty(): annotate_dgv( bpps_by_library[lib], dgv_annotation.content, distance=10 ) # TODO make distance a parameter - LOG('adding pairing states for', lib) + logger.info(f'adding pairing states for {lib}') for row in bpps_by_library[lib]: # in case no pairing was done, add default (applicable to single library summaries) row.data.setdefault(COLUMNS.inferred_pairing, '') @@ -312,7 +312,7 @@ def 
main(inputs: List[str], output: str, config: Dict, start_time=int(time.time( other_protocol=other_protocol, other_disease_state=other_disease_state, is_matched=other_lib in paired_libraries, - inferred_is_matched=other_lib in inferred_paired_libraries + inferred_is_matched=other_lib in inferred_paired_libraries, ) else: pairing_state = 'Not Applicable' @@ -324,7 +324,7 @@ def main(inputs: List[str], output: str, config: Dict, start_time=int(time.time( output, 'mavis_summary_all_{}.tab'.format('_'.join(sorted(list(libraries.keys())))) ) output_tabbed_file(rows, fname, header=output_columns) - LOG('wrote {} structural variants to {}'.format(len(rows), fname)) + logger.info(f'wrote {len(rows)} structural variants to {fname}') output_tabbed_file(filtered_pairs, os.path.join(output, 'filtered_pairs.tab')) # output by library non-synon protein-product for lib in bpps_by_library: @@ -344,4 +344,4 @@ def main(inputs: List[str], output: str, config: Dict, start_time=int(time.time( ): lib_rows.append(row) output_tabbed_file(lib_rows, filename, header=output_columns) - generate_complete_stamp(output, LOG) + generate_complete_stamp(output) diff --git a/src/mavis/tools/__init__.py b/src/mavis/tools/__init__.py index 8dfc2db0..a1b814bd 100644 --- a/src/mavis/tools/__init__.py +++ b/src/mavis/tools/__init__.py @@ -7,7 +7,7 @@ from ..breakpoint import Breakpoint, BreakpointPair from ..constants import COLUMNS, ORIENT, STRAND, SVTYPE from ..error import InvalidRearrangement -from ..util import DEVNULL, read_bpp_from_input_file +from ..util import logger, read_bpp_from_input_file from .breakdancer import convert_file as _convert_breakdancer_file from .chimerascan import convert_row as _parse_chimerascan from .cnvnator import convert_row as _parse_cnvnator @@ -21,7 +21,6 @@ def convert_tool_output( fnames: List[str], file_type: str = SUPPORTED_TOOL.MAVIS, stranded: bool = False, - log: Callable = DEVNULL, collapse: bool = True, assume_no_untemplated: bool = True, ) -> 
List[BreakpointPair]: @@ -32,14 +31,14 @@ def convert_tool_output( for fname in fnames: result.extend( _convert_tool_output( - fname, file_type, stranded, log, assume_no_untemplated=assume_no_untemplated + fname, file_type, stranded, assume_no_untemplated=assume_no_untemplated ) ) if collapse: collapse_mapping: Dict[BreakpointPair, List[BreakpointPair]] = {} for bpp in result: collapse_mapping.setdefault(bpp, []).append(bpp) - log('collapsed', len(result), 'to', len(collapse_mapping), 'calls') + logger.debug(f'collapsed {len(result)} to {len(collapse_mapping)} calls') result = [] temp_sets = set() for bpp, bpp_list in collapse_mapping.items(): @@ -217,7 +216,7 @@ def _convert_tool_row( untemplated_seq=untemplated_seq, event_type=event_type, stranded=stranded, - **{COLUMNS.tools: file_type, COLUMNS.tracking_id: std_row[COLUMNS.tracking_id]} + **{COLUMNS.tools: file_type, COLUMNS.tracking_id: std_row[COLUMNS.tracking_id]}, ) for col, value in std_row.items(): @@ -242,10 +241,9 @@ def _convert_tool_output( input_file: str, file_type: str = SUPPORTED_TOOL.MAVIS, stranded: bool = False, - log: Callable = DEVNULL, assume_no_untemplated: bool = True, ) -> List[BreakpointPair]: - log('reading:', input_file) + logger.info(f'reading: {input_file}') result = [] rows = None if file_type == SUPPORTED_TOOL.MAVIS: @@ -288,7 +286,7 @@ def _convert_tool_output( SUPPORTED_TOOL.BREAKSEQ, SUPPORTED_TOOL.STRELKA, ]: - rows = read_vcf(input_file, file_type, log) + rows = read_vcf(input_file) elif file_type == SUPPORTED_TOOL.BREAKDANCER: rows = _convert_breakdancer_file(input_file) else: @@ -296,16 +294,16 @@ def _convert_tool_output( df.columns = [c[1:] if c.startswith('#') else c for c in df.columns] rows = df.where(df.notnull(), None).to_dict('records') if rows: - log('found', len(rows), 'rows') + logger.info('found', len(rows), 'rows') for row in rows: try: std_rows = _convert_tool_row( row, file_type, stranded, assume_no_untemplated=assume_no_untemplated ) except Exception as err: - 
log('Error in converting row', row) + logger.error(f'Error in converting row {row}') raise err else: result.extend(std_rows) - log('generated', len(result), 'breakpoint pairs') + logger.info(f'generated {len(result)} breakpoint pairs') return result diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index eea0fadf..7717d743 100644 --- a/src/mavis/tools/vcf.py +++ b/src/mavis/tools/vcf.py @@ -12,8 +12,7 @@ from typing_extensions import TypedDict from ..constants import COLUMNS, ORIENT, SVTYPE -from ..util import DEVNULL -from .constants import SUPPORTED_TOOL +from ..util import logger PANDAS_DEFAULT_NA_VALUES = [ '-1.#IND', @@ -122,7 +121,7 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) -def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: +def convert_record(record, record_mapping={}) -> List[Dict]: """ converts a vcf record @@ -143,7 +142,7 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: try: value = record.info[key] except UnicodeDecodeError as err: - log('Ignoring invalid INFO field {} with error: {}'.format(key, err)) + logger.warning(f'Ignoring invalid INFO field {key} with error: {err}') else: try: value = value[0] if len(value) == 1 else value @@ -297,12 +296,11 @@ def pandas_vcf(input_file) -> Tuple[List[str], pd.DataFrame]: return header_lines, df -def convert_file(input_file: str, file_type: str, log): +def convert_file(input_file: str): """process a VCF file Args: input_file: the input file name - file_type: the input type Raises: err: [description] @@ -313,7 +311,7 @@ def convert_file(input_file: str, file_type: str, log): for variant_record in convert_pandas_rows_to_variants(data): try: - rows.extend(convert_record(variant_record, log=log)) + rows.extend(convert_record(variant_record)) except NotImplementedError as err: logging.warning(str(err)) return rows diff --git a/src/mavis/util.py 
b/src/mavis/util.py index 446520d3..cfb269da 100644 --- a/src/mavis/util.py +++ b/src/mavis/util.py @@ -3,13 +3,8 @@ import logging import os import re -import sys import time -from argparse import Namespace -from datetime import datetime -from functools import partial -from glob import glob -from typing import Any, Callable, Dict, List, Optional, Set +from typing import Any, Callable, Dict, List, Set import pandas as pd from mavis_config import bash_expands @@ -25,7 +20,6 @@ STRAND, SUMMARY_LIST_COLUMNS, SVTYPE, - MavisNamespace, sort_columns, ) from .error import InvalidRearrangement @@ -33,43 +27,7 @@ ENV_VAR_PREFIX = 'MAVIS_' - -class Log: - """ - wrapper aroung the builtin logging to make it more readable - """ - - def __init__(self, indent_str=' ', indent_level=0, level=logging.INFO): - self.indent_str = indent_str - self.indent_level = indent_level - self.level = level - - def __call__(self, *pos, time_stamp=False, level=None, indent_level=0, **kwargs): - if level is None and self.level is None: - return - elif self.level is not None: - level = self.level - - stamp = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]') if time_stamp else ' ' * 21 - indent_prefix = self.indent_str * (self.indent_level + indent_level) - message = '{} {}{}'.format(stamp, indent_prefix, ' '.join([str(p) for p in pos])) - logging.log(level, message, **kwargs) - - def indent(self): - return Log(self.indent_str, self.indent_level + 1, self.level) - - def dedent(self): - return Log(self.indent_str, max(0, self.indent_level - 1), self.level) - - def __enter__(self): - return self - - def __exit__(self, *pos): - pass - - -LOG = Log() -DEVNULL = Log(level=None) +logger = logging.getLogger('mavis') def filepath(path): @@ -151,30 +109,30 @@ def log_arguments(args): Args: args (Namespace): the namespace to print arguments for """ - LOG('arguments', time_stamp=True) - with LOG.indent() as log: - for arg, val in sorted(args.__dict__.items()): - if isinstance(val, list): - if len(val) <= 1: - 
log(arg, '= {}'.format(val)) - continue - log(arg, '= [') - for v in val: - log(repr(v), indent_level=1) - log(']') - elif ( - any([isinstance(val, typ) for typ in [str, int, float, bool, tuple]]) or val is None - ): - log(arg, '=', repr(val)) - else: - log(arg, '=', object.__repr__(val)) + logger.info('arguments') + + indent = ' ' + + for arg, val in sorted(args.__dict__.items()): + if isinstance(val, list): + if len(val) <= 1: + logger.info(f'{indent}{arg} = {val}') + continue + logger.info(f'{indent}{arg} = [') + for v in val: + logger.info(f'{indent * 2}{repr(v)}') + logger.info(f'{indent}]') + elif any([isinstance(val, typ) for typ in [str, int, float, bool, tuple]]) or val is None: + logger.info(f'{indent}{arg}= {repr(val)}') + else: + logger.info(f'{arg} = {object.__repr__(val)}') def mkdirp(dirname): """ Make a directory or path of directories. Suppresses the error that is normally raised when the directory already exists """ - LOG("creating output directory: '{}'".format(dirname)) + logger.info(f"creating output directory: '{dirname}'") try: os.makedirs(dirname) except OSError as exc: # Python >2.5: http://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python @@ -193,7 +151,7 @@ def filter_on_overlap(bpps, regions_by_reference_name): bpps (List[mavis.breakpoint.BreakpointPair]): list of breakpoint pairs to be filtered regions_by_reference_name (Dict[str,List[mavis.annotate.base.BioInterval]]): regions to filter against """ - LOG('filtering from', len(bpps), 'using overlaps with regions filter') + logger.info(f'filtering from {len(bpps)} using overlaps with regions filter') failed = [] passed = [] for bpp in bpps: @@ -213,7 +171,7 @@ def filter_on_overlap(bpps, regions_by_reference_name): failed.append(bpp) else: passed.append(bpp) - LOG('filtered from', len(bpps), 'down to', len(passed), '(removed {})'.format(len(failed))) + logger.info(f'filtered from {len(bpps)} down to {len(passed)} (removed {len(failed)})') return passed, failed @@ -221,13 
+179,13 @@ def read_inputs(inputs, required_columns=[], **kwargs): bpps = [] for finput in bash_expands(*inputs): - LOG('loading:', finput) + logger.info('loading: {finput}') bpps.extend( read_bpp_from_input_file( finput, required_columns=[COLUMNS.protocol, *required_columns], **kwargs ) ) - LOG('loaded', len(bpps), 'breakpoint pairs') + logger.info(f'loaded {len(bpps)} breakpoint pairs') return bpps @@ -245,14 +203,14 @@ def output_tabbed_file(bpps: List[BreakpointPair], filename: str, header=None): if not custom_header: header.update(row.keys()) # type: ignore header = sort_columns(header) - LOG('writing:', filename) + logger.info(f'writing: {filename}') df = pd.DataFrame.from_records(rows, columns=header) df = df.fillna('None') df.to_csv(filename, columns=header, index=False, sep='\t') def write_bed_file(filename, bed_rows): - LOG('writing:', filename) + logger.info(f'writing: {filename}') with open(filename, 'w') as fh: for bed in bed_rows: fh.write('\t'.join([str(c) for c in bed]) + '\n') @@ -279,7 +237,7 @@ def get_connected_components(adj_matrix): return components -def generate_complete_stamp(output_dir, log=DEVNULL, prefix='MAVIS.', start_time=None): +def generate_complete_stamp(output_dir, prefix='MAVIS.', start_time=None): """ writes a complete stamp, optionally including the run time if start_time is given @@ -297,7 +255,7 @@ def generate_complete_stamp(output_dir, log=DEVNULL, prefix='MAVIS.', start_time 'some_output_dir/MAVIS.COMPLETE' """ stamp = os.path.join(output_dir, str(prefix) + 'COMPLETE') - log('complete:', stamp) + logger.info('complete: {stamp}') with open(stamp, 'w') as fh: if start_time is not None: duration = int(time.time()) - start_time diff --git a/src/mavis/validate/base.py b/src/mavis/validate/base.py index 1225d69c..6866930b 100644 --- a/src/mavis/validate/base.py +++ b/src/mavis/validate/base.py @@ -24,7 +24,7 @@ ) from ..error import NotSpecifiedError from ..interval import Interval -from ..util import DEVNULL +from ..util import 
logger class Evidence(BreakpointPair): @@ -836,7 +836,7 @@ def decide_sequenced_strand(self, reads: Set[pysam.AlignedSegment]): strand_calls, ) - def assemble_contig(self, log=DEVNULL): + def assemble_contig(self): """ uses the split reads and the partners of the half mapped reads to create a contig representing the sequence across the breakpoints @@ -875,7 +875,7 @@ def assemble_contig(self, log=DEVNULL): rqs_comp = reverse_complement(mate.query_sequence) assembly_sequences.setdefault(rqs_comp, set()).add(mate) - log('assembly size of {} sequences'.format(len(assembly_sequences) // 2)) + logger.info(f'assembly size of {len(assembly_sequences) // 2} sequences') kmer_size = self.read_length * self.config['validate.assembly_kmer_size'] remap_min_overlap = max( @@ -888,7 +888,6 @@ def assemble_contig(self, log=DEVNULL): min_edge_trim_weight=self.config['validate.assembly_min_edge_trim_weight'], assembly_max_paths=self.config['validate.assembly_max_paths'], min_contig_length=self.read_length, - log=log, remap_min_overlap=remap_min_overlap, remap_min_exact_match=self.config['validate.assembly_min_exact_match_to_remap'], assembly_min_uniq=self.config['validate.assembly_min_uniq'], @@ -919,11 +918,8 @@ def assemble_contig(self, log=DEVNULL): not self.interchromosomal and len(self.break1 | self.break2) < self.read_length ): filtered_contigs.append(ctg) - log( - 'filtered contigs from {} to {} based on remapped reads from both breakpoints'.format( - len(contigs), len(filtered_contigs) - ), - time_stamp=False, + logger.info( + f'filtered contigs from {len(contigs)} to {len(filtered_contigs)} based on remapped reads from both breakpoints' ) contigs = filtered_contigs @@ -985,7 +981,7 @@ def assemble_contig(self, log=DEVNULL): list(filtered_contigs.values()), key=lambda x: (x.remap_score() * -1, x.seq) ) - def load_evidence(self, log=DEVNULL): + def load_evidence(self): """ open the associated bam file and read and store the evidence does some preliminary read-quality 
filtering @@ -1117,7 +1113,7 @@ def filter_if_true(read): mates = self.bam_cache.get_mate(flanking_read, allow_file_access=False) for mate in mates: if mate.is_unmapped: - log('ignoring unmapped mate', mate.query_name, level=logging.DEBUG) + logger.debug(f'ignoring unmapped mate {mate.query_name}') continue self.collect_flanking_pair(flanking_read, mate) except KeyError: @@ -1163,7 +1159,7 @@ def filter_if_true(read): mates = self.bam_cache.get_mate(flanking_read, allow_file_access=False) for mate in mates: if mate.is_unmapped: - log('ignoring unmapped mate', mate.query_name, level=logging.DEBUG) + logger.debug(f'ignoring unmapped mate {mate.query_name}') continue try: self.collect_compatible_flanking_pair( @@ -1175,11 +1171,8 @@ def filter_if_true(read): pass # now collect the half mapped reads - log( - 'collected', - len(half_mapped_partners1 | half_mapped_partners2), - 'putative half mapped reads', - time_stamp=False, + logger.info( + f'collected {len(half_mapped_partners1 | half_mapped_partners2)} putative half mapped reads', ) mates_found = 0 for read in half_mapped_partners1 | half_mapped_partners2: @@ -1191,7 +1184,7 @@ def filter_if_true(read): self.collect_half_mapped(read, mate) except KeyError: pass - log(mates_found, 'half-mapped mates found') + logger.info(f'{mates_found} half-mapped mates found') def copy(self): raise NotImplementedError('not appropriate for copy of evidence') diff --git a/src/mavis/validate/main.py b/src/mavis/validate/main.py index 4bec5fcd..ab8a48df 100644 --- a/src/mavis/validate/main.py +++ b/src/mavis/validate/main.py @@ -17,9 +17,9 @@ from ..breakpoint import BreakpointPair from ..constants import CALL_METHOD, COLUMNS, PROTOCOL from ..util import ( - LOG, filter_on_overlap, generate_complete_stamp, + logger, mkdirp, output_tabbed_file, read_inputs, @@ -109,13 +109,11 @@ def main( read_length=config['libraries'][library]['read_length'], median_fragment_size=config['libraries'][library]['median_fragment_size'], config=config, - 
**bpp.data + **bpp.data, ) evidence_clusters.append(evidence) except ValueError as err: - warnings.warn( - 'Dropping breakpoint pair ({}) as bad input {}'.format(str(bpp), str(err)) - ) + logger.warning(f'Dropping breakpoint pair ({bpp}) as bad input {err}') elif bpp.data[COLUMNS.protocol] == PROTOCOL.TRANS: try: evidence = TranscriptomeEvidence( @@ -132,11 +130,11 @@ def main( median_fragment_size=config['libraries'][library]['median_fragment_size'], strand_determining_read=config['libraries'][library]['strand_determining_read'], config=config, - **bpp.data + **bpp.data, ) evidence_clusters.append(evidence) except ValueError as err: - warnings.warn('Dropping ({}) as bad input {}'.format(str(bpp), str(err))) + logger.warning(f'Dropping ({bpp}) as bad input {err}') else: raise ValueError('protocol error', bpp.data[COLUMNS.protocol]) @@ -158,66 +156,40 @@ def main( ) contig_sequences = {} for i, evidence in enumerate(evidence_clusters): - LOG() - LOG( - '({} of {})'.format(i + 1, len(evidence_clusters)), - 'gathered evidence for:', - evidence.cluster_id, - '' - if COLUMNS.tracking_id not in evidence.data - else '(tracking_id: {})'.format(evidence.tracking_id), - time_stamp=True, - ) - LOG(evidence, time_stamp=False) - LOG('possible event type(s):', BreakpointPair.classify(evidence), time_stamp=False) - LOG( - 'outer window regions: {}:{}-{} {}:{}-{}'.format( - evidence.break1.chr, - evidence.outer_window1[0], - evidence.outer_window1[1], - evidence.break2.chr, - evidence.outer_window2[0], - evidence.outer_window2[1], + logger.info( + f'({i + 1} of {len(evidence_clusters)}) gathered evidence for: {evidence.cluster_id}' + + ( + '' + if COLUMNS.tracking_id not in evidence.data + else f' (tracking_id: {evidence.tracking_id})' ), - time_stamp=False, ) - LOG( - 'inner window regions: {}:{}-{} {}:{}-{}'.format( - evidence.break1.chr, - evidence.inner_window1[0], - evidence.inner_window1[1], - evidence.break2.chr, - evidence.inner_window2[0], - evidence.inner_window2[1], - 
), - time_stamp=False, + logger.info(repr(evidence)) + logger.info(f'possible event type(s): {BreakpointPair.classify(evidence)}') + logger.info( + f'outer window regions: {evidence.break1.chr}:{evidence.outer_window1[0]}-{evidence.outer_window1[1]} {evidence.break2.chr}:{evidence.outer_window2[0]}-{evidence.outer_window2[1]}' ) - evidence.load_evidence(log=LOG) - LOG( - 'flanking pairs: {};'.format(len(evidence.flanking_pairs)), - 'split reads: {}, {};'.format(*[len(a) for a in evidence.split_reads]), - 'half-mapped reads: {}, {};'.format(*[len(a) for a in evidence.half_mapped]), - 'spanning-reads: {};'.format(len(evidence.spanning_reads)), - 'compatible flanking pairs:', - len(evidence.compatible_flanking_pairs), - time_stamp=False, + logger.info( + f'inner window regions: {evidence.break1.chr}:{evidence.inner_window1[0]}-{evidence.inner_window1[1]} {evidence.break2.chr}:{evidence.inner_window2[0]}-{evidence.inner_window2[1]}' ) - evidence.assemble_contig(log=LOG) - LOG('assembled {} contigs'.format(len(evidence.contigs)), time_stamp=False) + evidence.load_evidence() + logger.info( + f'flanking pairs: {len(evidence.flanking_pairs)}' + + '; split reads: {}, {}'.format(*[len(a) for a in evidence.split_reads]) + + '; half-mapped reads: {}, {}'.format(*[len(a) for a in evidence.half_mapped]) + + f'; spanning-reads: {len(evidence.spanning_reads)}; compatible flanking pairs: {len(evidence.compatible_flanking_pairs)}', + ) + evidence.assemble_contig() + logger.info(f'assembled {len(evidence.contigs)} contigs') for contig in evidence.contigs: name = 'seq-{}'.format(hashlib.md5(contig.seq.encode('utf-8')).hexdigest()) - LOG( - '>', - name, - '(size={}; reads={:.0f}; coverage={:.2f})'.format( - len(contig.seq), contig.remap_score(), contig.remap_coverage() - ), - time_stamp=False, + logger.info( + f'> {name} (size={len(contig.seq)}; reads={contig.remap_score():.0f}; coverage={contig.remap_coverage():.2f})' ) - LOG(contig.seq[:140], time_stamp=False) + 
logger.info(contig.seq[:140]) contig_sequences[name] = contig.seq - LOG('will output:', contig_aligner_fa, contig_aligner_output) + logger.info(f'will output: {contig_aligner_fa} ${contig_aligner_output}') raw_contig_alignments = align_sequences( contig_sequences, input_bam_cache, @@ -230,11 +202,10 @@ def main( aligner_output_log=contig_aligner_log, blat_min_identity=config['validate.blat_min_identity'], blat_limit_top_aln=config['validate.blat_limit_top_aln'], - log=LOG, ) for evidence in evidence_clusters: select_contig_alignments(evidence, raw_contig_alignments) - LOG('alignment complete', time_stamp=True) + logger.info('alignment complete') event_calls = [] total_pass = 0 write_bed_file( @@ -243,25 +214,17 @@ def main( ) validation_counts = {} for index, evidence in enumerate(evidence_clusters): - LOG() - LOG( - '({} of {}) calling events for: {} {} (tracking_id: {})'.format( - index + 1, - len(evidence_clusters), - evidence.cluster_id, - evidence.putative_event_types(), - evidence.tracking_id, - ), - time_stamp=True, + logger.info( + f'({index + 1} of {len(evidence_clusters)}) calling events for: {evidence.cluster_id} {evidence.putative_event_types()} (tracking_id: {evidence.tracking_id})' ) - LOG('source:', evidence) + logger.info(f'source: {evidence}') calls = [] failure_comment = None try: calls = call_events(evidence) event_calls.extend(calls) except UserWarning as err: - LOG('warning: error in calling events', repr(err)) + logger.warning('error in calling events {repr(err)}') failure_comment = str(err) if not calls: @@ -273,34 +236,26 @@ def main( else: total_pass += 1 - LOG('called {} event(s)'.format(len(calls)), time_stamp=True) + logger.info(f'called {len(calls)} event(s)') for call in calls: - LOG(call) + logger.info(call) if call.call_method == CALL_METHOD.CONTIG: - LOG( - '\t{} {} [{}] contig_alignment_score: {}, contig_alignment_mq: {} contig_alignment_rank: {}'.format( - call.event_type, - call.call_method, - call.contig_alignment.query_name, - 
round(call.contig_alignment.score(), 2), - tuple(call.contig_alignment.mapping_quality()), - tuple(call.contig_alignment.alignment_rank()), - ) + logger.info( + f'{call.event_type} {call.call_method} [{call.contig_alignment.query_name}] contig_alignment_score: {round(call.contig_alignment.score(), 2)}, contig_alignment_mq: {tuple(call.contig_alignment.mapping_quality())} contig_alignment_rank: {tuple(call.contig_alignment.alignment_rank())}' ) - LOG('\talignment:', call.contig_alignment.alignment_id()) + logger.info(f'alignment: {call.contig_alignment.alignment_id()}') elif call.contig_alignment: - LOG( - '\t{} {} alignment:'.format(call.event_type, call.call_method), - call.contig_alignment.alignment_id(), + logger.info( + f'{call.event_type} {call.call_method} alignment: {call.contig_alignment.alignment_id()}' ) else: - LOG('\t{} {}'.format(call.event_type, call.call_method), time_stamp=False) + logger.info('{call.event_type} {call.call_method}') validation_counts[call.cluster_id] = validation_counts.get(call.cluster_id, 0) + 1 call.data[COLUMNS.validation_id] = '{}-v{}'.format( call.cluster_id, validation_counts[call.cluster_id] ) - LOG( - '\tremapped reads: {}; spanning reads: {}; split reads: [{} ({}), {} ({}), {}]' + logger.info( + 'remapped reads: {}; spanning reads: {}; split reads: [{} ({}), {} ({}), {}]' ', flanking pairs: {}{}'.format( 0 if not call.contig else len(call.contig.input_reads), len(call.spanning_reads), @@ -327,11 +282,8 @@ def main( call.data.update( {COLUMNS.break1_homologous_seq: b1_homseq, COLUMNS.break2_homologous_seq: b2_homseq} ) - LOG( - '{} putative calls resulted in {} events with 1 or more event call'.format( - len(evidence_clusters), total_pass - ), - time_stamp=True, + logger.info( + f'{len(evidence_clusters)} putative calls resulted in {total_pass} events with 1 or more event call' ) output_tabbed_file(event_calls, passed_output_file) output_tabbed_file(filtered_evidence_clusters, failed_output_file) @@ -342,7 +294,7 @@ def 
main( if config['validate.write_evidence_files']: with pysam.AlignmentFile(contig_bam, 'wb', template=input_bam_cache.fh) as fh: - LOG('writing:', contig_bam, time_stamp=True) + logger.info(f'writing: {contig_bam}') for evidence in evidence_clusters: for contig in evidence.contigs: for aln in contig.alignments: @@ -354,7 +306,7 @@ def main( # write the evidence with pysam.AlignmentFile(raw_evidence_bam, 'wb', template=input_bam_cache.fh) as fh: - LOG('writing:', raw_evidence_bam, time_stamp=True) + logger.info(f'writing: {raw_evidence_bam}') reads = set() for evidence in evidence_clusters: reads.update(evidence.supporting_reads()) @@ -363,23 +315,23 @@ def main( fh.write(read) # now sort the contig bam sort = re.sub(r'.bam$', '.sorted.bam', contig_bam) - LOG('sorting the bam file:', contig_bam, time_stamp=True) + logger.info(f'sorting the bam file: {contig_bam}') pysam.sort('-o', sort, contig_bam) contig_bam = sort - LOG('indexing the sorted bam:', contig_bam) + logger.info(f'indexing the sorted bam: {contig_bam}') pysam.index(contig_bam) # then sort the evidence bam file sort = re.sub(r'.bam$', '.sorted.bam', raw_evidence_bam) - LOG('sorting the bam file:', raw_evidence_bam, time_stamp=True) + logger.info(f'sorting the bam file: {raw_evidence_bam}') pysam.sort('-o', sort, raw_evidence_bam) raw_evidence_bam = sort - LOG('indexing the sorted bam:', raw_evidence_bam) + logger.info(f'indexing the sorted bam: {raw_evidence_bam}') pysam.index(raw_evidence_bam) # write the igv batch file with open(igv_batch_file, 'w') as fh: - LOG('writing:', igv_batch_file, time_stamp=True) + logger.info(f'writing: {igv_batch_file}') fh.write('load {} name="{}"\n'.format(passed_bed_file, 'passed events')) fh.write('load {} name="{}"\n'.format(contig_bam, 'aligned contigs')) @@ -392,4 +344,4 @@ def main( config['libraries'][library]['protocol'], ) ) - generate_complete_stamp(output, LOG, start_time=start_time) + generate_complete_stamp(output, start_time=start_time) diff --git 
a/src/tools/calculate_ref_alt_counts.py b/src/tools/calculate_ref_alt_counts.py index cbb3be43..70992f7a 100644 --- a/src/tools/calculate_ref_alt_counts.py +++ b/src/tools/calculate_ref_alt_counts.py @@ -9,8 +9,7 @@ import pysam from mavis.annotate.file_io import load_reference_genome from mavis.constants import SVTYPE -from mavis.util import LOG as log -from mavis.util import output_tabbed_file, read_inputs +from mavis.util import logger, output_tabbed_file, read_inputs from mavis.validate.call import EventCall @@ -122,7 +121,7 @@ class RefAltCalculator: def __init__(self, input_bams, reference_genome, max_event_size=6, buffer=1): if isinstance(reference_genome, str): - log('loading:', reference_genome, time_stamp=True) + logger.info(f'loading: {reference_genome}') self.reference_genome = load_reference_genome(reference_genome) else: self.reference_genome = reference_genome @@ -154,19 +153,17 @@ def calculate_ref_counts(self, bpp): raise ValueError("Cannot determine ref and alt count for non precise breakpoint pairs") if bpp not in self.bpp_cache: - log("processing {}".format(bpp)) + logger.info(f'processing {bpp}') data = dict() for name, read_length, bam in self.input_bams: ref, alt, ign, mul, ref_sequence, alt_sequence = calculate_ref_count( bpp, read_length, self.reference_genome, bam, self.buffer ) - log(bpp, name) - log( - 'Calculated counts: Ref: {}, Alt: {}, Mul: {}, Ignored: {} '.format( - len(ref), len(alt), len(mul), len(ign) - ) + logger.info(f'{bpp} {name}') + logger.info( + f'Calculated counts: Ref: {len(ref)}, Alt: {len(alt)}, Mul: {len(mul)}, Ignored: {len(ign)}' ) - log('Ref_probe: {}, Alt_probe: {}'.format(ref_sequence, alt_sequence)) + logger.info(f'Ref_probe: {ref_sequence}, Alt_probe: {alt_sequence}') info = { '{}_ref_count'.format(name): len(ref), '{}_alt_count'.format(name): len(alt), @@ -201,7 +198,7 @@ def calculate_all_counts(self, input_files, output_file): filtered_events.append(bpp) continue - log('filtered {} 
events'.format(len(filtered_events))) + logger.info(f'filtered {len(filtered_events)} events') output_tabbed_file(processed_bpps.values(), output_file) return processed_bpps, filtered_events diff --git a/tests/integration/test_annotate_fileio.py b/tests/integration/test_annotate_fileio.py index 7b1a09de..53572a15 100644 --- a/tests/integration/test_annotate_fileio.py +++ b/tests/integration/test_annotate_fileio.py @@ -7,5 +7,5 @@ class TestAnnotationLoading: def test_load_json(self): - result = load_annotations(JSON, warn=print) + result = load_annotations(JSON) assert len(result.keys()) == 12 diff --git a/tests/integration/test_args.py b/tests/integration/test_args.py index db14bc55..492509f3 100644 --- a/tests/integration/test_args.py +++ b/tests/integration/test_args.py @@ -1,7 +1,7 @@ import json import sys import tempfile -from unittest.mock import patch +from unittest.mock import Mock, patch import pytest from mavis import util @@ -65,7 +65,7 @@ def test_trans_multiple_annotations_no_masking(self, configpath, output_dir): '--config', str(configpath), ] - with patch.object(cluster_main, 'main', util.DEVNULL): + with patch.object(cluster_main, 'main', Mock()): with patch.object(sys, 'argv', args): mavis_main() @@ -102,7 +102,7 @@ def test_trans_multiple_annotations_with_masking(self, configpath, output_dir): '--config', str(configpath), ] - with patch.object(cluster_main, 'main', util.DEVNULL): + with patch.object(cluster_main, 'main', Mock()): with patch.object(sys, 'argv', args): mavis_main() @@ -123,7 +123,7 @@ def test_error_missing_annotations_translib_uninform(self, configpath, output_di ) ) args = ['mavis', 'cluster', '--library', 'translib', '--output', output_dir] - with patch.object(cluster_main, 'main', util.DEVNULL): + with patch.object(cluster_main, 'main', Mock()): with patch.object(sys, 'argv', args): expect_error(self, mavis_main) @@ -163,7 +163,7 @@ def test_error_missing_annotations_translib(self, configpath, output_dir): '--config', 
str(configpath), ] - with patch.object(validate_main, 'main', util.DEVNULL): + with patch.object(validate_main, 'main', Mock()): with patch.object(sys, 'argv', args): expect_error(self, mavis_main) @@ -208,7 +208,7 @@ def test_ok_multi_ref_genome(self, configpath, output_dir): '--config', str(configpath), ] - with patch.object(validate_main, 'main', util.DEVNULL): + with patch.object(validate_main, 'main', Mock()): with patch.object(sys, 'argv', args): mavis_main() @@ -256,7 +256,7 @@ def test_error_multi_aligner_ref(self, configpath, output_dir): '--config', str(configpath), ] - with patch.object(validate_main, 'main', util.DEVNULL): + with patch.object(validate_main, 'main', Mock()): with patch.object(sys, 'argv', args): expect_error(self, mavis_main) @@ -300,7 +300,7 @@ def test_error_missing_aligner_ref(self, configpath, output_dir): '--config', str(configpath), ] - with patch.object(validate_main, 'main', util.DEVNULL): + with patch.object(validate_main, 'main', Mock()): with patch.object(sys, 'argv', args): expect_error(self, mavis_main) @@ -344,7 +344,7 @@ def test_error_missing_reference_genome(self, configpath, output_dir): '--config', str(configpath), ] - with patch.object(validate_main, 'main', util.DEVNULL): + with patch.object(validate_main, 'main', Mock()): with patch.object(sys, 'argv', args): expect_error(self, mavis_main) @@ -391,6 +391,6 @@ def test_error_bad_aligner_ref(self, configpath, output_dir): '--config', str(configpath), ] - with patch.object(validate_main, 'main', util.DEVNULL): + with patch.object(validate_main, 'main', Mock()): with patch.object(sys, 'argv', args): expect_error(self, mavis_main) diff --git a/tests/integration/test_assemble.py b/tests/integration/test_assemble.py index b91b1c4d..07e22b64 100644 --- a/tests/integration/test_assemble.py +++ b/tests/integration/test_assemble.py @@ -5,7 +5,6 @@ from mavis.assemble import Contig, assemble, filter_contigs from mavis.constants import reverse_complement from mavis.interval 
import Interval -from mavis.util import LOG from mavis_config import DEFAULTS from ..util import get_data, long_running_test @@ -363,7 +362,6 @@ def test_large_assembly(self, large_assembly_seq): min_edge_trim_weight=DEFAULTS['validate.assembly_min_edge_trim_weight'], assembly_max_paths=DEFAULTS['validate.assembly_max_paths'], min_contig_length=150, - log=LOG, remap_min_exact_match=30, assembly_min_uniq=DEFAULTS['validate.assembly_min_uniq'], ) @@ -615,7 +613,6 @@ def test_assemble_short_contig(self): assembly_min_uniq=0.1, min_contig_length=125, remap_min_exact_match=15, - log=LOG, ) target = 'GGGCACGGCTGCAGCGTCGCGGTGCATCAAGCTTGCTATGGCATTGTTCAAGTACCCACTGGACCGTGGTTTTGCAGGAAATGTGAATCTCAGGAGAGAGCAGCCAGAGTGATACAGTTTATGTAACTTGATGGAAGAA' @@ -627,7 +624,7 @@ def test_assemble_short_contig(self): @timeout_decorator.timeout(120) @long_running_test def test_long_filter_bug(self, long_filter_seq): - contigs = assemble(long_filter_seq, 111, 3, 8, 0.1, 0.1, log=LOG) + contigs = assemble(long_filter_seq, 111, 3, 8, 0.1, 0.1) for c in contigs: print(c.seq, c.remap_score()) assert len(contigs) diff --git a/tests/integration/test_bam.py b/tests/integration/test_bam.py index 7f8b87f7..9ccbc09d 100644 --- a/tests/integration/test_bam.py +++ b/tests/integration/test_bam.py @@ -54,7 +54,7 @@ def test_add_read(self): b.add_read(r) assert len(b.cache.values()) == 1 - @mock.patch('mavis.util.LOG') + @mock.patch('mavis.util.logger') def test_add_invalid_read(self, log_patcher): bad_read = mock.Mock( is_unmapped=False, reference_start=0, reference_end=0, query_name='BAD_READ' @@ -62,9 +62,9 @@ def test_add_invalid_read(self, log_patcher): cache = BamCache(MockBamFileHandle()) cache.add_read(bad_read) assert len(cache.cache) == 0 - log_patcher.assert_called_with('ignoring invalid read', 'BAD_READ', level=logging.DEBUG) + log_patcher.method_calls[0].assert_called_with('ignoring invalid read: BAD_READ') - @mock.patch('mavis.util.LOG') + @mock.patch('mavis.util.logger') def 
test_fetch_invalid_read(self, log_patcher): bad_read = mock.Mock( is_unmapped=False, reference_start=0, reference_end=0, query_name='BAD_READ' @@ -74,9 +74,9 @@ def test_fetch_invalid_read(self, log_patcher): cache = BamCache(fh) cache.fetch('chr', 1, 10) assert len(cache.cache) == 0 - log_patcher.assert_called_with('ignoring invalid read', 'BAD_READ', level=logging.DEBUG) + log_patcher.method_calls[0].assert_called_with('ignoring invalid read: BAD_READ') - @mock.patch('mavis.util.LOG') + @mock.patch('mavis.util.logger') def test_bin_fetch_invalid_read(self, log_patcher): bad_read = mock.Mock( is_unmapped=False, reference_start=0, reference_end=0, query_name='BAD_READ' @@ -86,7 +86,7 @@ def test_bin_fetch_invalid_read(self, log_patcher): cache = BamCache(fh) cache.fetch_from_bins('chr', 1, 10) assert len(cache.cache) == 0 - log_patcher.assert_called_with('ignoring invalid read', 'BAD_READ', level=logging.DEBUG) + log_patcher.method_calls[0].assert_called_with('ignoring invalid read: BAD_READ') def test_reference_id(self): fh = MockBamFileHandle({'1': 0}) diff --git a/tests/snakemake/test_mini_workflow.py b/tests/snakemake/test_mini_workflow.py index 2b559234..4d725a67 100644 --- a/tests/snakemake/test_mini_workflow.py +++ b/tests/snakemake/test_mini_workflow.py @@ -1,3 +1,4 @@ +import glob import json import os import shutil @@ -12,6 +13,19 @@ from ..util import glob_exists, long_running_test, package_relative_file +def tail_logfiles(dirname, n_lines=10): + """ + Prints the tail of txt files in this dir. 
This is useful for debugging snakemake tests since + the logs are deleted with the temp dir when the test fails + """ + for filename in glob.glob(os.path.join(dirname, '*.log.txt')): + with open(filename, 'r') as fh: + lines = fh.readlines() + start_line = max([0, len(lines) - n_lines]) + print(f'TAIL: {filename}') + print('\n'.join(lines[start_line:])) + + @pytest.fixture def blat_output_dir(): temp_output = tempfile.mkdtemp() @@ -85,6 +99,7 @@ def test_workflow(output_dir): except SystemExit as err: if err.code != 0: + tail_logfiles(os.path.join(output_dir, 'output_dir', 'logs')) raise err for expected_file in [ @@ -121,6 +136,7 @@ def test_no_validate_worflow(output_dir): except SystemExit as err: if err.code != 0: + tail_logfiles(os.path.join(output_dir, 'output_dir', 'logs')) raise err for expected_file in [ From 9a93374be3e00271c32c6080e39ebeca530f4421 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 26 Jan 2022 12:47:55 -0800 Subject: [PATCH 088/137] Fix linting errors --- src/mavis/annotate/constants.py | 2 +- src/mavis/annotate/genomic.py | 7 +++---- src/mavis/bam/read.py | 19 +++++++++---------- src/mavis/bam/stats.py | 1 - src/mavis/breakpoint.py | 2 +- src/mavis/illustrate/elements.py | 13 ++++++------- src/mavis/overlay.py | 2 +- src/mavis/pairing/constants.py | 2 +- src/mavis/pairing/main.py | 2 +- src/mavis/summary/main.py | 3 +-- src/mavis/tools/__init__.py | 2 +- src/mavis/tools/breakdancer.py | 3 +-- src/mavis/validate/base.py | 2 -- src/mavis/validate/evidence.py | 1 - src/mavis/validate/main.py | 1 - 15 files changed, 26 insertions(+), 36 deletions(-) diff --git a/src/mavis/annotate/constants.py b/src/mavis/annotate/constants.py index 62882a9b..3f80c171 100644 --- a/src/mavis/annotate/constants.py +++ b/src/mavis/annotate/constants.py @@ -1,6 +1,6 @@ import re -from ..constants import MavisNamespace, float_fraction +from ..constants import MavisNamespace PASS_FILENAME = 'annotations.tab' diff --git a/src/mavis/annotate/genomic.py 
b/src/mavis/annotate/genomic.py index 9891cebe..a6d18439 100644 --- a/src/mavis/annotate/genomic.py +++ b/src/mavis/annotate/genomic.py @@ -1,12 +1,11 @@ -from copy import copy import itertools +from ..constants import ORIENT, STRAND, reverse_complement +from ..error import NotSpecifiedError +from ..interval import Interval from .base import BioInterval, ReferenceName from .constants import SPLICE_SITE_TYPE from .splicing import SpliceSite, SplicingPattern -from ..constants import ORIENT, reverse_complement, STRAND -from ..error import NotSpecifiedError -from ..interval import Interval class Template(BioInterval): diff --git a/src/mavis/bam/read.py b/src/mavis/bam/read.py index 753c8414..f41f31cb 100644 --- a/src/mavis/bam/read.py +++ b/src/mavis/bam/read.py @@ -1,28 +1,27 @@ -from copy import copy import itertools import re -import subprocess +from copy import copy import pysam from Bio.Data import IUPACData as iupac -from . import cigar as _cigar -from .cigar import ( - EVENT_STATES, - QUERY_ALIGNED_STATES, - REFERENCE_ALIGNED_STATES, - convert_cigar_to_string, -) from ..constants import ( CIGAR, DNA_ALPHABET, + NA_MAPPING_QUALITY, ORIENT, READ_PAIR_TYPE, STRAND, SVTYPE, - NA_MAPPING_QUALITY, ) from ..interval import Interval +from . 
import cigar as _cigar +from .cigar import ( + EVENT_STATES, + QUERY_ALIGNED_STATES, + REFERENCE_ALIGNED_STATES, + convert_cigar_to_string, +) class SamRead(pysam.AlignedSegment): diff --git a/src/mavis/bam/stats.py b/src/mavis/bam/stats.py index 7901c044..ea6d1603 100644 --- a/src/mavis/bam/stats.py +++ b/src/mavis/bam/stats.py @@ -2,7 +2,6 @@ import math import os import statistics as stats -import warnings from ..constants import STRAND from ..util import logger diff --git a/src/mavis/breakpoint.py b/src/mavis/breakpoint.py index 10218017..81518527 100644 --- a/src/mavis/breakpoint.py +++ b/src/mavis/breakpoint.py @@ -3,7 +3,7 @@ from copy import copy as _copy from typing import Callable, Dict, List, Optional, Set, Tuple -from .constants import CIGAR, COLUMNS, DNA_ALPHABET, ORIENT, STRAND, SVTYPE, reverse_complement +from .constants import COLUMNS, DNA_ALPHABET, ORIENT, STRAND, SVTYPE, reverse_complement from .error import InvalidRearrangement, NotSpecifiedError from .interval import Interval diff --git a/src/mavis/illustrate/elements.py b/src/mavis/illustrate/elements.py index cb79549b..a5f1e762 100644 --- a/src/mavis/illustrate/elements.py +++ b/src/mavis/illustrate/elements.py @@ -2,20 +2,19 @@ This is the primary module responsible for generating svg visualizations """ -import itertools import re +from ..annotate.variant import FusionTranscript +from ..constants import CODON_SIZE, GIEMSA_STAIN, ORIENT, STRAND +from ..error import DrawingFitError, NotSpecifiedError +from ..interval import Interval from .util import ( + LabelMapping, + Tag, dynamic_label_color, generate_interval_mapping, - LabelMapping, split_intervals_into_tracks, - Tag, ) -from ..annotate.variant import FusionTranscript -from ..constants import CODON_SIZE, GIEMSA_STAIN, ORIENT, STRAND -from ..error import DrawingFitError, NotSpecifiedError -from ..interval import Interval # draw gene level view # draw gene box diff --git a/src/mavis/overlay.py b/src/mavis/overlay.py index fccd5e34..71f6c239 
100644 --- a/src/mavis/overlay.py +++ b/src/mavis/overlay.py @@ -1,5 +1,5 @@ import os -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Tuple from . import annotate as _annotate from . import util as _util diff --git a/src/mavis/pairing/constants.py b/src/mavis/pairing/constants.py index cb55e123..ea0a63a4 100644 --- a/src/mavis/pairing/constants.py +++ b/src/mavis/pairing/constants.py @@ -2,7 +2,7 @@ from mavis_config import DEFAULTS -from ..constants import CALL_METHOD, MavisNamespace +from ..constants import CALL_METHOD PAIRING_DISTANCES: Dict[str, int] = { CALL_METHOD.FLANK: DEFAULTS['pairing.flanking_call_distance'], diff --git a/src/mavis/pairing/main.py b/src/mavis/pairing/main.py index 8b92586c..48809ba9 100644 --- a/src/mavis/pairing/main.py +++ b/src/mavis/pairing/main.py @@ -7,7 +7,7 @@ from ..annotate.file_io import ReferenceFile from ..breakpoint import BreakpointPair -from ..constants import CALL_METHOD, COLUMNS, PROTOCOL, SPLICE_TYPE, SVTYPE +from ..constants import CALL_METHOD, COLUMNS, SPLICE_TYPE, SVTYPE from ..util import generate_complete_stamp, logger, output_tabbed_file, read_inputs from .pairing import inferred_equivalent, pair_by_distance, product_key diff --git a/src/mavis/summary/main.py b/src/mavis/summary/main.py index 309da1b8..73c9240d 100644 --- a/src/mavis/summary/main.py +++ b/src/mavis/summary/main.py @@ -1,7 +1,6 @@ import os import re import time -from functools import partial from typing import Dict, List, Tuple import pandas as pd @@ -9,7 +8,7 @@ from ..annotate.file_io import ReferenceFile from ..breakpoint import BreakpointPair from ..constants import CALL_METHOD, COLUMNS, PROTOCOL, SPLICE_TYPE, SVTYPE -from ..util import generate_complete_stamp, logger, output_tabbed_file, read_inputs, soft_cast +from ..util import generate_complete_stamp, logger, output_tabbed_file, read_inputs from .constants import HOMOPOLYMER_MIN_LENGTH from .summary import ( annotate_dgv, diff --git 
a/src/mavis/tools/__init__.py b/src/mavis/tools/__init__.py index a1b814bd..8649b31f 100644 --- a/src/mavis/tools/__init__.py +++ b/src/mavis/tools/__init__.py @@ -1,5 +1,5 @@ import itertools -from typing import Callable, Dict, List +from typing import Dict, List import pandas as pd from shortuuid import uuid diff --git a/src/mavis/tools/breakdancer.py b/src/mavis/tools/breakdancer.py index e1f8361e..4f8d2562 100644 --- a/src/mavis/tools/breakdancer.py +++ b/src/mavis/tools/breakdancer.py @@ -1,5 +1,4 @@ import re -from argparse import Namespace import pandas as pd @@ -34,7 +33,7 @@ def convert_file(input_file): }, ) if 'num_Reads_lib' not in df: - raise KeyError(f'missing required column: num_Reads_lib') + raise KeyError('missing required column: num_Reads_lib') for bam, lib in bam_to_lib.items(): df['num_Reads_lib'] = df['num_Reads_lib'].str.replace(bam, lib) diff --git a/src/mavis/validate/base.py b/src/mavis/validate/base.py index 6866930b..e73d4846 100644 --- a/src/mavis/validate/base.py +++ b/src/mavis/validate/base.py @@ -1,5 +1,4 @@ import itertools -import logging from abc import abstractmethod from typing import Dict, List, Optional, Set, Tuple @@ -16,7 +15,6 @@ COLUMNS, NA_MAPPING_QUALITY, ORIENT, - PROTOCOL, PYSAM_READ_FLAGS, STRAND, SVTYPE, diff --git a/src/mavis/validate/evidence.py b/src/mavis/validate/evidence.py index 19f2fbdb..610c4206 100644 --- a/src/mavis/validate/evidence.py +++ b/src/mavis/validate/evidence.py @@ -241,7 +241,6 @@ def distance(self, start: int, end: int, strand: str = STRAND.NS, chrom: Optiona mixed = [] inter = [] transcripts = self._select_transcripts(chrom, strand) - genomic_distance = Evidence.distance(start, end).end # try to calculate assuming the positions are exonic for transcript in itertools.chain.from_iterable([t.transcripts for t in transcripts]): if not transcript.reference_object.position & Interval(start, end): diff --git a/src/mavis/validate/main.py b/src/mavis/validate/main.py index ab8a48df..325dbf3c 100644 
--- a/src/mavis/validate/main.py +++ b/src/mavis/validate/main.py @@ -3,7 +3,6 @@ import os import re import time -import warnings from typing import Dict, List import pysam From 31119fcabeb512e7587440413a4d8f7a3ef23cef Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 26 Jan 2022 12:53:28 -0800 Subject: [PATCH 089/137] Import subcommand from mavis_config --- src/mavis/cluster/main.py | 3 ++- src/mavis/main.py | 2 +- tests/end_to_end/test_convert.py | 3 ++- tests/end_to_end/test_help.py | 2 +- tests/end_to_end/test_overlay.py | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/mavis/cluster/main.py b/src/mavis/cluster/main.py index aec884ba..17567f52 100644 --- a/src/mavis/cluster/main.py +++ b/src/mavis/cluster/main.py @@ -3,11 +3,12 @@ import time from typing import Dict, List +from mavis_config.constants import SUBCOMMAND from shortuuid import uuid from ..annotate.file_io import ReferenceFile from ..breakpoint import BreakpointPair -from ..constants import COLUMNS, SUBCOMMAND +from ..constants import COLUMNS from ..util import ( filter_on_overlap, filter_uninformative, diff --git a/src/mavis/main.py b/src/mavis/main.py index d417d7d5..0bf4eb82 100644 --- a/src/mavis/main.py +++ b/src/mavis/main.py @@ -9,6 +9,7 @@ from typing import Dict from mavis_config import validate_config +from mavis_config.constants import SUBCOMMAND from . import __version__ from . 
import config as _config @@ -16,7 +17,6 @@ from .align import get_aligner_version from .annotate import main as annotate_main from .cluster import main as cluster_main -from .constants import SUBCOMMAND from .overlay import check_overlay_args from .overlay import main as overlay_main from .pairing import main as pairing_main diff --git a/tests/end_to_end/test_convert.py b/tests/end_to_end/test_convert.py index 671e4d10..00f8ea4a 100644 --- a/tests/end_to_end/test_convert.py +++ b/tests/end_to_end/test_convert.py @@ -5,10 +5,11 @@ import unittest from unittest.mock import patch -from mavis.constants import ORIENT, SUBCOMMAND, SVTYPE +from mavis.constants import ORIENT, SVTYPE from mavis.main import main from mavis.tools import SUPPORTED_TOOL from mavis.util import read_bpp_from_input_file +from mavis_config.constants import SUBCOMMAND from ..util import get_data, glob_exists diff --git a/tests/end_to_end/test_help.py b/tests/end_to_end/test_help.py index 6d3cdd24..d73fa2fc 100644 --- a/tests/end_to_end/test_help.py +++ b/tests/end_to_end/test_help.py @@ -1,8 +1,8 @@ import sys from unittest.mock import patch -from mavis.constants import SUBCOMMAND from mavis.main import main +from mavis_config.constants import SUBCOMMAND class TestHelpMenu: diff --git a/tests/end_to_end/test_overlay.py b/tests/end_to_end/test_overlay.py index 5950701d..b0584de8 100644 --- a/tests/end_to_end/test_overlay.py +++ b/tests/end_to_end/test_overlay.py @@ -6,8 +6,8 @@ from unittest.mock import patch import pytest -from mavis.constants import SUBCOMMAND from mavis.main import main +from mavis_config.constants import SUBCOMMAND from ..util import get_data, glob_exists From fefc0958e26008fb9282f6bdf6ad1f44d364e11f Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 26 Jan 2022 13:14:33 -0800 Subject: [PATCH 090/137] Add time and level to log format --- src/mavis/main.py | 6 +++++- src/mavis/pairing/main.py | 2 +- src/mavis/summary/main.py | 2 +- src/mavis/util.py | 4 ++-- 
src/mavis/validate/main.py | 2 +- 5 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/mavis/main.py b/src/mavis/main.py index 0bf4eb82..5c7c8ecd 100644 --- a/src/mavis/main.py +++ b/src/mavis/main.py @@ -176,7 +176,11 @@ def main(argv=None): if args.command == SUBCOMMAND.OVERLAY: args = check_overlay_args(args, parser) - log_conf = {'format': '{message}', 'style': '{', 'level': args.log_level} + log_conf = { + 'format': '{asctime} [{levelname}] {message}', + 'style': '{', + 'level': args.log_level, + } original_logging_handlers = logging.root.handlers[:] for handler in logging.root.handlers: diff --git a/src/mavis/pairing/main.py b/src/mavis/pairing/main.py index 48809ba9..270e88d9 100644 --- a/src/mavis/pairing/main.py +++ b/src/mavis/pairing/main.py @@ -136,4 +136,4 @@ def main( fname = os.path.join(output, 'mavis_paired.tab') output_tabbed_file(bpps, fname) - generate_complete_stamp(output) + generate_complete_stamp(output, start_time=start_time) diff --git a/src/mavis/summary/main.py b/src/mavis/summary/main.py index 73c9240d..93f49726 100644 --- a/src/mavis/summary/main.py +++ b/src/mavis/summary/main.py @@ -343,4 +343,4 @@ def main(inputs: List[str], output: str, config: Dict, start_time=int(time.time( ): lib_rows.append(row) output_tabbed_file(lib_rows, filename, header=output_columns) - generate_complete_stamp(output) + generate_complete_stamp(output, start_time=start_time) diff --git a/src/mavis/util.py b/src/mavis/util.py index cfb269da..7009cb48 100644 --- a/src/mavis/util.py +++ b/src/mavis/util.py @@ -179,7 +179,7 @@ def read_inputs(inputs, required_columns=[], **kwargs): bpps = [] for finput in bash_expands(*inputs): - logger.info('loading: {finput}') + logger.info(f'loading: {finput}') bpps.extend( read_bpp_from_input_file( finput, required_columns=[COLUMNS.protocol, *required_columns], **kwargs @@ -255,7 +255,7 @@ def generate_complete_stamp(output_dir, prefix='MAVIS.', start_time=None): 'some_output_dir/MAVIS.COMPLETE' """ stamp = 
os.path.join(output_dir, str(prefix) + 'COMPLETE') - logger.info('complete: {stamp}') + logger.info(f'complete: {stamp}') with open(stamp, 'w') as fh: if start_time is not None: duration = int(time.time()) - start_time diff --git a/src/mavis/validate/main.py b/src/mavis/validate/main.py index 325dbf3c..9cbc1383 100644 --- a/src/mavis/validate/main.py +++ b/src/mavis/validate/main.py @@ -163,7 +163,7 @@ def main( else f' (tracking_id: {evidence.tracking_id})' ), ) - logger.info(repr(evidence)) + logger.info(str(evidence)) logger.info(f'possible event type(s): {BreakpointPair.classify(evidence)}') logger.info( f'outer window regions: {evidence.break1.chr}:{evidence.outer_window1[0]}-{evidence.outer_window1[1]} {evidence.break2.chr}:{evidence.outer_window2[0]}-{evidence.outer_window2[1]}' From b1b890ac9ac4cca73696072f3a718fc5ff5328e2 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Wed, 26 Jan 2022 13:27:38 -0800 Subject: [PATCH 091/137] adddressed Sniffles support --- src/mavis/interval.py | 2 + src/mavis/tools/constants.py | 4 + src/mavis/tools/vcf.py | 203 +++++++++++++++-------------------- tests/unit/test_tools_vcf.py | 30 +++++- 4 files changed, 120 insertions(+), 119 deletions(-) diff --git a/src/mavis/interval.py b/src/mavis/interval.py index c78e3aa8..7d0d5d96 100644 --- a/src/mavis/interval.py +++ b/src/mavis/interval.py @@ -30,6 +30,8 @@ def __init__(self, start: int, end: Optional[int] = None, freq: int = 1, number_ self.start = self.number_type(self.start) self.end = self.number_type(self.end) + if self.start == 0 and self.end == 1: + self.start = 1 if self.start > self.end: raise AttributeError('interval start > end is not allowed', self.start, self.end) self.freq = int(freq) diff --git a/src/mavis/tools/constants.py b/src/mavis/tools/constants.py index d2412dfa..821d6a6a 100644 --- a/src/mavis/tools/constants.py +++ b/src/mavis/tools/constants.py @@ -51,6 +51,10 @@ class SUPPORTED_TOOL(MavisNamespace): 'dup': [SVTYPE.DUP], 'ITD': [SVTYPE.DUP], 'IDP': 
[SVTYPE.INS], + 'DEL/INV': [SVTYPE.DEL, SVTYPE.INV], + 'DUP/INS': [SVTYPE.DUP, SVTYPE.INS], + 'INVDUP': [SVTYPE.INV, SVTYPE.DUP, SVTYPE.INS], + 'INV/INVDUP': [SVTYPE.INV, SVTYPE.DUP, SVTYPE.INS], } ) diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index c25fe27d..04dc7bfb 100644 --- a/src/mavis/tools/vcf.py +++ b/src/mavis/tools/vcf.py @@ -2,7 +2,6 @@ import re from dataclasses import dataclass from typing import Dict, List, Optional, Tuple -from copy import deepcopy import pandas as pd @@ -17,19 +16,19 @@ from .constants import SUPPORTED_TOOL PANDAS_DEFAULT_NA_VALUES = [ - "-1.#IND", - "1.#QNAN", - "1.#IND", - "-1.#QNAN", - "#N/A", - "N/A", - "NA", - "#NA", - "NULL", - "NaN", - "-NaN", - "nan", - "-nan", + '-1.#IND', + '1.#QNAN', + '1.#IND', + '-1.#QNAN', + '#N/A', + 'N/A', + 'NA', + '#NA', + 'NULL', + 'NaN', + '-NaN', + 'nan', + '-nan', ] @@ -54,7 +53,7 @@ class VcfRecordType: @property def stop(self) -> Optional[int]: - return self.info.get("END", self.pos) + return self.info.get('END', self.pos) def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: @@ -76,51 +75,51 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: | ru]p] | LL | """ # ru[p[ - match = re.match(r"^(?P\w)(?P\w*)\[(?P[^:]+):(?P\d+)\[$", alt) + match = re.match(r'^(?P\w)(?P\w*)\[(?P[^:]+):(?P\d+)\[$', alt) if match: return ( - match.group("chr"), - int(match.group("pos")), + match.group('chr'), + int(match.group('pos')), ORIENT.LEFT, ORIENT.RIGHT, - match.group("ref"), - match.group("useq"), + match.group('ref'), + match.group('useq'), ) # [p[ur - match = re.match(r"^\[(?P[^:]+):(?P\d+)\[(?P\w*)(?P\w)$", alt) + match = re.match(r'^\[(?P[^:]+):(?P\d+)\[(?P\w*)(?P\w)$', alt) if match: return ( - match.group("chr"), - int(match.group("pos")), + match.group('chr'), + int(match.group('pos')), ORIENT.RIGHT, ORIENT.RIGHT, - match.group("ref"), - match.group("useq"), + match.group('ref'), + match.group('useq'), ) # ]p]ur - match = 
re.match(r"^\](?P[^:]+):(?P\d+)\](?P\w*)(?P\w)$", alt) + match = re.match(r'^\](?P[^:]+):(?P\d+)\](?P\w*)(?P\w)$', alt) if match: return ( - match.group("chr"), - int(match.group("pos")), + match.group('chr'), + int(match.group('pos')), ORIENT.RIGHT, ORIENT.LEFT, - match.group("ref"), - match.group("useq"), + match.group('ref'), + match.group('useq'), ) # ru]p] - match = re.match(r"^(?P\w)(?P\w*)\](?P[^:]+):(?P\d+)\]$", alt) + match = re.match(r'^(?P\w)(?P\w*)\](?P[^:]+):(?P\d+)\]$', alt) if match: return ( - match.group("chr"), - int(match.group("pos")), + match.group('chr'), + int(match.group('pos')), ORIENT.LEFT, ORIENT.LEFT, - match.group("ref"), - match.group("useq"), + match.group('ref'), + match.group('useq'), ) else: - raise NotImplementedError("alt specification in unexpected format", alt) + raise NotImplementedError('alt specification in unexpected format', alt) def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: @@ -144,7 +143,7 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: try: value = record.info[key] except UnicodeDecodeError as err: - log("Ignoring invalid INFO field {} with error: {}".format(key, err)) + log('Ignoring invalid INFO field {} with error: {}'.format(key, err)) else: try: value = value[0] if len(value) == 1 else value @@ -153,27 +152,27 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: info[key] = value std_row = {} - if record.id and record.id != "N": # to account for NovoBreak N in the ID field - std_row["id"] = record.id + if record.id and record.id != 'N': # to account for NovoBreak N in the ID field + std_row['id'] = record.id - if info.get("SVTYPE") == "BND": + if info.get('SVTYPE') == 'BND': chr2, end, orient1, orient2, ref, alt = parse_bnd_alt(alt) std_row[COLUMNS.break1_orientation] = orient1 std_row[COLUMNS.break2_orientation] = orient2 std_row[COLUMNS.untemplated_seq] = alt if record.ref != ref: raise AssertionError( - "Expected the ref 
specification in the vcf record to match the sequence " - "in the alt string: {} vs {}".format(record.ref, ref) + 'Expected the ref specification in the vcf record to match the sequence ' + 'in the alt string: {} vs {}'.format(record.ref, ref) ) else: - chr2 = info.get("CHR2", record.chrom) + chr2 = info.get('CHR2', record.chrom) end = record.stop if ( alt and record.ref - and re.match(r"^[A-Z]+$", alt) - and re.match(r"^[A-Z]+", record.ref) + and re.match(r'^[A-Z]+$', alt) + and re.match(r'^[A-Z]+', record.ref) ): std_row[COLUMNS.untemplated_seq] = alt[1:] size = len(alt) - len(record.ref) @@ -183,7 +182,7 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: std_row[COLUMNS.event_type] = SVTYPE.DEL std_row.update({COLUMNS.break1_chromosome: record.chrom, COLUMNS.break2_chromosome: chr2}) if info.get( - "PRECISE", False + 'PRECISE', False ): # DELLY CI only apply when split reads were not used to refine the breakpoint which is then flagged std_row.update( { @@ -197,97 +196,65 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: std_row.update( { COLUMNS.break1_position_start: max( - 1, record.pos + info.get("CIPOS", (0, 0))[0] + 1, record.pos + info.get('CIPOS', (0, 0))[0] ), - COLUMNS.break1_position_end: record.pos + info.get("CIPOS", (0, 0))[1], - COLUMNS.break2_position_start: max(1, end + info.get("CIEND", (0, 0))[0]), - COLUMNS.break2_position_end: end + info.get("CIEND", (0, 0))[1], + COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max(1, end + info.get('CIEND', (0, 0))[0]), + COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1], } ) - std_row2 = {} - - if "SVTYPE" in info: - if info["SVTYPE"] in dir(SVTYPE): - std_row[COLUMNS.event_type] = info["SVTYPE"] - elif "/" in info["SVTYPE"]: - std_row2 = deepcopy(std_row) - std_row[COLUMNS.event_type] = info["SVTYPE"].split("/")[0] - std_row2[COLUMNS.event_type] = info["SVTYPE"].split("/")[1] + if 
'SVTYPE' in info: + std_row[COLUMNS.event_type] = info['SVTYPE'] try: - orient1, orient2 = info["CT"].split("to") - connection_type = {"3": ORIENT.LEFT, "5": ORIENT.RIGHT, "N": ORIENT.NS} + orient1, orient2 = info['CT'].split('to') + connection_type = {'3': ORIENT.LEFT, '5': ORIENT.RIGHT, 'N': ORIENT.NS} std_row[COLUMNS.break1_orientation] = connection_type[orient1] std_row[COLUMNS.break2_orientation] = connection_type[orient2] - if bool(std_row2): - std_row2[COLUMNS.break1_orientation] = connection_type[orient1] - std_row2[COLUMNS.break2_orientation] = connection_type[orient2] except KeyError: pass - if bool(std_row2): - std_row2.update( - { - k: v - for k, v in info.items() - if k not in {"CHR2", "SVTYPE", "CIPOS", "CIEND", "CT"} - } - ) - std_row.update( - { - k: v - for k, v in info.items() - if k not in {"CHR2", "SVTYPE", "CIPOS", "CIEND", "CT"} - } - ) - records.append(std_row) - records.append(std_row2) - else: - std_row.update( - { - k: v - for k, v in info.items() - if k not in {"CHR2", "SVTYPE", "CIPOS", "CIEND", "CT"} - } - ) - records.append(std_row) + std_row.update( + {k: v for k, v in info.items() if k not in {'CHR2', 'SVTYPE', 'CIPOS', 'CIEND', 'CT'}} + ) + records.append(std_row) return records def convert_pandas_rows_to_variants(df): def parse_info(info_field): info = {} - for pair in info_field.split(";"): - if "=" in pair: - key, value = pair.split("=", 1) + for pair in info_field.split(';'): + if '=' in pair: + key, value = pair.split('=', 1) info[key] = value else: info[pair] = True # convert info types for key in info: - if key in {"CIPOS", "CIEND"}: - ci_start, ci_end = info[key].split(",") + if key in {'CIPOS', 'CIEND'}: + ci_start, ci_end = info[key].split(',') info[key] = (int(ci_start), int(ci_end)) - elif key == "END": + elif key == 'END': info[key] = int(info[key]) return info - df["info"] = df["INFO"].apply(parse_info) - df["alts"] = df["ALT"].apply(lambda a: a.split(",")) + df['info'] = df['INFO'].apply(parse_info) + df['alts'] = 
df['ALT'].apply(lambda a: a.split(',')) rows = [] for _, row in df.iterrows(): rows.append( VcfRecordType( - id=row["ID"], - pos=row["POS"], - info=VcfInfoType(row["info"]), - chrom=row["CHROM"], - ref=row["REF"], - alts=row["alts"], + id=row['ID'], + pos=row['POS'], + info=VcfInfoType(row['info']), + chrom=row['CHROM'], + ref=row['REF'], + alts=row['alts'], ) ) return rows @@ -299,9 +266,9 @@ def pandas_vcf(input_file) -> Tuple[List[str], pd.DataFrame]: """ # read the comment/header information header_lines = [] - with open(input_file, "r") as fh: - line = "##" - while line.startswith("##"): + with open(input_file, 'r') as fh: + line = '##' + while line.startswith('##'): header_lines.append(line) line = fh.readline().strip() header_lines = header_lines[1:] @@ -311,21 +278,21 @@ def pandas_vcf(input_file) -> Tuple[List[str], pd.DataFrame]: sep="\t", skiprows=len(header_lines), dtype={ - "CHROM": str, - "POS": int, - "ID": str, - "INFO": str, - "FORMAT": str, - "REF": str, - "ALT": str, + 'CHROM': str, + 'POS': int, + 'ID': str, + 'INFO': str, + 'FORMAT': str, + 'REF': str, + 'ALT': str, }, na_values=PANDAS_DEFAULT_NA_VALUES + ["."], ) - df = df.rename(columns={df.columns[0]: df.columns[0].replace("#", "")}) - required_columns = ["CHROM", "INFO", "POS", "REF", "ALT", "ID"] + df = df.rename(columns={df.columns[0]: df.columns[0].replace('#', '')}) + required_columns = ['CHROM', 'INFO', 'POS', 'REF', 'ALT', 'ID'] for col in required_columns: if col not in df.columns: - raise KeyError(f"Missing required column: {col}") + raise KeyError(f'Missing required column: {col}') # convert the format fields using the header return header_lines, df @@ -349,4 +316,4 @@ def convert_file(input_file: str, file_type: str, log): rows.extend(convert_record(variant_record, log=log)) except NotImplementedError as err: logging.warning(str(err)) - return rows + return rows \ No newline at end of file diff --git a/tests/unit/test_tools_vcf.py b/tests/unit/test_tools_vcf.py index 
cffe9ade..8af3067f 100644 --- a/tests/unit/test_tools_vcf.py +++ b/tests/unit/test_tools_vcf.py @@ -1,4 +1,5 @@ -from mavis.tools.vcf import pandas_vcf +from mavis.tools import SUPPORTED_TOOL, _convert_tool_row +from mavis.tools.vcf import VcfInfoType, VcfRecordType, convert_record, pandas_vcf from ..util import get_data @@ -7,3 +8,30 @@ def test_read_vcf(): header, df = pandas_vcf(get_data('delly_events.vcf')) assert len(header) == 63 assert df.shape[0] == 31 + + +def test_convert_record(): + variant = VcfRecordType( + 9000, + 12000, + 'chr14_KI270722v1_random', + alts=['N[chr17_GL000205v2_random:0['], + ref='N', + info=VcfInfoType( + IMPRECISE=True, + SVMETHOD="Snifflesv1.0.11", + SVTYPE="BND", + SUPTYPE="SR", + SVLEN="0", + STRANDS="+-", + RE="5", + REF_strand="0,0", + AF="1", + ), + ) + records = convert_record(variant) + assert len(records) == 1 + record = records[0] + assert record.get('break2_position_start') == 1 + assert record.get('break2_position_end') == 1 + assert record.get('break2_chromosome') == 'chr17_GL000205v2_random' From fda62136eb5ff1cffbcd161f345ee23e72134602 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Wed, 26 Jan 2022 13:31:43 -0800 Subject: [PATCH 092/137] re-lint files --- src/mavis/interval.py | 2 +- src/mavis/tools/vcf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mavis/interval.py b/src/mavis/interval.py index 7d0d5d96..03c788f9 100644 --- a/src/mavis/interval.py +++ b/src/mavis/interval.py @@ -30,7 +30,7 @@ def __init__(self, start: int, end: Optional[int] = None, freq: int = 1, number_ self.start = self.number_type(self.start) self.end = self.number_type(self.end) - if self.start == 0 and self.end == 1: + if self.start == 0 and self.end == 1: self.start = 1 if self.start > self.end: raise AttributeError('interval start > end is not allowed', self.start, self.end) diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index 04dc7bfb..92a6c987 100644 --- a/src/mavis/tools/vcf.py +++ 
b/src/mavis/tools/vcf.py @@ -316,4 +316,4 @@ def convert_file(input_file: str, file_type: str, log): rows.extend(convert_record(variant_record, log=log)) except NotImplementedError as err: logging.warning(str(err)) - return rows \ No newline at end of file + return rows From a1849d52c29a2c62e3dea33786b4ab8c492efae0 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Wed, 26 Jan 2022 14:09:58 -0800 Subject: [PATCH 093/137] changed unit test as sample Sniffles file changed --- tests/end_to_end/test_convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/end_to_end/test_convert.py b/tests/end_to_end/test_convert.py index e0b29e8e..ff3870a6 100644 --- a/tests/end_to_end/test_convert.py +++ b/tests/end_to_end/test_convert.py @@ -111,7 +111,7 @@ def test_vcf(self): def test_sniffle(self): results = self.run_main(get_data('sniffles.vcf'), SUPPORTED_TOOL.VCF, False) print(results.keys()) - record = results['vcf-35777'][0] + record = results['vcf-30259'][0] print(record, record.data) assert record.data['event_type'] == 'translocation' From 6c5076a2ffb9f82a5f777c4ed2073aff90bdeb85 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Fri, 28 Jan 2022 15:32:26 -0800 Subject: [PATCH 094/137] Add more type annotations --- docs/hooks.py | 8 +- src/mavis/align.py | 22 +-- src/mavis/annotate/base.py | 3 +- src/mavis/annotate/file_io.py | 8 +- src/mavis/annotate/fusion.py | 18 ++- src/mavis/annotate/genomic.py | 249 +++++++++++++++++-------------- src/mavis/annotate/protein.py | 148 ++++++++++-------- src/mavis/annotate/splicing.py | 29 ++-- src/mavis/annotate/variant.py | 82 ++++++---- src/mavis/assemble.py | 58 +++---- src/mavis/bam/cache.py | 127 ++++++++-------- src/mavis/bam/cigar.py | 72 +++++---- src/mavis/bam/read.py | 92 ++++++------ src/mavis/blat.py | 46 +++--- src/mavis/breakpoint.py | 5 +- src/mavis/cluster/cluster.py | 29 ++-- src/mavis/cluster/main.py | 4 +- src/mavis/illustrate/diagram.py | 59 +++++--- src/mavis/illustrate/elements.py | 20 ++- 
src/mavis/interval.py | 74 ++++----- src/mavis/tools/vcf.py | 8 +- src/mavis/types.py | 11 ++ 22 files changed, 662 insertions(+), 510 deletions(-) create mode 100644 src/mavis/types.py diff --git a/docs/hooks.py b/docs/hooks.py index 44931755..b76386d5 100644 --- a/docs/hooks.py +++ b/docs/hooks.py @@ -1,11 +1,9 @@ import json import os -import re from textwrap import dedent +import pkg_resources from markdown_refdocs.main import extract_to_markdown -from mavis.schemas import DEFAULTS -from mavis.util import ENV_VAR_PREFIX def json_to_pytype(record): @@ -130,7 +128,7 @@ def generate_settings_doc(schema_file): def build_package_docs(config): - schema_file = os.path.join(os.path.dirname(__file__), '../src/mavis/schemas/config.json') + schema_file = pkg_resources.resource_filename('mavis_config', 'config.json') generate_settings_doc(schema_file) package_dir = os.path.join(os.path.dirname(__file__), '../src/mavis') output_dir = os.path.join(os.path.dirname(__file__), 'package') @@ -142,5 +140,5 @@ def build_package_docs(config): hide_private=True, hide_undoc=True, hide_undoc_args=True, - namespace_headers=True, + namespace_headers=False, ) diff --git a/src/mavis/align.py b/src/mavis/align.py index df74feb7..2d6b6898 100644 --- a/src/mavis/align.py +++ b/src/mavis/align.py @@ -5,6 +5,7 @@ import os import re import subprocess +from typing import Dict import pysam @@ -13,6 +14,7 @@ from .breakpoint import Breakpoint, BreakpointPair from .constants import CIGAR, ORIENT, STRAND, SVTYPE, MavisNamespace, reverse_complement from .interval import Interval +from .types import ReferenceGenome from .util import logger @@ -140,7 +142,7 @@ def breakpoint_contig_remapped_depth(breakpoint, contig, read): return contig.remap_depth(qrange) -def get_aligner_version(aligner): +def get_aligner_version(aligner: str) -> str: """ executes a subprocess to try and run the aligner without arguments and parse the version number from the output @@ -167,10 +169,10 @@ def 
get_aligner_version(aligner): raise NotImplementedError(aligner) -def query_coverage_interval(read): +def query_coverage_interval(read: pysam.AlignedSegment) -> Interval: """ Returns: - mavis.interval.Interval: The portion of the original query sequence that is aligned by this read + The portion of the original query sequence that is aligned by this read """ seq = read.query_sequence st = 0 @@ -182,7 +184,7 @@ def query_coverage_interval(read): return Interval(st, end) -def convert_to_duplication(alignment, reference_genome): +def convert_to_duplication(alignment, reference_genome: ReferenceGenome): """ Given a breakpoint call, tests if the untemplated sequences matches the preceding reference sequence. If it does this is annotated as a duplication and the new @@ -382,11 +384,11 @@ def call_paired_read_event(read1, read2, is_stranded=False): def align_sequences( - sequences, + sequences: Dict[str, str], input_bam_cache, - reference_genome, - aligner, - aligner_reference, + reference_genome: ReferenceGenome, + aligner: str, + aligner_reference: str, aligner_output_file='aligner_out.temp', aligner_fa_input_file='aligner_in.fa', aligner_output_log='aligner_out.log', @@ -399,11 +401,11 @@ def align_sequences( calls the alignment tool and parses the return output for a set of sequences Args: - sequences (Dict[str,str]): dictionary of sequences by name + sequences: dictionary of sequences by name input_bam_cache (BamCache): bam cache to be used as a template for reading the alignments reference_genome: the reference genome aligner (SUPPORTED_ALIGNER): the name of the aligner to be used - aligner_reference (str): path to the aligner reference file + aligner_reference: path to the aligner reference file """ try: # write the input sequences to a fasta file diff --git a/src/mavis/annotate/base.py b/src/mavis/annotate/base.py index bf0dea8f..950444fa 100644 --- a/src/mavis/annotate/base.py +++ b/src/mavis/annotate/base.py @@ -3,6 +3,7 @@ from ..constants import STRAND from 
..interval import Interval +from ..types import ReferenceGenome class ReferenceName(str): @@ -133,7 +134,7 @@ def __lt__(self, other): def __hash__(self): return hash(self.key()) - def get_seq(self, reference_genome=None, ignore_cache=False): + def get_seq(self, reference_genome: Optional[ReferenceGenome] = None, ignore_cache=False): """ get the sequence for the current annotation object diff --git a/src/mavis/annotate/file_io.py b/src/mavis/annotate/file_io.py index 9e6c92c2..4b6ba264 100644 --- a/src/mavis/annotate/file_io.py +++ b/src/mavis/annotate/file_io.py @@ -8,11 +8,11 @@ import pandas as pd from Bio import SeqIO -from Bio.SeqRecord import SeqRecord from snakemake.utils import validate as snakemake_validate from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, translate from ..interval import Interval +from ..types import ReferenceGenome from ..util import logger from .base import BioInterval, ReferenceName from .genomic import Exon, Gene, PreTranscript, Template, Transcript @@ -60,7 +60,7 @@ def load_masking_regions(*filepaths: str) -> Dict[str, List[BioInterval]]: def load_annotations( *filepaths: str, - reference_genome: Optional[Dict[str, SeqRecord]] = None, + reference_genome: Optional[ReferenceGenome] = None, best_transcripts_only: bool = False, ) -> Dict[str, List[Gene]]: """ @@ -95,7 +95,7 @@ def load_annotations( def parse_annotations_json( data, - reference_genome: Optional[Dict[str, SeqRecord]] = None, + reference_genome: Optional[ReferenceGenome] = None, best_transcripts_only=False, ) -> Dict[str, List[Gene]]: """ @@ -202,7 +202,7 @@ def parse_annotations_json( return genes_by_chr -def load_reference_genome(*filepaths: str) -> Dict[str, SeqRecord]: +def load_reference_genome(*filepaths: str) -> ReferenceGenome: """ Args: filepaths: the paths to the files containing the input fasta genomes diff --git a/src/mavis/annotate/fusion.py b/src/mavis/annotate/fusion.py index e0bf0026..b02e3259 100644 --- a/src/mavis/annotate/fusion.py +++ 
b/src/mavis/annotate/fusion.py @@ -1,9 +1,10 @@ -from .genomic import Exon, Transcript, PreTranscript -from .protein import calculate_orf, Domain, Translation from ..breakpoint import Breakpoint -from ..constants import ORIENT, PRIME, PROTOCOL, reverse_complement, STRAND, SVTYPE +from ..constants import ORIENT, PRIME, PROTOCOL, STRAND, SVTYPE, reverse_complement from ..error import NotSpecifiedError from ..interval import Interval, IntervalMapping +from ..types import ReferenceGenome +from .genomic import Exon, PreTranscript, Transcript +from .protein import Domain, Translation, calculate_orf def determine_prime(transcript, breakpoint): @@ -87,7 +88,12 @@ def map_region_to_genome(self, chr, interval_on_fusion, genome_interval, flipped @classmethod def _build_single_gene_inversion( - cls, ann, reference_genome, min_orf_size, max_orf_cap, min_domain_mapping_match + cls, + ann, + reference_genome: ReferenceGenome, + min_orf_size, + max_orf_cap, + min_domain_mapping_match, ): """ builds a fusion transcript for a single gene inversion. 
Note that this is an incomplete @@ -283,7 +289,7 @@ def _build_single_gene_duplication( def build( cls, ann, - reference_genome, + reference_genome: ReferenceGenome, min_orf_size=None, max_orf_cap=None, min_domain_mapping_match=None, @@ -291,7 +297,7 @@ def build( """ Args: ann (Annotation): the annotation object we want to build a FusionTranscript for - reference_genome (Dict[str,Bio.SeqRecord]): dict of reference sequence + reference_genome: dict of reference sequence by template/chr name Returns: diff --git a/src/mavis/annotate/genomic.py b/src/mavis/annotate/genomic.py index a6d18439..14ca0a7b 100644 --- a/src/mavis/annotate/genomic.py +++ b/src/mavis/annotate/genomic.py @@ -1,10 +1,13 @@ import itertools +from typing import Dict, List, Optional, Tuple from ..constants import ORIENT, STRAND, reverse_complement from ..error import NotSpecifiedError from ..interval import Interval +from ..types import ReferenceGenome from .base import BioInterval, ReferenceName from .constants import SPLICE_SITE_TYPE +from .protein import Translation from .splicing import SpliceSite, SplicingPattern @@ -29,12 +32,12 @@ def __hash__(self): class IntergenicRegion(BioInterval): - def __init__(self, chr, start, end, strand): + def __init__(self, chr: str, start: int, end: int, strand: str): """ Args: - chr (str): the reference object/chromosome for this region - start (int): the start of the IntergenicRegion - end (int): the end of the IntergenicRegion + chr: the reference object/chromosome for this region + start: the start of the IntergenicRegion + end: the end of the IntergenicRegion strand (STRAND): the strand the region is defined on Example: @@ -65,14 +68,23 @@ def to_dict(self): class Gene(BioInterval): """ """ - def __init__(self, chr, start, end, name=None, strand=STRAND.NS, aliases=None, seq=None): + def __init__( + self, + chr: str, + start: int, + end: int, + name: Optional[str] = None, + strand: str = STRAND.NS, + aliases: Optional[List[str]] = None, + seq: Optional[str] 
= None, + ): """ Args: - chr (str): the chromosome - name (str): the gene name/id i.e. ENSG0001 + chr: the chromosome + name: the gene name/id i.e. ENSG0001 strand (STRAND): the genomic strand '+' or '-' - aliases (List[str]): a list of aliases. For example the hugo name could go here - seq (str): genomic seq of the gene + aliases: a list of aliases. For example the hugo name could go here + seq: genomic seq of the gene Example: >>> Gene('X', 1, 1000, 'ENG0001', '+', ['KRAS']) """ @@ -102,13 +114,13 @@ def sort_key(t): raise ValueError('input transcript is not associated with this gene', transcript) @property - def transcripts(self): - """List[PreTranscript] list of unspliced transcripts""" + def transcripts(self) -> List['PreTranscript']: + """list of unspliced transcripts""" return self.unspliced_transcripts @property - def translations(self): - """List[mavis.annotate.protein.Translation] list of translations""" + def translations(self) -> List[Translation]: + """list of translations""" translations = [] for pre_transcript in self.unspliced_transcripts: for tx in pre_transcript.transcripts: @@ -125,14 +137,13 @@ def key(self): """see :func:`structural_variant.annotate.base.BioInterval.key`""" return BioInterval.key(self), self.strand - def get_seq(self, reference_genome, ignore_cache=False): + def get_seq(self, reference_genome: ReferenceGenome, ignore_cache: bool = False) -> str: """ gene sequence is always given wrt to the positive forward strand regardless of gene strand Args: - reference_genome (Dict[str,Bio.SeqRecord]): dict of reference sequence by - template/chr name - ignore_cache (bool): if True then stored sequences will be ignored and the function will attempt to retrieve the sequence using the positions and the input reference_genome + reference_genome: dict of reference sequence by template/chr name + ignore_cache: if True then stored sequences will be ignored and the function will attempt to retrieve the sequence using the positions and the input 
reference_genome Returns: str: the sequence of the gene @@ -145,8 +156,8 @@ def get_seq(self, reference_genome, ignore_cache=False): return str(reference_genome[self.chr].seq[self.start - 1 : self.end]).upper() @property - def spliced_transcripts(self): - """List[Transcript]: list of transcripts""" + def spliced_transcripts(self) -> List['Transcript']: + """list of transcripts""" spl = [] for t in self.unspliced_transcripts: spl.extend(t.spliced_transcripts) @@ -164,23 +175,23 @@ class Exon(BioInterval): def __init__( self, - start, - end, - transcript=None, - name=None, - intact_start_splice=True, - intact_end_splice=True, - seq=None, - strand=None, + start: int, + end: int, + transcript: Optional['PreTranscript'] = None, + name: Optional[str] = None, + intact_start_splice: bool = True, + intact_end_splice: bool = True, + seq: Optional[str] = None, + strand: Optional[str] = None, ): """ Args: - start (int): the genomic start position - end (int): the genomic end position - name (str): the name of the exon - transcript (PreTranscript): the 'parent' transcript this exon belongs to - intact_start_splice (bool): if the starting splice site has been abrogated - intact_end_splice (bool): if the end splice site has been abrogated + start: the genomic start position + end: the genomic end position + name: the name of the exon + transcript: the 'parent' transcript this exon belongs to + intact_start_splice: if the starting splice site has been abrogated + intact_end_splice: if the end splice site has been abrogated Raises: AttributeError: if the exon start > the exon end Example: @@ -233,32 +244,32 @@ def transcript(self): return self.reference_object @property - def donor_splice_site(self): - """mavis.interval.Interval: the genomic range describing the splice site""" + def donor_splice_site(self) -> Interval: + """the genomic range describing the splice site""" if self.is_reverse: return self.start_splice_site else: return self.end_splice_site @property - def 
acceptor_splice_site(self): - """mavis.interval.Interval: the genomic range describing the splice site""" + def acceptor_splice_site(self) -> Interval: + """the genomic range describing the splice site""" if self.is_reverse: return self.end_splice_site else: return self.start_splice_site @property - def donor(self): - """`int`: returns the genomic exonic position of the donor splice site""" + def donor(self) -> int: + """returns the genomic exonic position of the donor splice site""" if self.is_reverse: return self.start else: return self.end @property - def acceptor(self): - """`int`: returns the genomic exonic position of the acceptor splice site""" + def acceptor(self) -> int: + """returns the genomic exonic position of the acceptor splice site""" if self.is_reverse: return self.end else: @@ -278,24 +289,24 @@ class PreTranscript(BioInterval): def __init__( self, - exons, - gene=None, - name=None, - strand=None, - spliced_transcripts=None, - seq=None, - is_best_transcript=False, + exons: List[Exon], + gene: Optional[Gene] = None, + name: Optional[str] = None, + strand: Optional[str] = None, + spliced_transcripts: Optional[List['Transcript']] = None, + seq: Optional[str] = None, + is_best_transcript: bool = False, ): """creates a new transcript object Args: - exons (List[Exon]): list of Exon that make up the transcript - genomic_start (int): genomic start position of the transcript - genomic_end (int): genomic end position of the transcript - gene (Gene): the gene this transcript belongs to - name (str): name of the transcript + exons: list of Exon that make up the transcript + genomic_start: genomic start position of the transcript + genomic_end: genomic end position of the transcript + gene: the gene this transcript belongs to + name: name of the transcript strand (STRAND): strand the transcript is on, defaults to the strand of the Gene if not specified - seq (str): unspliced cDNA seq + seq: unspliced cDNA seq """ # cannot use mutable default args in the 
function decl self.exons = exons @@ -332,12 +343,12 @@ def __init__( except AttributeError: pass - def generate_splicing_patterns(self): + def generate_splicing_patterns(self) -> List[SplicingPattern]: """ returns a list of splice sites to be connected as a splicing pattern Returns: - List[SplicingPattern]: List of positions to be spliced together + List of positions to be spliced together Note: see [theory - predicting splicing patterns](/background/theory/#predicting-splicing-patterns) @@ -355,10 +366,12 @@ def gene(self): """Gene: the gene this transcript belongs to""" return self.reference_object - def _genomic_to_cdna_mapping(self, splicing_pattern): + def _genomic_to_cdna_mapping( + self, splicing_pattern: SplicingPattern + ) -> Dict[Interval, Interval]: """ Args: - splicing_pattern (SplicingPattern): list of genomic splice sites 3'5' repeating + splicing_pattern: list of genomic splice sites 3'5' repeating """ mapping = {} length = 1 @@ -377,7 +390,9 @@ def _genomic_to_cdna_mapping(self, splicing_pattern): length += len(exon) return mapping - def _cdna_to_genomic_mapping(self, splicing_pattern): + def _cdna_to_genomic_mapping( + self, splicing_pattern: SplicingPattern + ) -> Dict[Interval, Interval]: """ Args: splicing_pattern (SplicingPattern): list of genomic splice sites 3'5' repeating @@ -385,18 +400,17 @@ def _cdna_to_genomic_mapping(self, splicing_pattern): mapping = {v: k for k, v in self._genomic_to_cdna_mapping(splicing_pattern).items()} return mapping - def convert_genomic_to_cdna(self, pos, splicing_pattern): + def convert_genomic_to_cdna(self, pos: int, splicing_pattern: SplicingPattern) -> int: """ Args: - pos (int): the genomic position to be converted - splicing_pattern (SplicingPattern): list of genomic splice sites 3'5' repeating + pos: the genomic position to be converted + splicing_pattern: list of genomic splice sites 3'5' repeating Returns: - int: the cdna equivalent + the cdna equivalent Raises: - mavis.error.IndexError: when a genomic 
position not present in the - cdna is attempted to be converted + mavis.error.IndexError: when a genomic position not present in the cdna is attempted to be converted """ cdna_pos, shift = self.convert_genomic_to_nearest_cdna(pos, splicing_pattern) if shift != 0: @@ -404,17 +418,17 @@ def convert_genomic_to_cdna(self, pos, splicing_pattern): return cdna_pos def convert_genomic_to_nearest_cdna( - self, pos, splicing_pattern, stick_direction=None, allow_outside=True - ): + self, pos: int, splicing_pattern: SplicingPattern, stick_direction=None, allow_outside=True + ) -> Tuple[int, int]: """ converts a genomic position to its cdna equivalent or (if intronic) the nearest cdna and shift Args: - pos (int): the genomic position - splicing_pattern (SplicingPattern): the splicing pattern + pos: the genomic position + splicing_pattern: the splicing pattern Returns: - Tuple[int,int]: the exonic cdna position and the intronic shift + the exonic cdna position and the intronic shift """ mapping = self._genomic_to_cdna_mapping(splicing_pattern) @@ -459,11 +473,11 @@ def convert_genomic_to_nearest_cdna( raise NotImplementedError('Unexpected error', self.exons, pos) raise IndexError('position does not fall within the current transcript', pos, mapping) - def convert_cdna_to_genomic(self, pos, splicing_pattern): + def convert_cdna_to_genomic(self, pos: int, splicing_pattern: SplicingPattern): """ Args: - pos (int): cdna position - splicing_pattern (SplicingPattern): list of genomic splice sites 3'5' repeating + pos: cdna position + splicing_pattern: list of genomic splice sites 3'5' repeating Returns: int: the genomic equivalent @@ -484,15 +498,15 @@ def convert_cdna_to_genomic(self, pos, splicing_pattern): mapping, pos, True if self.get_strand() == STRAND.NEG else False ) - def exon_number(self, exon): + def exon_number(self, exon: Exon) -> int: """ exon numbering is based on the direction of translation Args: - exon (Exon): the exon to be numbered + exon: the exon to be numbered 
Returns: - int: the exon number (1 based) + the exon number (1 based) Raises: AttributeError: if the strand is not given or the exon does not belong to the transcript @@ -508,15 +522,16 @@ def exon_number(self, exon): raise NotSpecifiedError('strand must be pos or neg to calculate the exon number') raise AttributeError('can only calculate phase on associated exons') - def get_seq(self, reference_genome=None, ignore_cache=False): + def get_seq( + self, reference_genome: Optional[ReferenceGenome] = None, ignore_cache: bool = False + ) -> str: """ Args: - reference_genome (Dict[str,Bio.SeqRecord]): dict of reference sequence - by template/chr name - ignore_cache (bool): if True then stored sequences will be ignored and the function will attempt to retrieve the sequence using the positions and the input reference_genome + reference_genome: dict of reference sequence by template/chr name + ignore_cache: if True then stored sequences will be ignored and the function will attempt to retrieve the sequence using the positions and the input reference_genome Returns: - str: the sequence of the transcript including introns (but relative to strand) + the sequence of the transcript including introns (but relative to strand) """ if self.seq and not ignore_cache: return self.seq @@ -535,16 +550,20 @@ def get_seq(self, reference_genome=None, ignore_cache=False): ).upper() return str(reference_genome[self.gene.chr].seq[self.start - 1 : self.end]).upper() - def get_cdna_seq(self, splicing_pattern, reference_genome=None, ignore_cache=False): + def get_cdna_seq( + self, + splicing_pattern: SplicingPattern, + reference_genome: Optional[ReferenceGenome] = None, + ignore_cache: bool = False, + ) -> str: """ Args: - splicing_pattern (SplicingPattern): the list of splicing positions - reference_genome (Dict[str,Bio.SeqRecord]): dict of reference sequence - by template/chr name - ignore_cache (bool): if True then stored sequences will be ignored and the function will attempt to retrieve the 
sequence using the positions and the input reference_genome + splicing_pattern: the list of splicing positions + reference_genom: dict of reference sequence by template/chr name + ignore_cache: if True then stored sequences will be ignored and the function will attempt to retrieve the sequence using the positions and the input reference_genome Returns: - str: the spliced cDNA sequence + the spliced cDNA sequence """ temp = sorted([self.start] + [s.pos for s in splicing_pattern] + [self.end]) cdna_start = min(temp) @@ -560,8 +579,8 @@ def get_cdna_seq(self, splicing_pattern, reference_genome=None, ignore_cache=Fal return spliced_seq if self.get_strand() == STRAND.POS else reverse_complement(spliced_seq) @property - def translations(self): - """List[mavis.annotate.protein.Translation]: list of translations associated with this transcript""" + def translations(self) -> List[Translation]: + """list of translations associated with this transcript""" translations = [] for spl_tx in self.spliced_transcripts: for translation in spl_tx.translations: @@ -569,28 +588,35 @@ def translations(self): return translations @property - def transcripts(self): - """List[Transcript]: list of spliced transcripts""" + def transcripts(self) -> List['Transcript']: + """list of spliced transcripts""" return self.spliced_transcripts class Transcript(BioInterval): - def __init__(self, pre_transcript, splicing_patt, seq=None, translations=None): + reference_object: PreTranscript + + def __init__( + self, + pre_transcript: PreTranscript, + splicing_patt: List[int], + seq: Optional[str] = None, + translations: Optional[List[Translation]] = None, + ): """ splicing pattern is given in genomic coordinates Args: - pre_transcript (PreTranscript): the unspliced transcript - splicing_patt (List[int]): the list of splicing positions - seq (str): the cdna sequence - translations (List[mavis.annotate.protein.Translation]): - the list of translations of this transcript + pre_transcript: the unspliced 
transcript + splicing_patt: the list of splicing positions + seq: the cdna sequence + translations: the list of translations of this transcript """ pos = sorted([pre_transcript.start, pre_transcript.end] + [s.pos for s in splicing_patt]) splicing_patt.sort() self.splicing_pattern = splicing_patt length = sum([t - s + 1 for s, t in zip(pos[::2], pos[1::2])]) - BioInterval.__init__(self, pre_transcript, 1, length, seq=None) + BioInterval.__init__(self, pre_transcript, 1, length, seq=seq) self.exons = [Exon(s, t, self) for s, t in zip(pos[::2], pos[1::2])] self.translations = [] if translations is None else [tx for tx in translations] @@ -606,13 +632,13 @@ def __init__(self, pre_transcript, splicing_patt, seq=None, translations=None): elif len(splicing_patt) % 2 != 0: raise AssertionError('splicing pattern must be a list of 3\'5\' splicing positions') - def convert_genomic_to_cdna(self, pos): + def convert_genomic_to_cdna(self, pos: int) -> int: """ Args: - pos (int): the genomic position to be converted + pos: the genomic position to be converted Returns: - int: the cdna equivalent + the cdna equivalent Raises: IndexError: when a genomic position not present in the cdna is attempted to be converted @@ -624,25 +650,26 @@ def convert_genomic_to_nearest_cdna(self, pos, **kwargs): pos, self.splicing_pattern, **kwargs ) - def convert_cdna_to_genomic(self, pos): + def convert_cdna_to_genomic(self, pos: int) -> int: """ Args: - pos (int): cdna position + pos: cdna position Returns: - int: the genomic equivalent + the genomic equivalent """ return self.unspliced_transcript.convert_cdna_to_genomic(pos, self.splicing_pattern) - def get_seq(self, reference_genome=None, ignore_cache=False): + def get_seq( + self, reference_genome: Optional[ReferenceGenome] = None, ignore_cache: bool = False + ) -> str: """ Args: - reference_genome (Dict[str,Bio.SeqRecord]): dict of reference sequence by - template/chr name - ignore_cache (bool): if True then stored sequences will be ignored and 
the function will attempt to retrieve the sequence using the positions and the input reference_genome + reference_genome: dict of reference sequence by template/chr name + ignore_cache: if True then stored sequences will be ignored and the function will attempt to retrieve the sequence using the positions and the input reference_genome Returns: - str: the sequence corresponding to the spliced cdna + the sequence corresponding to the spliced cdna """ if self.seq and not ignore_cache: return self.seq @@ -652,6 +679,6 @@ def get_seq(self, reference_genome=None, ignore_cache=False): return seq[self.start - 1 : self.end] @property - def unspliced_transcript(self): - """PreTranscript: the unspliced transcript this splice variant belongs to""" + def unspliced_transcript(self) -> PreTranscript: + """the unspliced transcript this splice variant belongs to""" return self.reference_object diff --git a/src/mavis/annotate/protein.py b/src/mavis/annotate/protein.py index acf5172d..50d31f33 100644 --- a/src/mavis/annotate/protein.py +++ b/src/mavis/annotate/protein.py @@ -1,20 +1,27 @@ import itertools +from typing import TYPE_CHECKING, List, Optional, Tuple, Union -from .base import BioInterval from ..constants import CODON_SIZE, START_AA, STOP_AA, translate from ..error import NotSpecifiedError from ..interval import Interval +from ..types import ReferenceGenome +from .base import BioInterval +if TYPE_CHECKING: + from .genomic import Transcript -def calculate_orf(spliced_cdna_sequence, min_orf_size=None): + +def calculate_orf( + spliced_cdna_sequence: str, min_orf_size: Optional[Union[float, int]] = None +) -> List[Interval]: """ calculate all possible open reading frames given a spliced cdna sequence (no introns) Args: - spliced_cdna_sequence (str): the sequence + spliced_cdna_sequence: the sequence Returns: - List[Interval]: list of open reading frame positions on the input sequence + list of open reading frame positions on the input sequence """ # do not revcomp assert 
START_AA != STOP_AA @@ -48,16 +55,22 @@ def __init__(self, start, end, seq=None, domain=None, name=None): class Domain: - def __init__(self, name, regions, translation=None, data=None): + def __init__( + self, + name: str, + regions: List[DomainRegion], + translation: Optional['Translation'] = None, + data=None, + ): """ Args: - name (str): the name of the domain i.e. PF00876 - regions (List[DomainRegion]): the amino acid ranges that are part of the domain - transcript (Transcript): the 'parent' transcript this domain belongs to + name: the name of the domain i.e. PF00876 + regions: the amino acid ranges that are part of the domain + translation: the 'parent' translation this domain belongs to Raises: AttributeError: if the end of any region is less than the start Example: - >>> Domain('DNA binding domain', [(1, 4), (10, 24)], transcript) + >>> Domain('DNA binding domain', [(1, 4), (10, 24)], translation) """ self.reference_object = translation self.name = name @@ -77,28 +90,27 @@ def __init__(self, name, regions, translation=None, data=None): self.regions[i] = DomainRegion(curr[0], curr[1]) @property - def translation(self): - """mavis.annotate.Translation: the Translation this domain belongs to""" + def translation(self) -> Optional['Translation']: + """the Translation this domain belongs to""" return self.reference_object def key(self): """Tuple: a tuple representing the items expected to be unique. 
for hashing and comparing""" return tuple([self.name, self.translation]) - def score_region_mapping(self, reference_genome=None): + def score_region_mapping( + self, reference_genome: Optional[ReferenceGenome] = None + ) -> Tuple[int, int]: """ compares the sequence in each DomainRegion to the sequence collected for that domain region from the translation object Args: - reference_genome (Dict[str,Bio.SeqRecord]): dict of reference sequence - by template/chr name + reference_genome: dict of reference sequence by template/chr name Returns: - tuple of int and int: tuple contains - - - int: the number of matching amino acids - - int: the total number of amino acids + - int: the number of matching amino acids + - int: the total number of amino acids """ if self.translation: aa_seq = self.translation.get_aa_seq(reference_genome) @@ -116,17 +128,18 @@ def score_region_mapping(self, reference_genome=None): else: raise NotSpecifiedError('insufficient sequence information') - def get_seqs(self, reference_genome=None, ignore_cache=False): + def get_seqs( + self, reference_genome: ReferenceGenome = None, ignore_cache: bool = False + ) -> List[str]: """ returns the amino acid sequences for each of the domain regions associated with this domain in the order of the regions (sorted by start) Args: - reference_genome (Dict[str,Bio.SeqRecord]): dict of reference sequence - by template/chr name + reference_genome: dict of reference sequence by template/chr name Returns: - List[str]: list of amino acid sequences for each DomainRegion + list of amino acid sequences for each DomainRegion Raises: AttributeError: if there is not enough sequence information given to determine this @@ -147,7 +160,12 @@ def get_seqs(self, reference_genome=None, ignore_cache=False): raise NotSpecifiedError('insufficient sequence information') return [sequences[r] for r in self.regions] - def align_seq(self, input_sequence, reference_genome=None, min_region_match=0.5): + def align_seq( + self, + 
input_sequence: str, + reference_genome: Optional[ReferenceGenome] = None, + min_region_match: float = 0.5, + ) -> Tuple[int, int, List[DomainRegion]]: """ align each region to the input sequence starting with the last one. then take the subset of sequence that remains to align the second last and so on @@ -155,16 +173,14 @@ def align_seq(self, input_sequence, reference_genome=None, min_region_match=0.5) then raise an error Args: - input_sequence (str): the sequence to be aligned to - reference_genome (Dict[str,Bio.SeqRecord]): dict of reference sequence - by template/chr name - min_region_match (float): percent between 0 and 1. Each region must have a score len(seq) * min_region_match + input_sequence: the sequence to be aligned to + reference_genome: dict of reference sequence by template/chr name + min_region_match: percent between 0 and 1. Each region must have a score len(seq) * min_region_match Returns: - Tuple[int,int,List[DomainRegion]]: - - the number of matches - - the total number of amino acids to be aligned - - the list of domain regions on the new input sequence + - the number of matches + - the total number of amino acids to be aligned + - the list of domain regions on the new input sequence Raises: AttributeError: if sequence information is not available @@ -234,7 +250,15 @@ def align_seq(self, input_sequence, reference_genome=None, min_region_match=0.5) class Translation(BioInterval): - def __init__(self, start, end, transcript=None, domains=None, seq=None, name=None): + def __init__( + self, + start: int, + end: int, + transcript: Optional['Transcript'] = None, + domains: Optional[List[Domain]] = None, + seq=None, + name=None, + ): """ describes the splicing pattern and cds start and end with reference to a particular transcript @@ -262,27 +286,27 @@ def __init__(self, start, end, transcript=None, domains=None, seq=None, name=Non domain.reference_object = self @property - def transcript(self): - """mavis.annotate.genomic.Transcript: the spliced 
transcript this translation belongs to""" + def transcript(self) -> 'Transcript': + """the spliced transcript this translation belongs to""" return self.reference_object - def convert_aa_to_cdna(self, pos): + def convert_aa_to_cdna(self, pos: int) -> Interval: """ Args: - pos (int): the amino acid position + pos: the amino acid position Returns: Interval: the cdna equivalent (with CODON_SIZE uncertainty) """ return Interval(self.start - 1 + (pos - 1) * 3 + 1, self.start - 1 + pos * 3) - def convert_cdna_to_aa(self, pos): + def convert_cdna_to_aa(self, pos: int) -> int: """ Args: - pos (int): the cdna position + pos: the cdna position Returns: - int: the protein/amino-acid position + the protein/amino-acid position Raises: AttributeError: the cdna position is not translated @@ -295,32 +319,31 @@ def convert_cdna_to_aa(self, pos): aa_pos += 1 return aa_pos - def convert_genomic_to_cds(self, pos): + def convert_genomic_to_cds(self, pos: int) -> int: """ converts a genomic position to its cds (coding sequence) equivalent Args: - pos (int): the genomic position + pos: the genomic position Returns: - int: the cds position (negative if before the initiation start site) + the cds position (negative if before the initiation start site) """ cds, shift = self.convert_genomic_to_nearest_cds(pos) if shift != 0: raise IndexError('conversion failed. 
position is outside the exonic region') return cds - def convert_genomic_to_nearest_cds(self, pos): + def convert_genomic_to_nearest_cds(self, pos: str) -> Tuple[int, int]: """ converts a genomic position to its cds equivalent or (if intronic) the nearest cds and shift Args: - pos (int): the genomic position + pos: the genomic position Returns: - tuple of int and int: - * *int* - the cds position - * *int* - the intronic shift + - the cds position + - the intronic shift """ cds_pos, shift = self.transcript.convert_genomic_to_nearest_cdna(pos) @@ -330,16 +353,16 @@ def convert_genomic_to_nearest_cds(self, pos): cds_pos -= self.start return cds_pos, shift - def convert_genomic_to_cds_notation(self, pos): + def convert_genomic_to_cds_notation(self, pos: int) -> str: """ converts a genomic position to its cds (coding sequence) equivalent using `hgvs `_ cds notation Args: - pos (int): the genomic position + pos: the genomic position Returns: - str: the cds position notation + the cds position notation Example: >>> tl = Translation(...) 
@@ -366,14 +389,15 @@ def convert_genomic_to_cds_notation(self, pos): return '*{}{}'.format(cds_pos - len(self), offset_suffix) return '{}{}'.format(cds_pos, offset_suffix) - def get_cds_seq(self, reference_genome=None, ignore_cache=False): + def get_cds_seq( + self, reference_genome: Optional[ReferenceGenome] = None, ignore_cache: bool = False + ) -> str: """ Args: - reference_genome (Dict[str,Bio.SeqRecord]): dict of reference sequence - by template/chr name + reference_genome: dict of reference sequence by template/chr name Returns: - str: the cds sequence + the cds sequence Raises: AttributeError: if the reference sequence has not been given and is not set @@ -385,24 +409,26 @@ def get_cds_seq(self, reference_genome=None, ignore_cache=False): return seq[self.start - 1 : self.end] raise NotSpecifiedError('insufficient seq information') - def get_seq(self, reference_genome=None, ignore_cache=False): + def get_seq( + self, reference_genome: Optional[ReferenceGenome] = None, ignore_cache: bool = False + ): """ wrapper for the sequence method Args: - reference_genome (Dict[str,Bio.SeqRecord]): dict of reference sequence - by template/chr name + reference_genome: dict of reference sequence by template/chr name """ return self.get_cds_seq(reference_genome, ignore_cache) - def get_aa_seq(self, reference_genome=None, ignore_cache=False): + def get_aa_seq( + self, reference_genome: Optional[ReferenceGenome] = None, ignore_cache: bool = False + ) -> str: """ Args: - reference_genome (Dict[str,Bio.SeqRecord]): dict of reference sequence - by template/chr name + reference_genome: dict of reference sequence by template/chr name Returns: - str: the amino acid sequence + the amino acid sequence Raises: AttributeError: if the reference sequence has not been given and is not set diff --git a/src/mavis/annotate/splicing.py b/src/mavis/annotate/splicing.py index ae9d4ef4..b2160ffd 100644 --- a/src/mavis/annotate/splicing.py +++ b/src/mavis/annotate/splicing.py @@ -1,4 +1,5 @@ 
import itertools +from typing import Iterable, List, Optional from ..constants import SPLICE_TYPE, STRAND, reverse_complement from ..interval import Interval @@ -7,7 +8,7 @@ class SplicingPattern(list): - def __init__(self, *args, splice_type=SPLICE_TYPE.NORMAL): + def __init__(self, *args, splice_type: str = SPLICE_TYPE.NORMAL): list.__init__(self, *args) self.splice_type = splice_type @@ -24,7 +25,7 @@ def __str__(self): return '[{}]'.format(', '.join(temp)) @classmethod - def classify(cls, pattern, original_sites): + def classify(cls, pattern: List[int], original_sites: Iterable[int]) -> str: # now need to decide the type for each set pattern = sorted(pattern) r_introns = [] @@ -80,12 +81,14 @@ def classify(cls, pattern, original_sites): return SPLICE_TYPE.COMPLEX @classmethod - def generate_patterns(cls, sites, is_reverse=False): + def generate_patterns( + cls, sites: Iterable['SpliceSite'], is_reverse=False + ) -> List['SplicingPattern']: """ returns a list of splice sites to be connected as a splicing pattern Returns: - List[SplicingPattern]: List of positions to be spliced together + List of positions to be spliced together Note: see [theory - predicting splicing patterns](/background/theory/#predicting-splicing-patterns) @@ -115,7 +118,15 @@ def generate_patterns(cls, sites, is_reverse=False): class SpliceSite(BioInterval): def __init__( - self, ref, pos, site_type, intact=True, start=None, end=None, strand=None, seq=None + self, + ref: BioInterval, + pos: int, + site_type: str, + intact: bool = True, + start: Optional[int] = None, + end: Optional[int] = None, + strand: Optional[str] = None, + seq: Optional[str] = None, ): if start is None or end is None: self.strand = strand if strand else ref.get_strand() @@ -170,17 +181,17 @@ def __repr__(self): ) -def predict_splice_sites(input_sequence, is_reverse=False): +def predict_splice_sites(input_sequence: str, is_reverse: bool = False) -> List[SpliceSite]: """ looks for the expected splice site sequence 
patterns in the input strings and returns a list of putative splice sites Args: - input_sequence (str): input sequence with respect to the positive/forward strand - is_reverse (bool): True when the sequences is transcribed on the reverse strand + input_sequence: input sequence with respect to the positive/forward strand + is_reverse: True when the sequences is transcribed on the reverse strand Return: - List[SpliceSite]: list of putative splice sites + list of putative splice sites """ if is_reverse: sequence = reverse_complement(input_sequence) diff --git a/src/mavis/annotate/variant.py b/src/mavis/annotate/variant.py index 7eaf2149..bfa9e3a0 100644 --- a/src/mavis/annotate/variant.py +++ b/src/mavis/annotate/variant.py @@ -8,9 +8,11 @@ from ..constants import COLUMNS, GENE_PRODUCT_TYPE, PROTOCOL, STOP_AA, STRAND, SVTYPE from ..error import NotSpecifiedError from ..interval import Interval +from ..types import ReferenceGenome from ..util import logger from .fusion import FusionTranscript, determine_prime from .genomic import Gene, IntergenicRegion, PreTranscript, Transcript +from .protein import Translation class Annotation(BreakpointPair): @@ -280,24 +282,33 @@ def flatten_fusion_translation(translation): class IndelCall: - def __init__(self, refseq, mutseq): + nterm_aligned: int + cterm_aligned: int + ref_seq: str + mut_seq: str + ins_seq: str + del_seq: str + is_dup: bool + terminates: bool + + def __init__(self, refseq: str, mutseq: str): """ Given two sequences, Assuming there exists a single difference between the two call an indel which accounts for the change Args: - refseq (str): The reference (amino acid) sequence - mutseq (str): The mutated (amino acid) sequence + refseq: The reference (amino acid) sequence + mutseq: The mutated (amino acid) sequence Attributes: - nterm_aligned (int): the number of characters aligned consecutively from the start of both strings - cterm_aligned (int): the number of characters aligned consecutively from the end of both 
strings - is_dup (bool): flag to indicate a duplication - ref_seq (str): the reference sequence - mut_seq (str): the mutated sequence - ins_seq (str): the inserted sequence - del_seq (str): the deleted sequence - terminates (bool): both sequences end in stop AAs + nterm_aligned: the number of characters aligned consecutively from the start of both strings + cterm_aligned: the number of characters aligned consecutively from the end of both strings + is_dup: flag to indicate a duplication + ref_seq: the reference sequence + mut_seq: the mutated sequence + ins_seq: the inserted sequence + del_seq: the deleted sequence + terminates: both sequences end in stop AAs """ self.nterm_aligned = 0 self.cterm_aligned = 0 @@ -379,7 +390,7 @@ def __init__(self, refseq, mutseq): self.del_seq = self.ref_seq[self.nterm_aligned : 0 - self.cterm_aligned] self.ins_seq = self.mut_seq[self.nterm_aligned : 0 - self.cterm_aligned] - def hgvs_protein_notation(self): + def hgvs_protein_notation(self) -> Optional[str]: """ returns the HGVS protein notation for an indel call """ @@ -454,17 +465,21 @@ def __str__(self): ) -def call_protein_indel(ref_translation, fusion_translation, reference_genome=None): +def call_protein_indel( + ref_translation: Translation, + fusion_translation: Translation, + reference_genome: Optional[ReferenceGenome] = None, +) -> str: """ compare the fusion protein/aa sequence to the reference protein/aa sequence and return an hgvs notation indel call Args: - ref_translation (Translation): the reference protein/translation - fusion_translation (Translation): the fusion protein/translation + ref_translation: the reference protein/translation + fusion_translation: the fusion protein/translation reference_genome: the reference genome object used to fetch the reference translation AA sequence Returns: - str: the [HGVS](/glossary/#HGVS) protein indel notation + the [HGVS](/glossary/#HGVS) protein indel notation """ ref_aa_seq = ref_translation.get_aa_seq(reference_genome) 
call = IndelCall(ref_aa_seq, fusion_translation.get_aa_seq()) @@ -519,14 +534,14 @@ def flatten_fusion_transcript(spliced_fusion_transcript): return row -def overlapping_transcripts(ref_ann, breakpoint): +def overlapping_transcripts(ref_ann, breakpoint: Breakpoint) -> List[PreTranscript]: """ Args: ref_ann (Dict[str,List[Gene]]): the reference list of genes split by chromosome - breakpoint (Breakpoint): the breakpoint in question + breakpoint: the breakpoint in question Returns: - List[PreTranscript]: a list of possible transcripts + a list of possible transcripts """ putative_annotations = set() for gene in ref_ann.get(breakpoint.chr, []): @@ -636,7 +651,9 @@ def _gather_breakpoint_annotations( ) -def _gather_annotations(ref: Dict[str, List[Gene]], bp: BreakpointPair, proximity=None): +def _gather_annotations( + ref: Dict[str, List[Gene]], bp: BreakpointPair, proximity=None +) -> List[Annotation]: """ each annotation is defined by the annotations selected at the breakpoints the other annotations are given relative to this @@ -647,7 +664,7 @@ def _gather_annotations(ref: Dict[str, List[Gene]], bp: BreakpointPair, proximit breakpoint_pairs: breakpoint pair we wish to annotate as events Returns: - List[Annotation]: The annotations + The annotations """ annotations = dict() break1_pos, break1_neg = _gather_breakpoint_annotations(ref, bp.break1) @@ -781,21 +798,21 @@ def choose_more_annotated(ann_list: List[Annotation]) -> List[Annotation]: return intergenic -def choose_transcripts_by_priority(ann_list: List[Annotation]): +def choose_transcripts_by_priority(ann_list: List[Annotation]) -> List[Annotation]: """ for each set of annotations with the same combinations of genes, choose the annotation with the most "best_transcripts" or most "alphanumeric" choices of transcript. 
Throw an error if they are identical Args: - ann_list (List[Annotation]): input annotations + ann_list: input annotations Warning: input annotations are assumed to be the same event (the same validation_id) the logic used would not apply to different events Returns: - List[Annotation]: the filtered list + the filtered list """ annotations_by_gene_combination: Dict[ Tuple[Optional[Gene], Optional[Gene]], List[Annotation] @@ -845,7 +862,7 @@ def choose_transcripts_by_priority(ann_list: List[Annotation]): def annotate_events( bpps: List[BreakpointPair], annotations: Dict[str, List[Gene]], - reference_genome: Dict[str, str], + reference_genome: ReferenceGenome, max_proximity: int = 5000, min_orf_size: int = 200, min_domain_mapping_match: float = 0.95, @@ -854,18 +871,17 @@ def annotate_events( ) -> List[Annotation]: """ Args: - bpps (List[mavis.breakpoint.BreakpointPair]): list of events + bpps: list of events annotations: reference annotations - reference_genome (Dict[string,string]): dictionary of reference sequences by name - max_proximity (int): see [max_proximity](/configuration/settings/#max_proximity) - min_orf_size (int): see [min_orf_size](/configuration/settings/#min_orf_size) + reference_genome: dictionary of reference sequences by name + max_proximity: see [max_proximity](/configuration/settings/#max_proximity) + min_orf_size: see [min_orf_size](/configuration/settings/#min_orf_size) min_domain_mapping_match (float): see [min_domain_mapping_match](/configuration/settings/#min_domain_mapping_match) - max_orf_cap (int): see [max_orf_cap](/configuration/settings/#max_orf_cap) - log (Callable): callable function to take in strings and time_stamp args - filters (List[callable]): list of functions taking in a list and returning a list for filtering + max_orf_cap: see [max_orf_cap](/configuration/settings/#max_orf_cap) + filters: list of functions taking in a list and returning a list for filtering Returns: - List[Annotation]: list of the putative annotations + 
list of the putative annotations """ if filters is None: filters = [choose_more_annotated, choose_transcripts_by_priority] diff --git a/src/mavis/assemble.py b/src/mavis/assemble.py index c5654924..c87b4208 100644 --- a/src/mavis/assemble.py +++ b/src/mavis/assemble.py @@ -1,4 +1,5 @@ import itertools +from typing import List, Optional import distance import networkx as nx @@ -27,7 +28,7 @@ def __hash__(self): def complexity(self): return sequence_complexity(self.seq) - def add_mapped_sequence(self, read, multimap=1): + def add_mapped_sequence(self, read, multimap: int = 1): self.remapped_sequences[read] = 1 / multimap def remap_score(self): @@ -40,7 +41,7 @@ def remap_coverage(self): cov = sum([len(i) for i in itvls]) return cov / len(self.seq) - def remap_depth(self, query_range=None): + def remap_depth(self, query_range: Optional[Interval] = None): """ the average depth of remapped reads over a give range of the contig sequence @@ -95,12 +96,12 @@ def add_edge(self, n1, n2, freq=1): def all_edges(self, *nodes, data=False): return self.get_in_edges(*nodes, data=data) + self.get_out_edges(*nodes, data=data) - def trim_tails_by_freq(self, min_weight): + def trim_tails_by_freq(self, min_weight: int): """ for any paths where all edges are lower than the minimum weight trim Args: - min_weight (int): the minimum weight for an edge to be retained + min_weight: the minimum weight for an edge to be retained """ ends = sorted( [n for n in self.get_nodes() if self.out_degree(n) == 0 or self.in_degree(n) == 0] @@ -220,7 +221,7 @@ def get_sources(self, subgraph=None): return nodeset -def digraph_connected_components(graph, subgraph=None): +def digraph_connected_components(graph: nx.DiGraph, subgraph=None) -> List[List]: """ the networkx module does not support deriving connected components from digraphs (only simple graphs) @@ -229,10 +230,10 @@ def digraph_connected_components(graph, subgraph=None): in a simple graph and a digraph Args: - graph (networkx.DiGraph): the input 
graph to gather components from + graph: the input graph to gather components from Returns: - List[List]: returns a list of compnents which are lists of node names + returns a list of compnents which are lists of node names """ if subgraph is None: subgraph = set(graph.get_nodes()) @@ -246,15 +247,17 @@ def digraph_connected_components(graph, subgraph=None): return nx.connected_components(g) -def pull_contigs_from_component(assembly, component, min_edge_trim_weight, assembly_max_paths): +def pull_contigs_from_component( + assembly: DeBruijnGraph, component: List, min_edge_trim_weight: int, assembly_max_paths: int +): """ builds contigs from the a connected component of the assembly DeBruijn graph Args: - assembly (DeBruijnGraph): the assembly graph - component (list): list of nodes which make up the connected component - min_edge_trim_weight (int): the minimum weight to not remove a non cutting edge/path - assembly_max_paths (int): the maximum number of paths allowed before the graph is further simplified + assembly: the assembly graph + component: list of nodes which make up the connected component + min_edge_trim_weight: the minimum weight to not remove a non cutting edge/path + assembly_max_paths: the maximum number of paths allowed before the graph is further simplified Returns: Dict[str,int]: the paths/contigs and their scores @@ -304,7 +307,7 @@ def pull_contigs_from_component(assembly, component, min_edge_trim_weight, assem return path_scores -def filter_contigs(contigs, assembly_min_uniq=0.01): +def filter_contigs(contigs, assembly_min_uniq: float = 0.01): """ given a list of contigs, removes similar contigs to leave the highest (of the similar) scoring contig only """ @@ -339,14 +342,15 @@ def filter_contigs(contigs, assembly_min_uniq=0.01): def assemble( - sequences, - kmer_size, - min_edge_trim_weight=3, - assembly_max_paths=20, - assembly_min_uniq=0.01, - min_complexity=0, + sequences: List[str], + kmer_size: float, + min_edge_trim_weight: int = 3, + 
assembly_max_paths: int = 20, + assembly_min_uniq: float = 0.01, + min_complexity: float = 0, + remap_min_exact_match: int = 6, **kwargs, -): +) -> List[Contig]: """ for a set of sequences creates a DeBruijnGraph simplifies trailing and leading paths where edges fall @@ -355,17 +359,18 @@ def assemble( drops any sequences too small to fit the kmer size Args: - sequences (List[str]): a list of strings/sequences to assemble - kmer_size: see [assembly_kmer_size](/configuration/settings/#assembly_kmer_size) the size of the kmer to use - min_edge_trim_weight: see [assembly_min_edge_trim_weight](/configuration/settings/#assembly_min_edge_trim_weight) + sequences: a list of strings/sequences to assemble + kmer_size: see [assembly_kmer_size](/configuration/settings/#validateassembly_kmer_size) the size of the kmer to use + min_edge_trim_weight: see [assembly_min_edge_trim_weight](/configuration/settings/#validateassembly_min_edge_trim_weight) remap_min_match: Minimum match percentage of the remapped read (based on the exact matches in the cigar) remap_min_overlap: defaults to the kmer size. Minimum amount of overlap between the contig and the remapped read min_contig_length: Minimum length of contigs assemble to attempt remapping reads to. 
Shorter contigs will be ignored - remap_min_exact_match: see [assembly_min_exact_match_to_remap](/configuration/settings/#assembly_min_exact_match_to_remap) - assembly_max_paths: see [assembly_max_paths](/configuration/settings/#assembly_max_paths) + remap_min_exact_match: see [assembly_min_exact_match_to_remap](/configuration/settings/#validateassembly_min_exact_match_to_remap) + assembly_max_paths: see [assembly_max_paths](/configuration/settings/#validateassembly_max_paths) + min_complexity: see [min_call_complexity](/configuration/settings/#validatemin_call_complexity) Returns: - List[Contig]: a list of putative contigs + a list of putative contigs """ if not sequences: return [] @@ -373,7 +378,6 @@ def assemble( kmer_size = int(round(kmer_size, 0)) min_contig_length = kwargs.pop('min_contig_length', min_seq + 1) remap_min_overlap = kwargs.pop('remap_min_overlap', kmer_size) - remap_min_exact_match = kwargs.pop('remap_min_exact_match', 6) remap_min_match = kwargs.pop('remap_min_match', 0.95) if kwargs: diff --git a/src/mavis/bam/cache.py b/src/mavis/bam/cache.py index 866d4003..aee2cc99 100644 --- a/src/mavis/bam/cache.py +++ b/src/mavis/bam/cache.py @@ -1,5 +1,6 @@ import atexit import re +from typing import Callable, Dict, List, Set, Union import pysam @@ -15,24 +16,28 @@ class BamCache: the file if we've already read that section """ - def __init__(self, bamfile, stranded=False): + fh: pysam.AlignmentFile + stranded: bool + cache: Dict + + def __init__(self, bamfile: Union[pysam.AlignmentFile, str], stranded: bool = False): """ Args: - bamfile (str): path to the input bam file + bamfile: path to the input bam file """ self.cache = {} self.stranded = stranded - self.fh = bamfile + if not hasattr(bamfile, 'fetch'): self.fh = pysam.AlignmentFile(bamfile, 'rb') else: try: self.fh = bamfile.fh except AttributeError: - pass + self.fh = bamfile atexit.register(self.close) # makes the file 'auto close' on normal python exit - def valid_chr(self, chrom): + def 
valid_chr(self, chrom: str) -> bool: """ checks if a reference name exists in the bam file header """ @@ -42,10 +47,10 @@ def valid_chr(self, chrom): except KeyError: return False - def add_read(self, read): + def add_read(self, read: pysam.AlignedSegment): """ Args: - read (pysam.AlignedSegment): the read to add to the cache + read: the read to add to the cache """ if not read.is_unmapped and read.reference_start == read.reference_end: _util.logger.debug(f'ignoring invalid read: {read.query_name}') @@ -56,7 +61,7 @@ def add_read(self, read): if read not in self.cache[read.query_name]: self.cache[read.query_name].add(read) - def has_read(self, read): + def has_read(self, read: pysam.AlignedSegment) -> bool: """ checks if a read query name exists in the current cache """ @@ -66,12 +71,12 @@ def has_read(self, read): return True return False - def reference_id(self, chrom): + def reference_id(self, chrom: str) -> int: """ Args: - chrom (str): the chromosome/reference name + chrom: the chromosome/reference name Returns: - int: the reference id corresponding to input chromosome name + the reference id corresponding to input chromosome name """ tid = self.fh.get_tid(chrom) if tid == -1: @@ -82,23 +87,25 @@ def reference_id(self, chrom): raise KeyError('invalid reference name not present in bam file', chrom) return tid - def get_read_reference_name(self, read): + def get_read_reference_name(self, read: pysam.AlignedSegment) -> str: """ Args: - read (pysam.AlignedSegment): the read we want the chromosome name for + read: the read we want the chromosome name for Returns: - str: the name of the chromosome + the name of the chromosome """ return ReferenceName(self.fh.get_reference_name(read.reference_id)) @classmethod - def _generate_fetch_bins(cls, start, stop, sample_bins, min_bin_size): + def _generate_fetch_bins( + cls, start: int, stop: int, sample_bins: int, min_bin_size: int + ) -> List[Interval]: """ Args: - start (int): the start if the area to fetch reads from - 
stop (int): the end of the region - sample_bins (int): the number of bins to split the region into - min_bin_size (int): the minimum bin size + start: the start if the area to fetch reads from + stop: the end of the region + sample_bins: the number of bins to split the region into + min_bin_size: the minimum bin size """ assert min_bin_size > 0 length = stop - start + 1 @@ -120,28 +127,28 @@ def _generate_fetch_bins(cls, start, stop, sample_bins, min_bin_size): def fetch( self, - input_chrom, - start, - stop, - limit=10000, - cache_if=lambda x: True, - filter_if=lambda x: False, - stop_on_cached_read=False, - ): + input_chrom: str, + start: int, + stop: int, + limit: int = 10000, + cache_if: Callable = lambda x: True, + filter_if: Callable = lambda x: False, + stop_on_cached_read: bool = False, + ) -> Set[pysam.AlignedSegment]: """ Args: - input_chrom (str): chromosome name - start (int): start position - end (int): end position - limit (int): maximum number of reads to fetch - cache_if (Callable): if returns True then the read is added to the cache - filter_if (Callable): if returns True then the read is not returned as part of the result - stop_on_cached_read (bool): stop reading at the first read found that is already in the cache + input_chrom: chromosome name + start: start position + end: end position + limit: maximum number of reads to fetch + cache_if: if returns True then the read is added to the cache + filter_if: if returns True then the read is not returned as part of the result + stop_on_cached_read: stop reading at the first read found that is already in the cache Note: the cache_if and filter_if functions must be any function that takes a read as input and returns a boolean Returns: - Set[pysam.AlignedSegment]: a set of reads which overlap the input region + a set of reads which overlap the input region """ # try using the cache to avoid fetching regions more than once result = [] @@ -180,32 +187,32 @@ def fetch( def fetch_from_bins( self, - 
input_chrom, - start, - stop, - read_limit=10000, - cache=False, - sample_bins=3, - cache_if=lambda x: True, - min_bin_size=10, - filter_if=lambda x: False, - ): + input_chrom: str, + start: int, + stop: int, + read_limit: int = 10000, + cache: bool = False, + sample_bins: int = 3, + cache_if: Callable = lambda x: True, + min_bin_size: int = 10, + filter_if: Callable = lambda x: False, + ) -> Set[pysam.AlignedSegment]: """ wrapper around the fetch method, returns a list to avoid errors with changing the file pointer position from within the loop. Also caches reads if requested and can return a limited read number Args: - chrom (str): the chromosome - start (int): the start position - stop (int): the end position - read_limit (int): the maximum number of reads to parse - cache (bool): flag to store reads - sample_bins (int): number of bins to split the region into - cache_if (Callable): function to check to against a read to determine if it should be cached - bin_gap_size (int): gap between the bins for the fetch area + input_chrom: the chromosome + start: the start position + stop: the end position + read_limit: the maximum number of reads to parse + cache: flag to store reads + sample_bins: number of bins to split the region into + cache_if: function to check to against a read to determine if it should be cached + bin_gap_size: gap between the bins for the fetch area Returns: - Set[pysam.AlignedSegment]: set of reads gathered from the region + set of reads gathered from the region """ # try using the cache to make grabbing mate pairs easier result = [] @@ -242,14 +249,16 @@ def fetch_from_bins( running_surplus -= count return set(result) - def get_mate(self, read, primary_only=True, allow_file_access=False): + def get_mate( + self, read: pysam.AlignedSegment, primary_only: bool = True, allow_file_access: bool = False + ) -> List[pysam.AlignedSegment]: """ Args: - read (pysam.AlignedSegment): the read - primary_only (bool): ignore secondary alignments - 
allow_file_access (bool): determines if the bam can be accessed to try to find the mate + read: the read + primary_only: ignore secondary alignments + allow_file_access: determines if the bam can be accessed to try to find the mate Returns: - List[pysam.AlignedSegment]: list of mates of the input read + list of mates of the input read """ # NOTE: will return all mate alignments that have been cached putative_mates = self.cache.get(read.query_name, set()) diff --git a/src/mavis/bam/cigar.py b/src/mavis/bam/cigar.py index 7c003bce..2610bf7f 100644 --- a/src/mavis/bam/cigar.py +++ b/src/mavis/bam/cigar.py @@ -4,7 +4,12 @@ CIGAR value (i.e. 1 for an insertion), and the second value is the frequency """ import re +from typing import Tuple + +import pysam + from ..constants import CIGAR, DNA_ALPHABET, GAP +from ..types import CigarTuples EVENT_STATES = {CIGAR.D, CIGAR.I, CIGAR.X} ALIGNED_STATES = {CIGAR.M, CIGAR.X, CIGAR.EQ} @@ -13,27 +18,26 @@ CLIPPING_STATE = {CIGAR.S, CIGAR.H} -def recompute_cigar_mismatch(read, ref): +def recompute_cigar_mismatch(read: pysam.AlignedSegment, ref: str) -> CigarTuples: """ for cigar tuples where M is used, recompute to replace with X/= for increased utility and specificity Args: - read (pysam.AlignedSegment): the input read - ref (str): the reference sequence + read: the input read + ref: the reference sequence Returns: - List[Tuple[int,int]]: the cigar tuple + the cigar tuple """ - result = [] - offset = 0 + result: CigarTuples = [] ref_pos = read.reference_start seq_pos = 0 for cigar_value, freq in read.cigar: if cigar_value in ALIGNED_STATES: - for offset in range(0, freq): + for _ in range(0, freq): if DNA_ALPHABET.match(ref[ref_pos], read.query_sequence[seq_pos]): if len(result) == 0 or result[-1][0] != CIGAR.EQ: result.append((CIGAR.EQ, 1)) @@ -56,13 +60,13 @@ def recompute_cigar_mismatch(read, ref): return result -def longest_fuzzy_match(cigar, max_fuzzy_interupt=1): +def longest_fuzzy_match(cigar: CigarTuples, 
max_fuzzy_interupt: int = 1) -> int: """ computes the longest sequence of exact matches allowing for 'x' event interrupts Args: cigar: cigar tuples - max_fuzzy_interupt (int): number of mismatches allowed + max_fuzzy_interupt: number of mismatches allowed """ temp = join(cigar) @@ -85,28 +89,28 @@ def longest_fuzzy_match(cigar, max_fuzzy_interupt=1): return longest_fuzzy_match -def longest_exact_match(cigar): +def longest_exact_match(cigar: CigarTuples) -> int: """ returns the longest consecutive exact match Args: - cigar (List[Tuple[int,int]]): the cigar tuples + cigar: the cigar tuples """ return longest_fuzzy_match(cigar, 0) -def score(cigar, **kwargs): +def score(cigar: CigarTuples, **kwargs) -> int: """scoring based on sw alignment properties with gap extension penalties Args: - cigar (List[Tuple[mavis.constants.CIGAR,int]]): list of cigar tuple values + cigar: list of cigar tuple values MISMATCH (int): mismatch penalty MATCH (int): match penalty GAP (int): initial gap penalty GAP_EXTEND (int): gap extension penalty Returns: - int: the score value + the score value """ mismatch = kwargs.pop('MISMATCH', -1) @@ -129,7 +133,7 @@ def score(cigar, **kwargs): return score -def match_percent(cigar): +def match_percent(cigar: CigarTuples) -> float: """ calculates the percent of aligned bases (matches or mismatches) that are matches """ @@ -169,7 +173,9 @@ def join(*pos): return result -def extend_softclipping(cigar, min_exact_to_stop_softclipping): +def extend_softclipping( + cigar: CigarTuples, min_exact_to_stop_softclipping: int +) -> Tuple[CigarTuples, int]: """ given some input cigar, extends softclipping if there are mismatches/insertions/deletions close to the end of the aligned portion. 
The stopping point is defined by the @@ -177,11 +183,11 @@ def extend_softclipping(cigar, min_exact_to_stop_softclipping): exact match aligned portion to signal stop Args: - original_cigar (List[Tuple[mavis.constants.CIGAR,int]]): the input cigar - min_exact_to_stop_softclipping (int): number of exact matches to terminate extension + original_cigar: the input cigar + min_exact_to_stop_softclipping: number of exact matches to terminate extension Returns: - Tuple[List[Tuple[mavis.constants.CIGAR,int]], int]: new cigar list and shift from the original start position + new cigar list and shift from the original start position """ new_cigar = [] anchors = [ @@ -215,7 +221,9 @@ def extend_softclipping(cigar, min_exact_to_stop_softclipping): return new_cigar, start_ref_aligned -def compute(ref, alt, force_softclipping=True, min_exact_to_stop_softclipping=6): +def compute( + ref: str, alt: str, force_softclipping: bool = True, min_exact_to_stop_softclipping: int = 6 +) -> Tuple[CigarTuples, int]: """ given a ref and alt sequence compute the cigar string representing the alt @@ -247,7 +255,7 @@ def compute(ref, alt, force_softclipping=True, min_exact_to_stop_softclipping=6) return cigar, 0 -def convert_for_igv(cigar): +def convert_for_igv(cigar: CigarTuples) -> CigarTuples: """ igv does not support the extended CIGAR values for match v mismatch @@ -263,7 +271,7 @@ def convert_for_igv(cigar): return join(result) -def alignment_matches(cigar): +def alignment_matches(cigar: CigarTuples) -> int: """ counts the number of aligned bases irrespective of match/mismatch this is equivalent to counting all CIGAR.M @@ -275,7 +283,7 @@ def alignment_matches(cigar): return result -def merge_indels(cigar): +def merge_indels(cigar: CigarTuples) -> CigarTuples: """ For a given cigar tuple, merges adjacent insertions/deletions @@ -298,7 +306,7 @@ def merge_indels(cigar): return new_cigar -def hgvs_standardize_cigar(read, reference_seq): +def hgvs_standardize_cigar(read: pysam.AlignedSegment, 
reference_seq: str) -> CigarTuples: """ extend alignments as long as matches are possible. call insertions before deletions @@ -420,7 +428,7 @@ def hgvs_standardize_cigar(read, reference_seq): return join(cigar) -def convert_string_to_cigar(string): +def convert_string_to_cigar(string: str) -> CigarTuples: """ Given a cigar string, converts it to the appropriate cigar tuple @@ -436,11 +444,13 @@ def convert_string_to_cigar(string): return cigar -def convert_cigar_to_string(cigar): +def convert_cigar_to_string(cigar: CigarTuples) -> str: return ''.join(['{}{}'.format(f, CIGAR.reverse(s) if s != CIGAR.EQ else '=') for s, f in cigar]) -def merge_internal_events(cigar, inner_anchor=10, outer_anchor=10): +def merge_internal_events( + cigar: CigarTuples, inner_anchor: int = 10, outer_anchor: int = 10 +) -> CigarTuples: """ merges events (insertions, deletions, mismatches) within a cigar if they are between exact matches on either side (anchors) and separated by less exact @@ -449,12 +459,12 @@ def merge_internal_events(cigar, inner_anchor=10, outer_anchor=10): does not merge two mismatches, must contain a deletion/insertion Args: - cigar (List): a list of tuples of cigar states and counts - inner_anchor (int): minimum number of consecutive exact matches separating events - outer_anchor (int): minimum consecutively aligned exact matches to anchor an end for merging + cigar: a list of tuples of cigar states and counts + inner_anchor: minimum number of consecutive exact matches separating events + outer_anchor: minimum consecutively aligned exact matches to anchor an end for merging Returns: - List: new list of cigar tuples with merged events + new list of cigar tuples with merged events Example: >>> merge_internal_events([(CIGAR.EQ, 10), (CIGAR.X, 1), (CIGAR.EQ, 2), (CIGAR.D, 1), (CIGAR.EQ, 10)]) diff --git a/src/mavis/bam/read.py b/src/mavis/bam/read.py index f41f31cb..5a986178 100644 --- a/src/mavis/bam/read.py +++ b/src/mavis/bam/read.py @@ -1,6 +1,7 @@ import itertools 
import re from copy import copy +from typing import Callable, Iterable, List, Optional, Tuple import pysam from Bio.Data import IUPACData as iupac @@ -158,16 +159,18 @@ def __hash__(self): return hash(self.key()) -def pileup(reads, filter_func=None): +def pileup( + reads: Iterable[pysam.AlignedSegment], filter_func: Optional[Callable] = None +) -> List[Tuple[int, int]]: """ For a given set of reads generate a pileup of all reads (excluding those for which the filter_func returns True) Args: - reads (Iterable[pysam.AlignedSegment]): reads to pileup - filter_func (Callable): function which takes in a read and returns True if it should be ignored and False otherwise + reads: reads to pileup + filter_func: function which takes in a read and returns True if it should be ignored and False otherwise Returns: - Iterable[Tuple[int,int]]: tuples of genomic position and read count at that position + tuples of genomic position and read count at that position Note: returns positions using 1-based indexing @@ -181,13 +184,13 @@ def pileup(reads, filter_func=None): return sorted(hist.items()) -def map_ref_range_to_query_range(read, ref_range): +def map_ref_range_to_query_range(read: pysam.AlignedSegment, ref_range: Interval) -> Interval: """ Args: - ref_range (Interval): 1-based inclusive - read (pysam.AlignedSegment): read used for the mapping + ref_range: 1-based inclusive + read: read used for the mapping Returns: - Interval: 1-based inclusive range + 1-based inclusive range """ rpos = read.reference_start qpos = 0 @@ -213,17 +216,17 @@ def map_ref_range_to_query_range(read, ref_range): return Interval(qstart, qend) -def breakpoint_pos(read, orient=ORIENT.NS): +def breakpoint_pos(read: pysam.AlignedSegment, orient: str = ORIENT.NS) -> int: """ assumes the breakpoint is the position following softclipping on the side with more softclipping (unless and orientation has been specified) Args: - read (pysam.AlignedSegment): the read object - orient (ORIENT): the orientation + read: 
the read object + orient: the orientation Returns: - int: the position of the breakpoint in the input read + the position of the breakpoint in the input read """ typ, freq = read.cigar[0] end_typ, end_freq = read.cigar[-1] @@ -266,15 +269,15 @@ def breakpoint_pos(read, orient=ORIENT.NS): return read.reference_end - 1 -def calculate_alignment_score(read, consec_bonus=1): +def calculate_alignment_score(read: pysam.AlignedSegment, consec_bonus=1) -> float: """ calculates a score for comparing alignments Args: - read (pysam.AlignedSegment): the input read + read: the input read Returns: - float: the score + the score """ score = 0 qlen = read.reference_end - read.reference_start @@ -290,32 +293,27 @@ def calculate_alignment_score(read, consec_bonus=1): def nsb_align( - ref, - seq, - weight_of_score=0.5, - min_overlap_percent=1, - min_match=0, + ref: str, + seq: str, + min_overlap_percent: float = 1, + min_match: float = 0, min_consecutive_match=1, - scoring_function=calculate_alignment_score, -): + scoring_function: Callable = calculate_alignment_score, +) -> List[SamRead]: """ given some reference string and a smaller sequence string computes the best non-space-breaking alignment i.e. an alignment that does not allow for indels (straight-match). Positions in the aligned segments are given relative to the length of the reference sequence (1-based) Args: - ref (str): the reference sequence - seq (str): the sequence being aligned - weight_of_score (float): when scoring alignments this determines the amount - of weight to place on the cigar match. 
Should be a number between 0 and 1 - min_overlap_percent (float): the minimum amount of overlap of the input sequence to the reference - should be a number between 0 and 1 - min_match (float): the minimum number of matches compared to total - scoring_function (Callable): any function that will take a read as input and return a float - used in comparing alignments to choose the best alignment + ref: the reference sequence + seq: the sequence being aligned + min_overlap_percent: the minimum amount of overlap of the input sequence to the reference should be a number between 0 and 1 + min_match: the minimum number of matches compared to total + scoring_function: any function that will take a read as input and return a float used in comparing alignments to choose the best alignment Returns: - List[pysam.AlignedSegment]: list of aligned segments + list of aligned segments Note: using a higher min_match may improve performance as low quality alignments are rejected more quickly. However @@ -399,13 +397,13 @@ def nsb_align( return filtered -def sequenced_strand(read, strand_determining_read=2): +def sequenced_strand(read: pysam.AlignedSegment, strand_determining_read: int = 2) -> str: """ determines the strand that was sequenced Args: - read (pysam.AlignedSegment): the read being used to determine the strand - strand_determining_read (int): which read in the read pair is the same as the sequenced strand + read: the read being used to determine the strand + strand_determining_read: which read in the read pair is the same as the sequenced strand Returns: STRAND: the strand that was sequenced @@ -435,13 +433,13 @@ def sequenced_strand(read, strand_determining_read=2): return strand -def read_pair_type(read): +def read_pair_type(read: pysam.AlignedSegment) -> str: # check if the read pair is in the expected orientation """ assumptions based on illumina pairs: only 4 possible combinations Args: - read (pysam.AlignedSegment): the input read + read: the input read Returns: 
READ_PAIR_TYPE: the type of input read pair @@ -474,18 +472,17 @@ def read_pair_type(read): raise NotImplementedError('unexpected orientation for pair') -def orientation_supports_type(read, event_type): +def orientation_supports_type(read: pysam.AlignedSegment, event_type: str) -> bool: """ checks if the orientation is compatible with the type of event Args: - read (pysam.AlignedSegment): a read from the pair - event_type (SVTYPE): the type of event to check + read: a read from the pair + event_type: the type of event to check Returns: - bool: - - ``True`` - the read pair is in the correct orientation for this event type - - ``False`` - the read is not in the correct orientation + - ``True`` - the read pair is in the correct orientation for this event type + - ``False`` - the read is not in the correct orientation """ if event_type == SVTYPE.DEL or event_type == SVTYPE.INS: if read_pair_type(read) != READ_PAIR_TYPE.LR: @@ -504,7 +501,12 @@ def orientation_supports_type(read, event_type): return True -def convert_events_to_softclipping(read, orientation, max_event_size, min_anchor_size=None): +def convert_events_to_softclipping( + read: pysam.AlignedSegment, + orientation: str, + max_event_size: int, + min_anchor_size: Optional[int] = None, +) -> pysam.AlignedSegment: """ given an alignment, simplifies the alignment by grouping everything past the first anchor and including the first event considered too large and unaligning them turning them into softclipping @@ -574,7 +576,7 @@ def convert_events_to_softclipping(read, orientation, max_event_size, min_anchor return read -def sequence_complexity(seq): +def sequence_complexity(seq: str) -> float: """ basic measure of sequence complexity """ diff --git a/src/mavis/blat.py b/src/mavis/blat.py index de95d5cb..f379df1b 100644 --- a/src/mavis/blat.py +++ b/src/mavis/blat.py @@ -11,11 +11,13 @@ """ import math import re +from typing import Dict, List, Tuple import pandas as pd from .align import query_coverage_interval 
from .bam import cigar as _cigar +from .bam.cache import BamCache from .bam.cigar import QUERY_ALIGNED_STATES from .bam.read import SamRead from .constants import ( @@ -27,6 +29,7 @@ reverse_complement, ) from .interval import Interval +from .types import ReferenceGenome from .util import logger @@ -34,11 +37,10 @@ class Blat: """ """ @staticmethod - def millibad(row, is_protein=False, is_mrna=True): + def millibad(row: Dict, is_protein: bool = False, is_mrna: bool = True) -> float: """ this function is used in calculating percent identity - direct translation of the perl code - # https://genome.ucsc.edu/FAQ/FAQblat.html#blat4 + direct translation of the perl code () """ size_mul = 1 if not is_protein else 3 if is_protein and is_mrna: @@ -76,7 +78,7 @@ def millibad(row, is_protein=False, is_mrna=True): return 0 @staticmethod - def score(row, is_protein=False): + def score(row: Dict, is_protein: bool = False) -> int: """ direct translation from ucsc guidelines on replicating the web blat score https://genome.ucsc.edu/FAQ/FAQblat.html#blat4 @@ -101,11 +103,16 @@ def score(row, is_protein=False): return score @staticmethod - def percent_identity(row, is_protein=False, is_mrna=True): + def percent_identity(row: Dict, is_protein: bool = False, is_mrna: bool = True) -> float: return 100 - int(Blat.millibad(row, is_protein, is_mrna)) * 0.1 @staticmethod - def read_pslx(filename, seqid_to_sequence_mapping, is_protein=False, verbose=True): + def read_pslx( + filename: str, + seqid_to_sequence_mapping: Dict[str, str], + is_protein: bool = False, + verbose: bool = True, + ) -> Tuple[List[str], Dict]: header = [ 'match', 'mismatch', @@ -205,15 +212,16 @@ def split_csv_trailing_ints(x): return header, final_rows @staticmethod - def pslx_row_to_pysam(row, bam_cache, reference_genome): + def pslx_row_to_pysam( + row: Dict, bam_cache: BamCache, reference_genome: ReferenceGenome + ) -> SamRead: """ given a 'row' from reading a pslx file. 
converts the row to a BlatAlignedSegment object Args: - row Dict[str]: a row object from the 'read_pslx' method + row: a row object from the 'read_pslx' method bam_cache (BamCache): the bam file/cache to use as a template for creating reference_id from chr name - reference_genome (Dict[str,Bio.SeqRecord]): - dict of reference sequence by template/chr name + reference_genome: reference sequence by template/chr name """ chrom = bam_cache.reference_id(row['tname']) @@ -356,15 +364,15 @@ def pslx_row_to_pysam(row, bam_cache, reference_genome): def process_blat_output( - input_bam_cache, - query_id_mapping, - reference_genome, - aligner_output_file='aligner_out.temp', - blat_min_percent_of_max_score=0.8, - blat_min_identity=0.7, - blat_limit_top_aln=25, - is_protein=False, -): + input_bam_cache: BamCache, + query_id_mapping: Dict[str, str], + reference_genome: ReferenceGenome, + aligner_output_file: str = 'aligner_out.temp', + blat_min_percent_of_max_score: float = 0.8, + blat_min_identity: float = 0.7, + blat_limit_top_aln: int = 25, + is_protein: bool = False, +) -> Dict[str, List[SamRead]]: """ converts the blat output pslx (unheadered file) to bam reads """ diff --git a/src/mavis/breakpoint.py b/src/mavis/breakpoint.py index 81518527..b28e4e4c 100644 --- a/src/mavis/breakpoint.py +++ b/src/mavis/breakpoint.py @@ -6,6 +6,7 @@ from .constants import COLUMNS, DNA_ALPHABET, ORIENT, STRAND, SVTYPE, reverse_complement from .error import InvalidRearrangement, NotSpecifiedError from .interval import Interval +from .types import ReferenceGenome class Breakpoint(Interval): @@ -429,7 +430,7 @@ def is_putative_indel(self) -> bool: return False return True - def breakpoint_sequence_homology(self, reference_genome): + def breakpoint_sequence_homology(self, reference_genome: ReferenceGenome): """ for a given set of breakpoints matches the sequence opposite the partner breakpoint this sequence comparison is done with reference to a reference genome and does not @@ -446,7 +447,7 @@ 
def breakpoint_sequence_homology(self, reference_genome): -------TT-TT-------- second break homology Args: - reference_genome (Dict[str,Bio.SeqRecord]): dict of reference sequence by template/chr name + reference_genome: dict of reference sequence by template/chr name Returns: Tuple[str,str]: homologous sequence at the first breakpoint and second breakpoints diff --git a/src/mavis/cluster/cluster.py b/src/mavis/cluster/cluster.py index 36941d57..f2468cb1 100644 --- a/src/mavis/cluster/cluster.py +++ b/src/mavis/cluster/cluster.py @@ -1,8 +1,7 @@ -from __future__ import division - import itertools from collections import namedtuple from copy import copy +from typing import Dict, List from ..breakpoint import Breakpoint, BreakpointPair from ..constants import ORIENT, STRAND @@ -67,7 +66,7 @@ def weighted_mean(values, weights=None): return sum(x * w for x, w in zip(values, weights)) / sum(weights) -def merge_integer_intervals(*intervals, weight_adjustment=0): +def merge_integer_intervals(*intervals, weight_adjustment: int = 0) -> Interval: """ Merges a set of integer intervals into a single interval where the center is the weighted mean of the input intervals. 
The weight is inversely proportional to the @@ -76,7 +75,7 @@ def merge_integer_intervals(*intervals, weight_adjustment=0): input intervals Args: - weight_adjustment (int): add to length to lower weighting differences between small intervals + weight_adjustment: add to length to lower weighting differences between small intervals """ float_offset = 0.99999999 intervals = list(intervals) @@ -146,7 +145,12 @@ def all_pair_group_keys(pair, explicit_strand=False): return result -def merge_by_union(input_pairs, group_key, weight_adjustment=10, cluster_radius=200): +def merge_by_union( + input_pairs: List[BreakpointPair], + group_key: BreakpointPairGroupKey, + weight_adjustment: int = 10, + cluster_radius: int = 200, +) -> List[BreakpointPair]: """ for a given set of breakpoint pairs, merge the union of all pairs that are within the given distance (cluster_radius) @@ -230,8 +234,11 @@ def merge_by_union(input_pairs, group_key, weight_adjustment=10, cluster_radius= def merge_breakpoint_pairs( - input_pairs, cluster_radius=200, cluster_initial_size_limit=25, verbose=False -): + input_pairs: List[BreakpointPair], + cluster_radius: int = 200, + cluster_initial_size_limit: int = 25, + verbose: bool = False, +) -> Dict[BreakpointPair, List[BreakpointPair]]: """ two-step merging process @@ -242,12 +249,12 @@ def merge_breakpoint_pairs( done in order of smallest total breakpoint interval size to largest Args: - input_pairs (List[BreakpointPair]): the pairs to be merged - cluster_radius (int) maximum distance allowed for a node to merge - cluster_initial_size_limit (int): maximum size of breakpoint intervals allowed in the first merging phase + input_pairs: the pairs to be merged + cluster_radius: maximum distance allowed for a node to merge + cluster_initial_size_limit: maximum size of breakpoint intervals allowed in the first merging phase Returns: - Dict[BreakpointPair,List[BreakpointPair]]: mapping of merged breakpoint pairs to the input pairs used in the merge + mapping of 
merged breakpoint pairs to the input pairs used in the merge """ def pair_center_distance(pair1, pair2): diff --git a/src/mavis/cluster/main.py b/src/mavis/cluster/main.py index 17567f52..861a1aea 100644 --- a/src/mavis/cluster/main.py +++ b/src/mavis/cluster/main.py @@ -29,13 +29,13 @@ def split_clusters( outputdir: str, total_batches: int, write_bed_summary: bool = True, -): +) -> List[str]: """ For a set of clusters creates a bed file representation of all clusters. Also splits the clusters evenly into multiple files based on the user parameters (max_files) Returns: - list: of output file names (not including the bed file) + list of output file names (not including the bed file) """ if write_bed_summary: bedfile = os.path.join(outputdir, 'clusters.bed') diff --git a/src/mavis/illustrate/diagram.py b/src/mavis/illustrate/diagram.py index f9e6fabc..1c59e90b 100644 --- a/src/mavis/illustrate/diagram.py +++ b/src/mavis/illustrate/diagram.py @@ -2,12 +2,18 @@ This is the primary module responsible for generating svg visualizations """ +from typing import Iterable, List, Optional + +from mavis.annotate.genomic import Gene, Template +from mavis.annotate.variant import Annotation +from mavis.types import ReferenceGenome from svgwrite import Drawing from ..annotate.genomic import IntergenicRegion from ..interval import Interval +from .constants import DiagramSettings from .elements import draw_exon_track, draw_genes, draw_template, draw_ustranscript, draw_vmarker -from .scatter import draw_scatter +from .scatter import ScatterPlot, draw_scatter from .util import LabelMapping, generate_interval_mapping # draw gene level view @@ -17,18 +23,18 @@ def draw_sv_summary_diagram( - config, - ann, - reference_genome=None, - templates=None, - ignore_absent_templates=True, - user_friendly_labels=True, - template_display_label_prefix='', - draw_reference_transcripts=True, - draw_reference_genes=True, - draw_reference_templates=True, - draw_fusion_transcript=True, - 
stack_reference_transcripts=False, + config: DiagramSettings, + ann: Annotation, + reference_genome: ReferenceGenome = None, + templates: List[Template] = None, + ignore_absent_templates: bool = True, + user_friendly_labels: bool = True, + template_display_label_prefix: str = '', + draw_reference_transcripts: bool = True, + draw_reference_genes: bool = True, + draw_reference_templates: bool = True, + draw_fusion_transcript: bool = True, + stack_reference_transcripts: bool = False, ): """ this is the main drawing function. It decides between layouts @@ -43,17 +49,17 @@ def draw_sv_summary_diagram( - fusion transcript/translation Args: - ann (Annotation): the annotation object to be illustrated - reference_genome (Dict[str,str]): reference sequences - templates (List[Template]): list of templates, used in drawing the template-level view - ignore_absent_templates (bool): + ann: the annotation object to be illustrated + reference_genome: reference sequences + templates: list of templates, used in drawing the template-level view + ignore_absent_templates: if true then will not raise an error if the template information is not given but will not draw the template instead - show_template (bool): if false the template-level view is not drawn - user_friendly_labels (bool): + show_template: if false the template-level view is not drawn + user_friendly_labels: if True, genes are labelled by their aliases (where possible) and domains are labeled by their names (where possible) - template_display_label_prefix (str): the character to precede the template label + template_display_label_prefix: the character to precede the template label """ if not any( [ @@ -380,7 +386,18 @@ def draw_sv_summary_diagram( return canvas, legend -def draw_multi_transcript_overlay(config, gene, vmarkers=None, window_buffer=0, plots=None): +def draw_multi_transcript_overlay( + config: DiagramSettings, + gene: Gene, + vmarkers: Iterable[Interval] = None, + window_buffer: int = 0, + plots: 
Optional[List[ScatterPlot]] = None, +): + """ + Args: + vmarkers: vertical line markers + plots: scatter plots to plot on top of the gene diagram + """ vmarkers = [] if vmarkers is None else vmarkers plots = [] if plots is None else plots diff --git a/src/mavis/illustrate/elements.py b/src/mavis/illustrate/elements.py index a5f1e762..32995761 100644 --- a/src/mavis/illustrate/elements.py +++ b/src/mavis/illustrate/elements.py @@ -3,11 +3,13 @@ """ import re +from typing import List, Tuple from ..annotate.variant import FusionTranscript from ..constants import CODON_SIZE, GIEMSA_STAIN, ORIENT, STRAND from ..error import DrawingFitError, NotSpecifiedError -from ..interval import Interval +from ..interval import Interval, IntervalMapping +from .constants import DiagramSettings from .util import ( LabelMapping, Tag, @@ -22,7 +24,9 @@ HEX_BLACK = '#000000' -def draw_legend(config, canvas, swatches, border=True): +def draw_legend( + config: DiagramSettings, canvas, swatches: List[Tuple[str, str]], border: bool = True +): """ generates an svg group object representing the legend """ @@ -83,13 +87,13 @@ def draw_legend(config, canvas, swatches, border=True): def draw_exon_track( - config, + config: DiagramSettings, canvas, transcript, - mapping, + mapping: IntervalMapping, colors=None, - genomic_min=None, - genomic_max=None, + genomic_min: int = None, + genomic_max: int = None, translation=None, ): """ """ @@ -173,12 +177,12 @@ def draw_exon_track( def draw_transcript_with_translation( - config, + config: DiagramSettings, canvas, translation, labels, colors, - mapping, + mapping: IntervalMapping, reference_genome=None, genomic_min=None, genomic_max=None, diff --git a/src/mavis/interval.py b/src/mavis/interval.py index 3c73caf9..754d4ba0 100644 --- a/src/mavis/interval.py +++ b/src/mavis/interval.py @@ -1,10 +1,11 @@ -from typing import Optional +from typing import Dict, List, Optional, Tuple class Interval: start: int end: int freq: int = 1 + forward_to_reverse: 
Optional[bool] = None def __init__(self, start: int, end: Optional[int] = None, freq: int = 1, number_type=None): """ @@ -133,7 +134,7 @@ def __len__(self): """ return Interval.length(self) - def length(self): + def length(self) -> int: try: if self.number_type == float: return self[1] - self[0] @@ -166,7 +167,7 @@ def __repr__(self): return '{}({}, {}{})'.format(cls, self.start, self.end, number_type) @property - def center(self): + def center(self) -> float: """ the middle of the interval @@ -193,7 +194,7 @@ def __contains__(self, other): return False @classmethod - def dist(cls, first, other): + def dist(cls, first, other) -> int: """returns the minimum distance between intervals Example: @@ -215,7 +216,7 @@ def __hash__(self): return hash((self[0], self[1], self.freq)) @classmethod - def position_in_range(cls, segments, pos): + def position_in_range(cls, segments, pos) -> Tuple[int, bool]: if len(segments) == 0: raise AttributeError('cannot compute on an empty list') @@ -245,7 +246,7 @@ def position_in_range(cls, segments, pos): return num, found_inbetween_segment @classmethod - def convert_pos(cls, mapping, pos, forward_to_reverse=None): + def convert_pos(cls, mapping: 'IntervalMapping', pos: int, forward_to_reverse=None) -> int: i = cls.convert_ratioed_pos(mapping, pos, forward_to_reverse) if i.forward_to_reverse: return i.end @@ -253,15 +254,17 @@ def convert_pos(cls, mapping, pos, forward_to_reverse=None): return i.start @classmethod - def convert_ratioed_pos(cls, mapping, pos, forward_to_reverse=None): + def convert_ratioed_pos( + cls, mapping: 'IntervalMapping', pos: int, forward_to_reverse=None + ) -> 'Interval': """convert any given position given a mapping of intervals to another range Args: - mapping (Dict[Interval,Interval]): a mapping of a set of continuous intervals - pos (int): a position in the first coordinate system + mapping: a mapping of a set of continuous intervals + pos: a position in the first coordinate system Returns: - Interval: the 
position in the alternate coordinate system given the input mapping + the position in the alternate coordinate system given the input mapping Raises: AttributeError: if the input position is outside the set of input segments @@ -301,31 +304,31 @@ def convert_ratioed_pos(cls, mapping, pos, forward_to_reverse=None): elif not forward_to_reverse: raise AttributeError('direction of mapped intervals is not consistent') - i, previous_flag = Interval.position_in_range( + index, previous_flag = Interval.position_in_range( input_intervals, (pos, pos) ) # get the input position - if i == len(input_intervals) or previous_flag: + if index == len(input_intervals) or previous_flag: raise IndexError(pos, 'is outside mapped range', mapping) else: # fell into a mapped region - curr = input_intervals[i] + curr = input_intervals[index] nexxt = mapping[curr] if curr[1] - curr[0] == 0: - i = Interval(nexxt[0], nexxt[1]) + result = Interval(nexxt[0], nexxt[1]) else: ratio = (nexxt[1] - nexxt[0]) / (curr[1] - curr[0]) shift = round((pos - curr[0]) * ratio, 0) shift2 = round((pos - curr[0]) * ratio + ratio, 0) number_type = int if ratio == 1 else float if forward_to_reverse: - i = Interval(nexxt[1] - shift2, nexxt[1] - shift, number_type=number_type) + result = Interval(nexxt[1] - shift2, nexxt[1] - shift, number_type=number_type) else: - i = Interval(nexxt[0] + shift, nexxt[0] + shift2, number_type=number_type) - setattr(i, 'forward_to_reverse', forward_to_reverse) - return i + result = Interval(nexxt[0] + shift, nexxt[0] + shift2, number_type=number_type) + result.forward_to_reverse = forward_to_reverse + return result @classmethod - def union(cls, *intervals): + def union(cls, *intervals) -> 'Interval': """ returns the union of the set of input intervals @@ -338,7 +341,7 @@ def union(cls, *intervals): return Interval(min([i[0] for i in intervals]), max([i[1] for i in intervals])) @classmethod - def intersection(cls, *intervals): + def intersection(cls, *intervals) -> 
Optional['Interval']: """ returns None if there is no intersection @@ -357,7 +360,7 @@ def intersection(cls, *intervals): return Interval(low, high) @classmethod - def min_nonoverlapping(cls, *intervals): + def min_nonoverlapping(cls, *intervals: 'Interval') -> List['Interval']: """ for a list of intervals, orders them and merges any overlap to return a list of non-overlapping intervals O(nlogn) @@ -368,9 +371,9 @@ def min_nonoverlapping(cls, *intervals): """ if len(intervals) == 0: return [] - intervals = sorted(list(intervals), key=lambda x: (x[0], x[1])) - new_intervals = [Interval(intervals[0][0], intervals[0][1])] - for i in intervals[1:]: + sorted_intervals = sorted(list(intervals), key=lambda x: (x[0], x[1])) + new_intervals = [Interval(sorted_intervals[0][0], sorted_intervals[0][1])] + for i in sorted_intervals[1:]: if Interval.overlaps(new_intervals[-1], i): new_intervals[-1] = new_intervals[-1] | i else: @@ -463,26 +466,17 @@ def add(self, src_interval, tgt_interval, opposing_directions=True): self.mapping[src_interval] = tgt_interval self.opposing_directions[src_interval] = opposing_directions - def convert_ratioed_pos(self, pos): - """convert any given position given a mapping of intervals to another range + def convert_ratioed_pos(self, pos: int) -> Interval: + """convert any given position given a mapping of intervals to the mapped range Args: - pos (Interval): a position in the first coordinate system + pos: a position in the first coordinate system Returns: - the position in the alternate coordinate system given the input mapping - - int: if simplify is True - - Interval: if simplify is False + the Interval the position lands in in the new coordinate system Raises: IndexError: if the input position is not in any of the mapped intervals - - Example: - >>> mapping = IntervalMapping(mapping={(1, 10): (101, 110), (11, 20): (555, 564)}) - >>> mapping.convert_pos(5) - 5 - >>> mapping.convert_pos(15) - 559 """ for src_interval, tgt_interval in 
self.mapping.items(): if pos in src_interval: @@ -499,16 +493,14 @@ def convert_ratioed_pos(self, pos): return tgt_interval raise IndexError(pos, 'position not found in mapping', self.mapping.keys()) - def convert_pos(self, pos): + def convert_pos(self, pos: int) -> int: """convert any given position given a mapping of intervals to another range Args: - pos (int): a position in the first coordinate system + pos: a position in the first coordinate system Returns: the position in the alternate coordinate system given the input mapping - - int: if simplify is True - - Interval: if simplify is False Raises: IndexError: if the input position is not in any of the mapped intervals diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index 7717d743..d0ae9c63 100644 --- a/src/mavis/tools/vcf.py +++ b/src/mavis/tools/vcf.py @@ -121,7 +121,7 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) -def convert_record(record, record_mapping={}) -> List[Dict]: +def convert_record(record: VcfRecordType) -> List[Dict]: """ converts a vcf record @@ -220,7 +220,7 @@ def convert_record(record, record_mapping={}) -> List[Dict]: return records -def convert_pandas_rows_to_variants(df): +def convert_pandas_rows_to_variants(df: pd.DataFrame) -> List[VcfRecordType]: def parse_info(info_field): info = {} for pair in info_field.split(';'): @@ -259,7 +259,7 @@ def parse_info(info_field): return rows -def pandas_vcf(input_file) -> Tuple[List[str], pd.DataFrame]: +def pandas_vcf(input_file: str) -> Tuple[List[str], pd.DataFrame]: """ Read a standard vcf file into a pandas dataframe """ @@ -296,7 +296,7 @@ def pandas_vcf(input_file) -> Tuple[List[str], pd.DataFrame]: return header_lines, df -def convert_file(input_file: str): +def convert_file(input_file: str) -> List[Dict]: """process a VCF file Args: diff --git a/src/mavis/types.py b/src/mavis/types.py new file mode 100644 index 00000000..23e3de15 
--- /dev/null +++ b/src/mavis/types.py @@ -0,0 +1,11 @@ +""" +Helper classes for type hints +""" + +from typing import Dict, List, Tuple + +from Bio.SeqRecord import SeqRecord + +ReferenceGenome = Dict[str, SeqRecord] + +CigarTuples = List[Tuple[int, int]] From 2ac6cc6b332405e250fd62f81bcf13ead84ad06a Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Fri, 28 Jan 2022 15:40:25 -0800 Subject: [PATCH 095/137] Add more types --- MANIFEST.in | 1 + setup.cfg | 1 + src/mavis/align.py | 4 ++-- src/mavis/annotate/genomic.py | 4 ++-- src/mavis/annotate/splicing.py | 5 +++-- src/mavis/py.typed | 0 src/mavis/validate/base.py | 1 - 7 files changed, 9 insertions(+), 7 deletions(-) create mode 100644 src/mavis/py.typed diff --git a/MANIFEST.in b/MANIFEST.in index c1af92d1..16491603 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ recursive-include src *.py *.json +include src/mavis/py.typed include README.md include LICENSE prune docs diff --git a/setup.cfg b/setup.cfg index a34fc7ef..d0a4934c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -84,6 +84,7 @@ dev = mkdocs-material==5.4.0 markdown-include mkdocs-simple-hooks==0.1.2 + types-setuptools>=57.4.7, <58 deploy = twine wheel diff --git a/src/mavis/align.py b/src/mavis/align.py index 2d6b6898..f4c70712 100644 --- a/src/mavis/align.py +++ b/src/mavis/align.py @@ -5,7 +5,7 @@ import os import re import subprocess -from typing import Dict +from typing import Dict, List import pysam @@ -470,7 +470,7 @@ def align_sequences( with pysam.AlignmentFile( aligner_output_file, 'r', check_sq=bool(len(sequences)) ) as samfile: - reads_by_query = {} + reads_by_query: Dict[str, List[_read.SamRead]] = {} for read in samfile.fetch(): if read.is_unmapped: continue diff --git a/src/mavis/annotate/genomic.py b/src/mavis/annotate/genomic.py index 14ca0a7b..22f6831d 100644 --- a/src/mavis/annotate/genomic.py +++ b/src/mavis/annotate/genomic.py @@ -244,7 +244,7 @@ def transcript(self): return self.reference_object @property - def 
donor_splice_site(self) -> Interval: + def donor_splice_site(self) -> SpliceSite: """the genomic range describing the splice site""" if self.is_reverse: return self.start_splice_site @@ -252,7 +252,7 @@ def donor_splice_site(self) -> Interval: return self.end_splice_site @property - def acceptor_splice_site(self) -> Interval: + def acceptor_splice_site(self) -> SpliceSite: """the genomic range describing the splice site""" if self.is_reverse: return self.end_splice_site diff --git a/src/mavis/annotate/splicing.py b/src/mavis/annotate/splicing.py index b2160ffd..fe303592 100644 --- a/src/mavis/annotate/splicing.py +++ b/src/mavis/annotate/splicing.py @@ -4,7 +4,8 @@ from ..constants import SPLICE_TYPE, STRAND, reverse_complement from ..interval import Interval from .base import BioInterval -from .constants import ACCEPTOR_SEQ, DONOR_SEQ, SPLICE_SITE_RADIUS, SPLICE_SITE_TYPE +from .constants import (ACCEPTOR_SEQ, DONOR_SEQ, SPLICE_SITE_RADIUS, + SPLICE_SITE_TYPE) class SplicingPattern(list): @@ -121,7 +122,7 @@ def __init__( self, ref: BioInterval, pos: int, - site_type: str, + site_type: int, intact: bool = True, start: Optional[int] = None, end: Optional[int] = None, diff --git a/src/mavis/py.typed b/src/mavis/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/src/mavis/validate/base.py b/src/mavis/validate/base.py index e73d4846..65a70bc5 100644 --- a/src/mavis/validate/base.py +++ b/src/mavis/validate/base.py @@ -671,7 +671,6 @@ def collect_split_read(self, read: pysam.AlignedSegment, first_breakpoint: bool) w[0] - 1 : w[1] ] - putative_alignments = None # figure out how much of the read must match when remaped min_match_tgt = read.cigar[-1][1] if breakpoint.orient == ORIENT.LEFT else read.cigar[0][1] min_match_tgt = min( From 841c36412aa3f85c23c777a72c0e03800c37c76f Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Fri, 28 Jan 2022 15:43:01 -0800 Subject: [PATCH 096/137] Fix linting error --- src/mavis/annotate/splicing.py | 3 +-- 1 file 
changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mavis/annotate/splicing.py b/src/mavis/annotate/splicing.py index fe303592..9de8ce24 100644 --- a/src/mavis/annotate/splicing.py +++ b/src/mavis/annotate/splicing.py @@ -4,8 +4,7 @@ from ..constants import SPLICE_TYPE, STRAND, reverse_complement from ..interval import Interval from .base import BioInterval -from .constants import (ACCEPTOR_SEQ, DONOR_SEQ, SPLICE_SITE_RADIUS, - SPLICE_SITE_TYPE) +from .constants import ACCEPTOR_SEQ, DONOR_SEQ, SPLICE_SITE_RADIUS, SPLICE_SITE_TYPE class SplicingPattern(list): From 43cd30920db60c5dd106b39b964a89cbacebe957 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Fri, 28 Jan 2022 17:59:35 -0800 Subject: [PATCH 097/137] add quick test to vcf module --- src/mavis/interval.py | 2 -- src/mavis/tools/vcf.py | 10 ++++++++++ tests/data/sniffles.vcf | 1 + tests/unit/test_tools_vcf.py | 6 +++--- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/mavis/interval.py b/src/mavis/interval.py index 03c788f9..c78e3aa8 100644 --- a/src/mavis/interval.py +++ b/src/mavis/interval.py @@ -30,8 +30,6 @@ def __init__(self, start: int, end: Optional[int] = None, freq: int = 1, number_ self.start = self.number_type(self.start) self.end = self.number_type(self.end) - if self.start == 0 and self.end == 1: - self.start = 1 if self.start > self.end: raise AttributeError('interval start > end is not allowed', self.start, self.end) self.freq = int(freq) diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index 92a6c987..9dd04c6f 100644 --- a/src/mavis/tools/vcf.py +++ b/src/mavis/tools/vcf.py @@ -204,6 +204,16 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]: } ) + ''' + As per VCF 4.2 specifications (https://samtools.github.io/hts-specs/VCFv4.2.pdf): + A start_position = 1, end_position = 0 linkage indicates connections to telomeres + Change 0 a 1 since coordinates are 1-based and we cannot start before the start of a sequence + ''' + + if 
std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: + std_row.update({'break1_position_end': 1}) + elif std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1: + std_row.update({'break2_position_end': 1}) if 'SVTYPE' in info: std_row[COLUMNS.event_type] = info['SVTYPE'] diff --git a/tests/data/sniffles.vcf b/tests/data/sniffles.vcf index dd631018..c9e475fc 100644 --- a/tests/data/sniffles.vcf +++ b/tests/data/sniffles.vcf @@ -335,3 +335,4 @@ chr1 2657257 89_1 CCCTGCACACCCAGGTGAGCATCCGACAGCCTGGAGCAGCACCCACACCCCCAGTTGAGCAT chr1 125029102 1150_3 N . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr1;END=125029168;STD_quant_start=0;STD_quant_stop=0;Kurtosis_quant_start=1.95204;Kurtosis_quant_stop=4.40106;SVTYPE=DEL/INV;RNAMES=0615a1e2-43d8-4ee1-aad8-b3bc30654032,07df46cf-fab9-4b7a-9c61-63d6fb79063d,0c62bf53-a1fa-4cd8-9f65-9b6d896437f6,0c775ec9-8cd4-46fa-b432-d8fadbe12e8c,13f6e3a3-7e12-4fe1-96e8-6fb510c5e51e,25140c43-b936-4ec7-88ac-5f35ee57eb89,425d50b0-d4c0-4128-befb-3cb9a20d0395,42ec5104-09bf-4324-9579-9099acbf7650,61e7dbc6-c78a-4047-aaee-6660f758bf93,711bb46e-6427-4960-9abf-a11838e69701,76aadab8-9dfe-46bc-a906-eb70126c5841,77b0f627-1ffc-43ff-8f07-a3839b73e859,78e435d7-34fa-4d52-9f3f-189868c74142,7bff020f-5745-4030-a649-c2ca270932f4,801d64f6-1205-4a00-a1c0-eeff98c29be7,928578aa-6777-4ba7-a150-7b2eaa900249,993a19f7-ab8d-4636-aa9f-56566d3ab328,9dc57ac2-909f-467e-a15f-26041dee67d0,a5fdd9d5-ed51-4036-88e5-6009ce233bc6;SUPTYPE=NR;SVLEN=-66;STRANDS=+-;RE=19;REF_strand=0,5;AF=0.791667 GT:DR:DV 0/1:5:19 chr10 125502113 16341_1 N . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr10;END=125508658;STD_quant_start=0;STD_quant_stop=3.1305;Kurtosis_quant_start=nan;Kurtosis_quant_stop=2;SVTYPE=INVDUP;RNAMES=11f6d8be-ef3b-44bd-bc04-b7a4c6619129,235ef779-5cc1-4999-91cd-25c6bbcfbb08,3058d7cf-0c2c-493e-8bd7-97adb6e8c721,40788bbd-835b-4706-8abe-a76b9672804f,4ed139db-33bf-4ed2-b709-840d06a92d5f,929a407e-1103-4f89-81fa-20b902c08c6e,a8b62faf-9df4-497e-a1df-f4d6af7a92e0,c34b12b2-7aee-4c9a-89a2-81d8f9d34a8b;SUPTYPE=SR;SVLEN=6545;STRANDS=++;RE=5;REF_strand=6,4;AF=0.333333 GT:DR:DV 0/1:10:5 chr11 3653753 16746_1 N CAACCCTACCTCTGTCTCTCCTCACCGCTATTCTCCCATCAGACATCAGTCTTGTAAATTCCAATTCCTACCTCTGTCTATCCTCACTGCCATTCTCCCACTGACATCAAGTCTTGTAAATCTCCACCTCCTACCTCTGTCTATCCTCACTGCCATTCTCCCATCAGACATCAAGTTCTGTAAATTCCACCTCCTACCTCTCTGTCTATCCTCACTGCCATTCTCCCATCTGGACATCAAGTCTTGTAAATT . PASS IMPRECISE;SVMETHOD=Snifflesv1.0.11;CHR2=chr11;END=3654825;STD_quant_start=139.219;STD_quant_stop=2.84445;Kurtosis_quant_start=-1.96058;Kurtosis_quant_stop=-0.064951;SVTYPE=DUP/INS;RNAMES=00b67655-ed56-4780-b3fe-be29d59e1859,028e509d-93df-4dc8-82e9-a09514d7bc3d,02cce278-7597-4217-8253-5f28a462e699,06eb2443-613b-404c-950a-d4af531205bd,0fcbfcc2-2a80-4da8-b592-d7862d4e5f32,0ffca8ba-4f7f-4475-b7f1-b57ec5c83ed8,130292a2-4dff-4c25-b571-b1ed9fb82f6e,14038c62-2b05-446f-8d58-b34a3f784d23,16732163-f7e5-4bc2-bfc8-a38f908e5ba8,190c6094-4e7b-4f20-91a1-30e6b0d189db,19b242ce-893d-4b98-ac2d-70cf8e216d61,206d22d5-959a-4d55-9d85-eca31de42f0b,21cc7629-0ee7-44b6-9e17-664918ab0ac2,26fd2c01-04d2-40f7-b350-aeea96752a72,2c67fc96-be15-4e19-bb73-662a104fdd1a,329783a5-e55a-4276-8c13-1f8bdbff7700,342c2503-a98b-4a20-9dd9-8190bdd743fa,34c23995-bcaa-4dcd-aefa-3c96d4032492,4f12c658-1e88-44a9-9689-18bc76d12047,5da809c9-cf2c-4562-a703-3d1b12927220,6145c5c6-c4ed-4b30-987f-e653337a0a18,744b6c64-1a96-4dcb-9216-8be6bdcfe7f3,84e6ec27-5a6b-463d-8681-045651b2af07,8eedae6b-ec01-4367-bd47-2081f9df8f33,8f6ff282-26eb-4eb5-89bd-df9010689ba6,9c8fa8a6-da33-4599-b835-24d0220c6139,a2494f89-4dba-472
a-9b20-c61d0a0512af,ad3b03b1-1cf7-4a54-a6df-eb7563ddbbea,b2b77f8b-0659-4996-8280-078e8b9463bb,b7819371-05b3-4eac-a229-54a49a852133,bafd9ab5-3cc7-4c21-b48f-186d1a8e5351,bff30357-4e2d-46a7-927e-707223885e25,c2d8bcd3-a488-4709-8d33-f9c000c54d51,c4e7702f-8831-4236-b6a8-6723a3d668f8,ca4a99ee-181d-488a-9eea-e0ef7e9b765e,ca6d0c9c-bc64-4888-b660-18ca49b597b9,ccf84af9-06f8-4bb1-b844-e4512907b8bd,d00d6c06-b2e2-407d-b294-d585efdb53ad,da8571f5-d34e-4e04-a7a8-b2696a4661e0,e1fd56ef-d7ee-4703-8fad-a90383cb4156,ebf37b99-cfbc-4168-a497-a8453d0e698a,f8008b3f-d0e6-474b-82c1-bb28a53b9e01,f8c3fa80-204b-4d1f-b782-358c648e48bd,fa2f213f-f63d-4828-a601-bfdfae84b8e9;SUPTYPE=AL,SR;SVLEN=61;STRANDS=+-;RE=22;REF_strand=1,0;AF=0.956522 GT:DR:DV 1/1:1:22 +chr9 60528555 29582_2 N N[chr17_GL000205v2_random:0[ . PASS PRECISE;SVMETHOD=Snifflesv1.0.11;STD_quant_start=1.61245;STD_quant_stop=0;Kurtosis_quant_start=6.96082;Kurtosis_quant_stop=7;SVTYPE=BND;RNAMES=0025a6d7-4f51-4eb7-90a6-03540aadf4e7,4839f229-9a4e-4c4c-8652-a5c6da4f9ae4,4d7d382d-6a1a-46cd-beb0-4b7028b78356,54cf27a5-d6b8-4f68-8f72-ae38badc6ea4,56756629-0e3f-4e9a-9b6d-34b2fa97837d,5d1be304-66fb-44a3-8873-a5e291ea8db2,69a5ce72-4ab8-48b4-97c6-8872a310c649,73179d48-8c79-4566-b55b-d69a557b88ab,84e94667-fbad-4f22-ba2c-54e7f96b2daf,8b536a68-ddb2-4b04-9ae3-87df2a939fd4,963e22ef-31dd-4f50-8e46-9d8c48417787,a1a46081-76e9-4e38-8cc3-160c59608548,c67f7f2a-a381-403a-848f-0d63bb1ddd2e,c9806188-0f3c-415b-92cb-92c1cae1a085,ef9cd5af-23cc-4ba8-8c3e-8518716b83c1;SUPTYPE=SR;SVLEN=0;STRANDS=+-;RE=15;REF_strand=0,0;AF=1 GT:DR:DV 1/1:0:15chr9 60528555 29582_2 N N[chr17_GL000205v2_random:0[ . 
PASS PRECISE;SVMETHOD=Snifflesv1.0.11;STD_quant_start=1.61245;STD_quant_stop=0;Kurtosis_quant_start=6.96082;Kurtosis_quant_stop=7;SVTYPE=BND;RNAMES=0025a6d7-4f51-4eb7-90a6-03540aadf4e7,4839f229-9a4e-4c4c-8652-a5c6da4f9ae4,4d7d382d-6a1a-46cd-beb0-4b7028b78356,54cf27a5-d6b8-4f68-8f72-ae38badc6ea4,56756629-0e3f-4e9a-9b6d-34b2fa97837d,5d1be304-66fb-44a3-8873-a5e291ea8db2,69a5ce72-4ab8-48b4-97c6-8872a310c649,73179d48-8c79-4566-b55b-d69a557b88ab,84e94667-fbad-4f22-ba2c-54e7f96b2daf,8b536a68-ddb2-4b04-9ae3-87df2a939fd4,963e22ef-31dd-4f50-8e46-9d8c48417787,a1a46081-76e9-4e38-8cc3-160c59608548,c67f7f2a-a381-403a-848f-0d63bb1ddd2e,c9806188-0f3c-415b-92cb-92c1cae1a085,ef9cd5af-23cc-4ba8-8c3e-8518716b83c1;SUPTYPE=SR;SVLEN=0;STRANDS=+-;RE=15;REF_strand=0,0;AF=1 GT:DR:DV 1/1:0:15 \ No newline at end of file diff --git a/tests/unit/test_tools_vcf.py b/tests/unit/test_tools_vcf.py index 8af3067f..c196418a 100644 --- a/tests/unit/test_tools_vcf.py +++ b/tests/unit/test_tools_vcf.py @@ -5,9 +5,9 @@ def test_read_vcf(): - header, df = pandas_vcf(get_data('delly_events.vcf')) - assert len(header) == 63 - assert df.shape[0] == 31 + header, df = pandas_vcf(get_data('sniffles.vcf')) + assert len(header) == 231 + assert df.shape[0] == 106 def test_convert_record(): From db96aec920c48ff686163879eb8bf3c70680d40b Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Fri, 28 Jan 2022 18:50:49 -0800 Subject: [PATCH 098/137] changed test case to include bp1 --- src/mavis/tools/vcf.py | 6 +++--- tests/unit/test_tools_vcf.py | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index d5eeba02..ee6995c5 100644 --- a/src/mavis/tools/vcf.py +++ b/src/mavis/tools/vcf.py @@ -211,7 +211,7 @@ def convert_record(record, record_mapping={}) -> List[Dict]: if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: std_row.update({'break1_position_end': 1}) - elif std_row['break2_position_end'] == 0 and 
std_row['break2_position_start'] == 1: + if std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1: std_row.update({'break2_position_end': 1}) if 'SVTYPE' in info: std_row[COLUMNS.event_type] = info['SVTYPE'] @@ -284,7 +284,7 @@ def pandas_vcf(input_file) -> Tuple[List[str], pd.DataFrame]: # read the data df = pd.read_csv( input_file, - sep="\t", + sep='\t', skiprows=len(header_lines), dtype={ 'CHROM': str, @@ -295,7 +295,7 @@ def pandas_vcf(input_file) -> Tuple[List[str], pd.DataFrame]: 'REF': str, 'ALT': str, }, - na_values=PANDAS_DEFAULT_NA_VALUES + ["."], + na_values=PANDAS_DEFAULT_NA_VALUES + ['.'], ) df = df.rename(columns={df.columns[0]: df.columns[0].replace('#', '')}) required_columns = ['CHROM', 'INFO', 'POS', 'REF', 'ALT', 'ID'] diff --git a/tests/unit/test_tools_vcf.py b/tests/unit/test_tools_vcf.py index c196418a..122a640d 100644 --- a/tests/unit/test_tools_vcf.py +++ b/tests/unit/test_tools_vcf.py @@ -12,8 +12,8 @@ def test_read_vcf(): def test_convert_record(): variant = VcfRecordType( - 9000, - 12000, + 1, + 0, 'chr14_KI270722v1_random', alts=['N[chr17_GL000205v2_random:0['], ref='N', @@ -32,6 +32,8 @@ def test_convert_record(): records = convert_record(variant) assert len(records) == 1 record = records[0] + assert record.get('break1_position_start') == 1 + assert record.get('break1_position_end') == 1 assert record.get('break2_position_start') == 1 assert record.get('break2_position_end') == 1 assert record.get('break2_chromosome') == 'chr17_GL000205v2_random' From ec1e54bfcd33ef17c837707a3a56d3ea7b79562f Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Sun, 30 Jan 2022 02:34:19 -0800 Subject: [PATCH 099/137] update hooks.py to import from config --- docs/hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/hooks.py b/docs/hooks.py index 44931755..02411323 100644 --- a/docs/hooks.py +++ b/docs/hooks.py @@ -4,7 +4,7 @@ from textwrap import dedent from markdown_refdocs.main import extract_to_markdown 
-from mavis.schemas import DEFAULTS +from mavis_config import DEFAULTS from mavis.util import ENV_VAR_PREFIX From 4e1c1e94c2c9dd873b8c82472e096b1ef40234e0 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 31 Jan 2022 12:10:15 -0800 Subject: [PATCH 100/137] fixed checks --- src/mavis/tools/vcf.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index ee6995c5..474fc5a1 100644 --- a/src/mavis/tools/vcf.py +++ b/src/mavis/tools/vcf.py @@ -156,6 +156,8 @@ def convert_record(record, record_mapping={}) -> List[Dict]: if info.get('SVTYPE') == 'BND': chr2, end, orient1, orient2, ref, alt = parse_bnd_alt(alt) + if end == 0: + end = 1 # telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 std_row[COLUMNS.break1_orientation] = orient1 std_row[COLUMNS.break2_orientation] = orient2 std_row[COLUMNS.untemplated_seq] = alt @@ -202,20 +204,6 @@ def convert_record(record, record_mapping={}) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1], } ) - - ''' - As per VCF 4.2 specifications (https://samtools.github.io/hts-specs/VCFv4.2.pdf): - A start_position = 1, end_position = 0 linkage indicates connections to telomeres - Change 0 a 1 since coordinates are 1-based and we cannot start before the start of a sequence - ''' - - if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: - std_row.update({'break1_position_end': 1}) - if std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1: - std_row.update({'break2_position_end': 1}) - if 'SVTYPE' in info: - std_row[COLUMNS.event_type] = info['SVTYPE'] - try: orient1, orient2 = info['CT'].split('to') connection_type = {'3': ORIENT.LEFT, '5': ORIENT.RIGHT, 'N': ORIENT.NS} From 4c3365fc93b660bf6edcca3b59bf98058d282c3e Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 31 Jan 2022 12:17:11 -0800 Subject: [PATCH 101/137] unfix deleted lines --- src/mavis/tools/vcf.py | 
4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index f8b6c58b..e52a0cde 100644 --- a/src/mavis/tools/vcf.py +++ b/src/mavis/tools/vcf.py @@ -204,6 +204,10 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1], } ) + + if 'SVTYPE' in info: + std_row[COLUMNS.event_type] = info['SVTYPE'] + try: orient1, orient2 = info['CT'].split('to') connection_type = {'3': ORIENT.LEFT, '5': ORIENT.RIGHT, 'N': ORIENT.NS} From 107253499fa080228c67138d6b3d81c3770a978e Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 31 Jan 2022 12:22:40 -0800 Subject: [PATCH 102/137] re-lint vcf file --- src/mavis/tools/vcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index e52a0cde..ff4c0cb0 100644 --- a/src/mavis/tools/vcf.py +++ b/src/mavis/tools/vcf.py @@ -204,7 +204,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1], } ) - + if 'SVTYPE' in info: std_row[COLUMNS.event_type] = info['SVTYPE'] From debbf27171661dbb48d0c917811fb2b64fbcede9 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 31 Jan 2022 14:43:18 -0800 Subject: [PATCH 103/137] change unit tests to accomodate changes --- tests/unit/test_tools_vcf.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/unit/test_tools_vcf.py b/tests/unit/test_tools_vcf.py index 122a640d..c4eac443 100644 --- a/tests/unit/test_tools_vcf.py +++ b/tests/unit/test_tools_vcf.py @@ -32,8 +32,5 @@ def test_convert_record(): records = convert_record(variant) assert len(records) == 1 record = records[0] - assert record.get('break1_position_start') == 1 - assert record.get('break1_position_end') == 1 - assert record.get('break2_position_start') == 1 assert record.get('break2_position_end') == 1 assert record.get('break2_chromosome') == 'chr17_GL000205v2_random' From 
847752cfc2f1191536b84b9ca62cdb9e34e2f1c4 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 1 Feb 2022 13:56:18 -0800 Subject: [PATCH 104/137] Add more type annotations --- src/mavis/align.py | 9 +- src/mavis/annotate/base.py | 23 +++-- src/mavis/annotate/file_io.py | 14 +-- src/mavis/annotate/fusion.py | 48 ++++----- src/mavis/annotate/genomic.py | 2 +- src/mavis/annotate/main.py | 4 +- src/mavis/annotate/protein.py | 12 +-- src/mavis/annotate/variant.py | 44 ++++---- src/mavis/assemble.py | 8 +- src/mavis/bam/cache.py | 4 +- src/mavis/bam/stats.py | 71 ++++++------- src/mavis/blat.py | 2 +- src/mavis/breakpoint.py | 20 ++-- src/mavis/cluster/cluster.py | 2 +- src/mavis/constants.py | 8 +- src/mavis/illustrate/elements.py | 168 +++++++++++++++++++------------ src/mavis/illustrate/scatter.py | 61 ++++++----- src/mavis/interval.py | 14 +-- src/mavis/main.py | 6 +- src/mavis/types.py | 7 +- src/mavis/util.py | 30 ++++-- src/mavis/validate/evidence.py | 4 +- 22 files changed, 317 insertions(+), 244 deletions(-) diff --git a/src/mavis/align.py b/src/mavis/align.py index f4c70712..43e67be6 100644 --- a/src/mavis/align.py +++ b/src/mavis/align.py @@ -5,7 +5,7 @@ import os import re import subprocess -from typing import Dict, List +from typing import TYPE_CHECKING, Dict, List import pysam @@ -17,6 +17,9 @@ from .types import ReferenceGenome from .util import logger +if TYPE_CHECKING: + from .bam.cache import BamCache + class SUPPORTED_ALIGNER(MavisNamespace): """ @@ -385,7 +388,7 @@ def call_paired_read_event(read1, read2, is_stranded=False): def align_sequences( sequences: Dict[str, str], - input_bam_cache, + input_bam_cache: BamCache, reference_genome: ReferenceGenome, aligner: str, aligner_reference: str, @@ -402,7 +405,7 @@ def align_sequences( Args: sequences: dictionary of sequences by name - input_bam_cache (BamCache): bam cache to be used as a template for reading the alignments + input_bam_cache: bam cache to be used as a template for reading the alignments 
reference_genome: the reference genome aligner (SUPPORTED_ALIGNER): the name of the aligner to be used aligner_reference: path to the aligner reference file diff --git a/src/mavis/annotate/base.py b/src/mavis/annotate/base.py index 950444fa..23b7205a 100644 --- a/src/mavis/annotate/base.py +++ b/src/mavis/annotate/base.py @@ -59,15 +59,22 @@ class BioInterval: strand: Optional[str] def __init__( - self, reference_object, start, end=None, name=None, seq=None, data=None, strand=None + self, + reference_object, + start: int, + end: int = None, + name: Optional[str] = None, + seq: Optional[str] = None, + data: Optional[Dict] = None, + strand: Optional[str] = None, ): """ Args: reference_object: the object this interval is on - start (int) start of the interval (inclusive) - end (int): end of the interval (inclusive) + start: start of the interval (inclusive) + end: end of the interval (inclusive) name: optional - seq (str): the seq relating to this interval + seq: the seq relating to this interval Example: >>> b = BioInterval('1', 12572784, 12578898, 'q22.2') @@ -88,13 +95,13 @@ def __init__( self.strand = strand @property - def start(self): - """*int*: the start position""" + def start(self) -> int: + """the start position""" return self.position.start @property - def end(self): - """*int*: the end position""" + def end(self) -> int: + """the end position""" return self.position.end def __getitem__(self, index): diff --git a/src/mavis/annotate/file_io.py b/src/mavis/annotate/file_io.py index 4b6ba264..661b8d7c 100644 --- a/src/mavis/annotate/file_io.py +++ b/src/mavis/annotate/file_io.py @@ -12,7 +12,7 @@ from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, translate from ..interval import Interval -from ..types import ReferenceGenome +from ..types import ReferenceAnnotations, ReferenceGenome from ..util import logger from .base import BioInterval, ReferenceName from .genomic import Exon, Gene, PreTranscript, Template, Transcript @@ -97,7 +97,7 @@ def 
parse_annotations_json( data, reference_genome: Optional[ReferenceGenome] = None, best_transcripts_only=False, -) -> Dict[str, List[Gene]]: +) -> ReferenceAnnotations: """ parses a json of annotation information into annotation objects """ @@ -112,7 +112,7 @@ def parse_annotations_json( ) # these can get super long raise AssertionError(short_msg) - genes_by_chr: Dict[str, List[Gene]] = {} + genes_by_chr: ReferenceAnnotations = {} for gene_dict in data['genes']: @@ -321,10 +321,10 @@ def __init__( ): """ Args: - *filepaths (str): list of paths to load - file_type (str): Type of file to load - eager_load (bool=False): load the files immeadiately - assert_exists (bool=False): check that all files exist + *filepaths: list of paths to load + file_type: Type of file to load + eager_load: load the files immeadiately + assert_exists: check that all files exist **opt: key word arguments to be passed to the load function and used as part of the file cache key Raises diff --git a/src/mavis/annotate/fusion.py b/src/mavis/annotate/fusion.py index b02e3259..2666e1d9 100644 --- a/src/mavis/annotate/fusion.py +++ b/src/mavis/annotate/fusion.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + from ..breakpoint import Breakpoint from ..constants import ORIENT, PRIME, PROTOCOL, STRAND, SVTYPE, reverse_complement from ..error import NotSpecifiedError @@ -6,14 +8,17 @@ from .genomic import Exon, PreTranscript, Transcript from .protein import Domain, Translation, calculate_orf +if TYPE_CHECKING: + from .variant import Annotation + -def determine_prime(transcript, breakpoint): +def determine_prime(transcript: Transcript, breakpoint: Breakpoint) -> int: """ determine the side of the transcript 5' or 3' which is 'kept' given the breakpoint Args: - transcript (Transcript): the transcript - breakpoint (Breakpoint): the breakpoint + transcript: the transcript + breakpoint: the breakpoint Returns: PRIME: 5' or 3' @@ -56,6 +61,9 @@ class FusionTranscript(PreTranscript): The preferred way to 
construct a FusionTranscript is through the build method. """ + last_five_prime_exon: Exon + first_three_prime_exon: Exon + def __init__(self): self.exon_mapping = {} self.exons = [] @@ -71,10 +79,10 @@ def __init__(self): self.break1 = None # first breakpoint position in the fusion transcript self.break2 = None # second breakpoint position in the fusion transcript - def exon_number(self, exon): + def exon_number(self, exon: Exon) -> int: """ Args: - exon (Exon): the exon to be numbered + exon: the exon to be numbered Returns: int: the number of the exon in the original transcript (prior to fusion) @@ -87,14 +95,7 @@ def map_region_to_genome(self, chr, interval_on_fusion, genome_interval, flipped self.mapping_to_chrs[Interval(interval_on_fusion[0], interval_on_fusion[1])] = chr @classmethod - def _build_single_gene_inversion( - cls, - ann, - reference_genome: ReferenceGenome, - min_orf_size, - max_orf_cap, - min_domain_mapping_match, - ): + def _build_single_gene_inversion(cls, ann, reference_genome: ReferenceGenome): """ builds a fusion transcript for a single gene inversion. Note that this is an incomplete fusion transcript and still requires translations and domain information to be added @@ -209,9 +210,7 @@ def _build_single_gene_inversion( return fusion_pre_transcript @classmethod - def _build_single_gene_duplication( - cls, ann, reference_genome, min_orf_size, max_orf_cap, min_domain_mapping_match - ): + def _build_single_gene_duplication(cls, ann, reference_genome): """ builds a fusion transcript for a single gene duplication. 
Note that this is an incomplete fusion transcript and still requires translations and domain information to be added @@ -288,20 +287,19 @@ def _build_single_gene_duplication( @classmethod def build( cls, - ann, + ann: 'Annotation', reference_genome: ReferenceGenome, min_orf_size=None, max_orf_cap=None, min_domain_mapping_match=None, - ): + ) -> 'FusionTranscript': """ Args: - ann (Annotation): the annotation object we want to build a FusionTranscript for - reference_genome: dict of reference sequence - by template/chr name + ann: the annotation object we want to build a FusionTranscript for + reference_genome: dict of reference sequence by template/chr name Returns: - FusionTranscript: the newly built fusion transcript + the newly built fusion transcript """ if not ann.transcript1 or not ann.transcript2: @@ -489,15 +487,13 @@ def build( pass return fusion_pre_transcript - def get_seq(self, reference_genome=None, ignore_cache=False): + def get_seq(self): return PreTranscript.get_seq(self) - def get_cdna_seq(self, splicing_pattern, reference_genome=None, ignore_cache=False): + def get_cdna_seq(self, splicing_pattern): """ Args: splicing_pattern (List[int]): the list of splicing positions - reference_genome (Dict[str,Bio.SeqRecord]): dict of reference seq - by template/chr name Returns: str: the spliced cDNA seq diff --git a/src/mavis/annotate/genomic.py b/src/mavis/annotate/genomic.py index 22f6831d..700b045c 100644 --- a/src/mavis/annotate/genomic.py +++ b/src/mavis/annotate/genomic.py @@ -599,7 +599,7 @@ class Transcript(BioInterval): def __init__( self, pre_transcript: PreTranscript, - splicing_patt: List[int], + splicing_patt: SplicingPattern, seq: Optional[str] = None, translations: Optional[List[Translation]] = None, ): diff --git a/src/mavis/annotate/main.py b/src/mavis/annotate/main.py index 9b08f34e..ed426a04 100644 --- a/src/mavis/annotate/main.py +++ b/src/mavis/annotate/main.py @@ -118,8 +118,8 @@ def main( ): """ Args: - inputs (List[str]): list of input 
files to read - output (str): path to the output directory + inputs: list of input files to read + output: path to the output directory """ reference_genome = ReferenceFile.load_from_config(config, 'reference_genome') annotations = ReferenceFile.load_from_config(config, 'annotations') diff --git a/src/mavis/annotate/protein.py b/src/mavis/annotate/protein.py index 50d31f33..c7e711c9 100644 --- a/src/mavis/annotate/protein.py +++ b/src/mavis/annotate/protein.py @@ -263,11 +263,11 @@ def __init__( describes the splicing pattern and cds start and end with reference to a particular transcript Args: - start (int): start of the coding sequence (cds) relative to the start of the first exon in the transcript - end (int): end of the coding sequence (cds) relative to the start of the first exon in the transcript - transcript (Transcript): the transcript this is a Translation of - domains (List[Domain]): a list of the domains on this translation - sequence (str): the cds sequence + start: start of the coding sequence (cds) relative to the start of the first exon in the transcript + end: end of the coding sequence (cds) relative to the start of the first exon in the transcript + transcript: the transcript this is a Translation of + domains: a list of the domains on this translation + sequence: the cds sequence """ domains = [] if domains is None else domains BioInterval.__init__( @@ -334,7 +334,7 @@ def convert_genomic_to_cds(self, pos: int) -> int: raise IndexError('conversion failed. 
position is outside the exonic region') return cds - def convert_genomic_to_nearest_cds(self, pos: str) -> Tuple[int, int]: + def convert_genomic_to_nearest_cds(self, pos: int) -> Tuple[int, int]: """ converts a genomic position to its cds equivalent or (if intronic) the nearest cds and shift diff --git a/src/mavis/annotate/variant.py b/src/mavis/annotate/variant.py index bfa9e3a0..aabb2488 100644 --- a/src/mavis/annotate/variant.py +++ b/src/mavis/annotate/variant.py @@ -8,7 +8,7 @@ from ..constants import COLUMNS, GENE_PRODUCT_TYPE, PROTOCOL, STOP_AA, STRAND, SVTYPE from ..error import NotSpecifiedError from ..interval import Interval -from ..types import ReferenceGenome +from ..types import Annotations, ReferenceGenome from ..util import logger from .fusion import FusionTranscript, determine_prime from .genomic import Gene, IntergenicRegion, PreTranscript, Transcript @@ -36,15 +36,20 @@ def validation_id(self) -> Optional[str]: return self.data.get(COLUMNS.validation_id) def __init__( - self, bpp: BreakpointPair, transcript1=None, transcript2=None, proximity=5000, **kwargs + self, + bpp: BreakpointPair, + transcript1: Optional[Transcript] = None, + transcript2: Optional[Transcript] = None, + proximity: int = 5000, + **kwargs, ): """ Holds a breakpoint call and a set of transcripts, other information is gathered relative to these Args: - bpp (BreakpointPair): the breakpoint pair call. Will be adjusted and then stored based on the transcripts - transcript1 (Transcript): transcript at the first breakpoint - transcript2 (Transcript): Transcript at the second breakpoint + bpp: the breakpoint pair call. 
Will be adjusted and then stored based on the transcripts + transcript1: transcript at the first breakpoint + transcript2: Transcript at the second breakpoint """ # narrow the breakpoint windows by the transcripts being used for annotation temp = bpp.break1 if transcript1 is None else bpp.break1 & transcript1 @@ -86,12 +91,12 @@ def __init__( self.proximity = proximity self.fusion = None - def add_gene(self, input_gene): + def add_gene(self, input_gene: Gene): """ adds a input_gene to the current set of annotations. Checks which set it should be added to Args: - input_gene (input_gene): the input_gene being added + input_gene: the input_gene being added """ if input_gene.chr not in [self.break1.chr, self.break2.chr]: raise AttributeError( @@ -167,12 +172,12 @@ def add_gene(self, input_gene): self.genes_proximal_to_break2 = temp - def flatten(self): + def flatten(self) -> Dict: """ generates a dictionary of the annotation information as strings Returns: - Dict[str,str]: dictionary of attribute names and values + dictionary of attribute names and values """ row = BreakpointPair.flatten(self) row.update( @@ -245,16 +250,16 @@ def flatten(self): pass return row - def single_transcript(self): + def single_transcript(self) -> bool: return bool(self.transcript1 == self.transcript2 and self.transcript1) -def flatten_fusion_translation(translation): +def flatten_fusion_translation(translation: Translation) -> Dict: """ for a given fusion product (translation) gather the information to be output to the tabbed files Args: - translation (Translation): the translation which is on the fusion transcript + translation: the translation which is on the fusion transcript Returns: dict: the dictionary of column names to values """ @@ -534,11 +539,12 @@ def flatten_fusion_transcript(spliced_fusion_transcript): return row -def overlapping_transcripts(ref_ann, breakpoint: Breakpoint) -> List[PreTranscript]: +def overlapping_transcripts( + ref_ann: ReferenceAnnotations, breakpoint: 
Breakpoint +) -> List[PreTranscript]: """ Args: - ref_ann (Dict[str,List[Gene]]): the reference list of genes split - by chromosome + ref_ann: the reference list of genes split by chromosome breakpoint: the breakpoint in question Returns: a list of possible transcripts @@ -558,7 +564,7 @@ def overlapping_transcripts(ref_ann, breakpoint: Breakpoint) -> List[PreTranscri def _gather_breakpoint_annotations( - ref_ann: Dict[str, List[Gene]], breakpoint: Breakpoint + ref_ann: Annotations, breakpoint: Breakpoint ) -> Tuple[ List[Union[PreTranscript, IntergenicRegion]], List[Union[PreTranscript, IntergenicRegion]] ]: @@ -651,9 +657,7 @@ def _gather_breakpoint_annotations( ) -def _gather_annotations( - ref: Dict[str, List[Gene]], bp: BreakpointPair, proximity=None -) -> List[Annotation]: +def _gather_annotations(ref: Annotations, bp: BreakpointPair, proximity=None) -> List[Annotation]: """ each annotation is defined by the annotations selected at the breakpoints the other annotations are given relative to this @@ -861,7 +865,7 @@ def choose_transcripts_by_priority(ann_list: List[Annotation]) -> List[Annotatio def annotate_events( bpps: List[BreakpointPair], - annotations: Dict[str, List[Gene]], + annotations: Annotations, reference_genome: ReferenceGenome, max_proximity: int = 5000, min_orf_size: int = 200, diff --git a/src/mavis/assemble.py b/src/mavis/assemble.py index c87b4208..235078fa 100644 --- a/src/mavis/assemble.py +++ b/src/mavis/assemble.py @@ -467,16 +467,16 @@ def assemble( return contigs -def kmers(s, size): +def kmers(s: str, size: int) -> List[str]: """ for a sequence, compute and return a list of all kmers of a specified size Args: - s (str): the input sequence - size (int): the size of the kmers + s: the input sequence + size: the size of the kmers Returns: - List[str]: the list of kmers + the list of kmers Example: >>> kmers('abcdef', 2) diff --git a/src/mavis/bam/cache.py b/src/mavis/bam/cache.py index aee2cc99..2724ed0a 100644 --- 
a/src/mavis/bam/cache.py +++ b/src/mavis/bam/cache.py @@ -225,7 +225,9 @@ def fetch_from_bins( chrom = 'chr' + chrom if chrom not in self.fh.references: raise KeyError('bam file does not contain the expected reference', input_chrom) - bins = self.__class__._generate_fetch_bins(start, stop, sample_bins, min_bin_size) + bins: List[Interval] = self.__class__._generate_fetch_bins( + start, stop, sample_bins, min_bin_size + ) running_surplus = 0 temp_cache = set() for fstart, fend in bins: diff --git a/src/mavis/bam/stats.py b/src/mavis/bam/stats.py index ea6d1603..5890870a 100644 --- a/src/mavis/bam/stats.py +++ b/src/mavis/bam/stats.py @@ -2,11 +2,16 @@ import math import os import statistics as stats +from typing import TYPE_CHECKING from ..constants import STRAND +from ..types import ReferenceAnnotations from ..util import logger from .read import sequenced_strand +if TYPE_CHECKING: + from .cache import BamCache + os.environ["OMP_NUM_THREADS"] = "4" # export OMP_NUM_THREADS=4 os.environ["OPENBLAS_NUM_THREADS"] = "4" # export OPENBLAS_NUM_THREADS=4 os.environ["MKL_NUM_THREADS"] = "4" # export MKL_NUM_THREADS=6 @@ -103,29 +108,28 @@ def __add__(self, other): def compute_transcriptome_bam_stats( - bam_cache, - annotations, - sample_size, - min_mapping_quality=1, - stranded=True, - sample_cap=10000, - distribution_fraction=0.97, -): + bam_cache: 'BamCache', + annotations: ReferenceAnnotations, + sample_size: int, + min_mapping_quality: int = 1, + stranded: bool = True, + sample_cap: int = 10000, + distribution_fraction: float = 0.97, +) -> BamStats: """ computes various statistical measures relating the input bam file Args: - bam_file_handle (BamCache): the input bam file handle - annotations (object): see :func:`mavis.annotate.load_annotations` - sample_size (int): the number of genes to compute stats over - log (Callable): outputs logging information - min_mapping_quality (int): the minimum mapping quality for a read to be used - stranded (bool): if True then reads 
must match the gene strand - sample_cap (int): maximum number of reads to collect for any given sample region - distribution_fraction (float): the proportion of the distribution to use in computing stdev + bam_file_handle: the input bam file handle + annotations: see :func:`mavis.annotate.load_annotations` + sample_size: the number of genes to compute stats over + min_mapping_quality: the minimum mapping quality for a read to be used + stranded: if True then reads must match the gene strand + sample_cap: maximum number of reads to collect for any given sample region + distribution_fraction: the proportion of the distribution to use in computing stdev Returns: - BamStats: the fragment size median, stdev and the read length in a object + the fragment size median, stdev and the read length in a object """ import numpy as np @@ -151,7 +155,7 @@ def compute_transcriptome_bam_stats( read_lengths = [] for gene in genes: for read in bam_cache.fetch( - gene.chr, gene.start, gene.end, cache_if=lambda x: False, limit=sample_cap + gene.chr, gene.start, gene.end, cache_if=lambda _: False, limit=sample_cap ): if any( [ @@ -213,27 +217,26 @@ def compute_transcriptome_bam_stats( def compute_genome_bam_stats( - bam_file_handle, - sample_bin_size, - sample_size, - min_mapping_quality=1, - sample_cap=10000, - distribution_fraction=0.99, -): + bam_file_handle: 'BamCache', + sample_bin_size: int, + sample_size: int, + min_mapping_quality: int = 1, + sample_cap: int = 10000, + distribution_fraction: float = 0.99, +) -> BamStats: """ computes various statistical measures relating the input bam file Args: - bam_file_handle (pysam.AlignmentFile): the input bam file handle - sample_bin_size (int): how large to make the sample bin (in bp) - sample_size (int): the number of genes to compute stats over - log (Callable): outputs logging information - min_mapping_quality (int): the minimum mapping quality for a read to be used - sample_cap (int): maximum number of reads to collect for any given 
sample region - distribution_fraction (float): the proportion of the distribution to use in computing stdev + bam_file_handle: the input bam file handle + sample_bin_size: how large to make the sample bin (in bp) + sample_size: the number of genes to compute stats over + min_mapping_quality: the minimum mapping quality for a read to be used + sample_cap: maximum number of reads to collect for any given sample region + distribution_fraction: the proportion of the distribution to use in computing stdev Returns: - BamStats: the fragment size median, stdev and the read length in a object + the fragment size median, stdev and the read length in a object """ import numpy as np @@ -254,7 +257,7 @@ def compute_genome_bam_stats( read_lengths = [] for bin_chr, bin_start, bin_end in bins: for read in bam_file_handle.fetch( - bin_chr, bin_start, bin_end, limit=sample_cap, cache_if=lambda x: False + bin_chr, bin_start, bin_end, limit=sample_cap, cache_if=lambda _: False ): if any( [ diff --git a/src/mavis/blat.py b/src/mavis/blat.py index f379df1b..cbfeaace 100644 --- a/src/mavis/blat.py +++ b/src/mavis/blat.py @@ -220,7 +220,7 @@ def pslx_row_to_pysam( Args: row: a row object from the 'read_pslx' method - bam_cache (BamCache): the bam file/cache to use as a template for creating reference_id from chr name + bam_cache: the bam file/cache to use as a template for creating reference_id from chr name reference_genome: reference sequence by template/chr name """ diff --git a/src/mavis/breakpoint.py b/src/mavis/breakpoint.py index b28e4e4c..ac3b188f 100644 --- a/src/mavis/breakpoint.py +++ b/src/mavis/breakpoint.py @@ -1,5 +1,3 @@ -from __future__ import division - from copy import copy as _copy from typing import Callable, Dict, List, Optional, Set, Tuple @@ -24,15 +22,23 @@ class for storing information about a SV breakpoint def key(self): return (self.chr, self.start, self.end, self.orient, self.strand) - def __init__(self, chr, start, end=None, orient=ORIENT.NS, 
strand=STRAND.NS, seq=None): + def __init__( + self, + chr: str, + start: int, + end: Optional[int] = None, + orient=ORIENT.NS, + strand=STRAND.NS, + seq: Optional[str] = None, + ): """ Args: - chr (str): the chromosome - start (int): the genomic position of the breakpoint - end (int): if the breakpoint is uncertain (a range) then specify the end of the range here + chr: the chromosome + start: the genomic position of the breakpoint + end: if the breakpoint is uncertain (a range) then specify the end of the range here orient (ORIENT): the orientation (which side is retained at the break) strand (STRAND): the strand - seq (str): the seq + seq: the seq Examples: >>> Breakpoint('1', 1, 2) diff --git a/src/mavis/cluster/cluster.py b/src/mavis/cluster/cluster.py index f2468cb1..0627ccb2 100644 --- a/src/mavis/cluster/cluster.py +++ b/src/mavis/cluster/cluster.py @@ -150,7 +150,7 @@ def merge_by_union( group_key: BreakpointPairGroupKey, weight_adjustment: int = 10, cluster_radius: int = 200, -) -> List[BreakpointPair]: +) -> Dict[BreakpointPairGroupKey, List[BreakpointPair]]: """ for a given set of breakpoint pairs, merge the union of all pairs that are within the given distance (cluster_radius) diff --git a/src/mavis/constants.py b/src/mavis/constants.py index 35a4731b..ba656d4e 100644 --- a/src/mavis/constants.py +++ b/src/mavis/constants.py @@ -68,12 +68,12 @@ class SPLICE_TYPE(MavisNamespace): """the number of bases making up a codon""" -def reverse_complement(s): +def reverse_complement(s: str) -> str: """ wrapper for the Bio.Seq reverse_complement method Args: - s (str): the input DNA sequence + s: the input DNA sequence Returns: str: the reverse complement of the input sequence @@ -88,8 +88,8 @@ def reverse_complement(s): input_string = str(s) if not re.match('^[A-Za-z]*$', input_string): raise ValueError('unexpected sequence format. 
cannot reverse complement', input_string) - input_string = Seq(input_string, DNA_ALPHABET) - return str(input_string.reverse_complement()) + seq = Seq(input_string, DNA_ALPHABET) + return str(seq.reverse_complement()) def translate(s: str, reading_frame: int = 0) -> str: diff --git a/src/mavis/illustrate/elements.py b/src/mavis/illustrate/elements.py index 32995761..ee435ca2 100644 --- a/src/mavis/illustrate/elements.py +++ b/src/mavis/illustrate/elements.py @@ -3,7 +3,9 @@ """ import re -from typing import List, Tuple +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple + +import svgwrite from ..annotate.variant import FusionTranscript from ..constants import CODON_SIZE, GIEMSA_STAIN, ORIENT, STRAND @@ -18,6 +20,10 @@ split_intervals_into_tracks, ) +if TYPE_CHECKING: + from ..annotate.base import BioInterval + from ..annotate.genomic import Exon, Gene + from ..breakpoint import Breakpoint # draw gene level view # draw gene box HEX_WHITE = '#FFFFFF' @@ -26,7 +32,7 @@ def draw_legend( config: DiagramSettings, canvas, swatches: List[Tuple[str, str]], border: bool = True -): +) -> svgwrite.container.Group: """ generates an svg group object representing the legend """ @@ -88,14 +94,14 @@ def draw_legend( def draw_exon_track( config: DiagramSettings, - canvas, + canvas: svgwrite.drawing.Drawing, transcript, mapping: IntervalMapping, colors=None, genomic_min: int = None, genomic_max: int = None, translation=None, -): +) -> svgwrite.container.Group: """ """ colors = {} if colors is None else colors main_group = canvas.g(class_='exon_track') @@ -178,7 +184,7 @@ def draw_exon_track( def draw_transcript_with_translation( config: DiagramSettings, - canvas, + canvas: svgwrite.drawing.Drawing, translation, labels, colors, @@ -186,7 +192,7 @@ def draw_transcript_with_translation( reference_genome=None, genomic_min=None, genomic_max=None, -): +) -> svgwrite.container.Group: main_group = canvas.g() pre_transcript = translation.transcript.reference_object spl_tx = 
translation.transcript @@ -444,17 +450,16 @@ def draw_transcript_with_translation( def draw_ustranscript( - config, - canvas, + config: DiagramSettings, + canvas: svgwrite.drawing.Drawing, pre_transcript, - target_width=None, - breakpoints=[], + target_width: Optional[int] = None, + breakpoints: List['Breakpoint'] = [], labels=LabelMapping(), colors={}, mapping=None, - reference_genome=None, masks=None, -): +) -> svgwrite.container.Group: """ builds an svg group representing the transcript. Exons are drawn in a track with the splicing information and domains are drawn in separate tracks below @@ -462,17 +467,13 @@ def draw_ustranscript( if there are multiple splicing variants then multiple exon tracks are drawn Args: - canvas (svgwrite.drawing.Drawing): the main svgwrite object used to create new svg elements - target_width (int): the target width of the diagram + canvas: the main svgwrite object used to create new svg elements + target_width: the target width of the diagram pre_transcript (Transcript): the transcript being drawn - exon_color (str): the color being used for the fill of the exons - utr_color (str): the color for the fill of the UTR regions - abrogated_splice_sites (List[int]): list of positions to ignore as splice sites - breakpoints (List[Breakpoint]): the breakpoints to overlay + breakpoints: the breakpoints to overlay Return: - svgwrite.container.Group: the group element for the transcript diagram - Has the added parameters of labels, height, and mapping + the group element for the transcript diagram Has the added parameters of labels, height, and mapping """ if pre_transcript.get_strand() not in [STRAND.POS, STRAND.NEG]: raise NotSpecifiedError('strand must be positive or negative to draw the pre_transcript') @@ -610,31 +611,29 @@ def draw_ustranscript( def draw_genes( - config, - canvas, - genes, - target_width, - breakpoints=None, - colors=None, - labels=None, - plots=None, - masks=None, -): + config: DiagramSettings, + canvas: 
svgwrite.drawing.Drawing, + genes: List['Gene'], + target_width: int, + breakpoints: Optional[List[Breakpoint]] = None, + colors: Optional[Dict[str, 'Gene']] = None, + labels: Optional[LabelMapping] = None, + plots: Optional[List] = None, + masks: Optional[List[Interval]] = None, +) -> svgwrite.container.Group: """ draws the genes given in order of their start position trying to minimize the number of tracks required to avoid overlap Args: - canvas (svgwrite.drawing.Drawing): the main svgwrite object used to create new svg elements - target_width (int): the target width of the diagram - genes (List[Gene]): the list of genes to draw - breakpoints (List[Breakpoint]): the breakpoints to overlay - colors (Dict[str,Gene]): dictionary of the colors assigned to each Gene as - fill + canvas: the main svgwrite object used to create new svg elements + target_width: the target width of the diagram + genes: the list of genes to draw + breakpoints: the breakpoints to overlay + colors: dictionary of the colors assigned to each Gene as fill Return: - svgwrite.container.Group: the group element for the diagram. - Has the added parameters of labels, height, and mapping + the group element for the diagram. 
Has the added parameters of labels, height, and mapping """ # mutable default argument parameters breakpoints = [] if breakpoints is None else breakpoints @@ -752,15 +751,22 @@ def draw_genes( return main_group -def draw_vmarker(config, canvas, marker, width, height, label='', color=None): +def draw_vmarker( + config: DiagramSettings, + canvas: svgwrite.drawing.Drawing, + marker: 'BioInterval', + width: int, + height: int, + label='', + color=None, +) -> svgwrite.container.Group: """ Args: - canvas (svgwrite.drawing.Drawing): the main svgwrite object used to create new svg elements - breakpoint (Breakpoint): the breakpoint to draw - width (int): the pixel width - height (int): the pixel height + canvas: the main svgwrite object used to create new svg elements + width: the pixel width + height: the pixel height Return: - svgwrite.container.Group: the group element for the diagram + the group element for the diagram """ color = config.marker_color if color is None else color g = canvas.g(class_='marker') @@ -789,15 +795,22 @@ def draw_vmarker(config, canvas, marker, width, height, label='', color=None): return g -def draw_breakpoint(config, canvas, breakpoint, width, height, label=''): +def draw_breakpoint( + config: DiagramSettings, + canvas: svgwrite.drawing.Drawing, + breakpoint: Breakpoint, + width: int, + height: int, + label: str = '', +) -> svgwrite.container.Group: """ Args: - canvas (svgwrite.drawing.Drawing): the main svgwrite object used to create new svg elements - breakpoint (Breakpoint): the breakpoint to draw - width (int): the pixel width - height (int): the pixel height + canvas: the main svgwrite object used to create new svg elements + breakpoint: the breakpoint to draw + width: the pixel width + height: the pixel height Return: - svgwrite.container.Group: the group element for the diagram + the group element for the diagram """ g = canvas.g(class_='breakpoint') y = config.padding + config.breakpoint_label_font_size / 2 @@ -841,19 +854,28 @@ def 
draw_breakpoint(config, canvas, breakpoint, width, height, label=''): return g -def draw_exon(config, canvas, exon, width, height, fill, label='', translation=None): +def draw_exon( + config: DiagramSettings, + canvas: svgwrite.drawing.Drawing, + exon: 'Exon', + width: int, + height: int, + fill: str, + label: str = '', + translation=None, +) -> svgwrite.container.Group: """ generates the svg object representing an exon Args: - canvas (svgwrite.drawing.Drawing): the main svgwrite object used to create new svg elements + canvas: the main svgwrite object used to create new svg elements exon (Exon): the exon to draw - width (int): the pixel width - height (int): the pixel height - fill (str): the fill color to use for the exon + width: the pixel width + height: the pixel height + fill: the fill color to use for the exon Return: - svgwrite.container.Group: the group element for the diagram + the group element for the diagram Todo: add markers for exons with abrogated splice sites @@ -892,13 +914,19 @@ def draw_exon(config, canvas, exon, width, height, fill, label='', translation=N def draw_template( - config, canvas, template, target_width, labels=None, colors=None, breakpoints=None -): + config: DiagramSettings, + canvas: svgwrite.drawing.Drawing, + template, + target_width, + labels=None, + colors=None, + breakpoints=None, +) -> svgwrite.container.Group: """ Creates the template/chromosome illustration Return: - svgwrite.container.Group: the group element for the diagram + the group element for the diagram """ labels = LabelMapping() if labels is None else labels @@ -1019,19 +1047,27 @@ def draw_template( return group -def draw_gene(config, canvas, gene, width, height, fill, label='', reference_genome=None): +def draw_gene( + config: DiagramSettings, + canvas: svgwrite.drawing.Drawing, + gene: 'Gene', + width: int, + height: int, + fill: str, + label: str = '', +) -> svgwrite.container.Group: """ generates the svg object representing a gene Args: - canvas 
(svgwrite.drawing.Drawing): the main svgwrite object used to create new svg elements - gene (Gene): the gene to draw - width (int): the pixel width - height (int): the pixel height - fill (str): the fill color to use for the gene + canvas: the main svgwrite object used to create new svg elements + gene: the gene to draw + width: the pixel width + height: the pixel height + fill: the fill color to use for the gene Return: - svgwrite.container.Group: the group element for the diagram + the group element for the diagram """ group = canvas.g(class_='gene') diff --git a/src/mavis/illustrate/scatter.py b/src/mavis/illustrate/scatter.py index fd78a66f..b71b52ac 100644 --- a/src/mavis/illustrate/scatter.py +++ b/src/mavis/illustrate/scatter.py @@ -1,39 +1,42 @@ import os +from typing import Optional + +import svgwrite from ..bam.read import pileup, sequenced_strand -from ..interval import Interval +from ..interval import Interval, IntervalMapping from ..util import logger +from .constants import DiagramSettings def bam_to_scatter( - bam_file, - chrom, - start, - end, + bam_file: str, + chrom: str, + start: int, + end: int, density, - strand=None, - axis_name=None, - ymax=None, - min_mapping_quality=0, - strand_determining_read=2, - ymax_color='#FF0000', -): + strand: Optional[str] = None, + axis_name: Optional[str] = None, + ymax: Optional[int] = None, + min_mapping_quality: int = 0, + strand_determining_read: int = 2, + ymax_color: str = '#FF0000', +) -> 'ScatterPlot': """ pull data from a bam file to set up a scatter plot of the pileup Args: - bam_file (str): path to the bam file - chrom (str): chromosome name - start (int): genomic start position for the plot - end (int): genomic end position for the plot - bin_size (int): number of genomic positions to group together and average to reduce data + bam_file: path to the bam file + chrom: chromosome name + start: genomic start position for the plot + end: genomic end position for the plot strand (STRAND): expected strand - 
axis_name (str): axis name - ymax (int): maximum value to plot the y axis - min_mapping_quality (int): minimum mapping quality for reads to be considered in the plot + axis_name: axis name + ymax: maximum value to plot the y axis + min_mapping_quality: minimum mapping quality for reads to be considered in the plot Returns: - ScatterPlot: the scatter plot representing the bam pileup + the scatter plot representing the bam pileup """ import pysam @@ -123,16 +126,20 @@ def __init__( self.density = density -def draw_scatter(ds, canvas, plot, xmapping): +def draw_scatter( + ds: DiagramSettings, + canvas: svgwrite.drawing.Drawing, + plot: ScatterPlot, + xmapping: IntervalMapping, +) -> svgwrite.container.Group: """ given a xmapping, draw the scatter plot svg group Args: - ds (DiagramSettings): the settings/constants to use for building the svg - canvas (svgwrite.canvas): the svgwrite object used to create new svg elements - plot (ScatterPlot): the plot to be drawn - xmapping (Dict[Interval,Interval]): - dict used for conversion of coordinates in the xaxis to pixel positions + ds: the settings/constants to use for building the svg + canvas: the svgwrite object used to create new svg elements + plot: the plot to be drawn + xmapping: dict used for conversion of coordinates in the xaxis to pixel positions """ from shapely.geometry import Point as sPoint diff --git a/src/mavis/interval.py b/src/mavis/interval.py index 754d4ba0..3fdd205d 100644 --- a/src/mavis/interval.py +++ b/src/mavis/interval.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Tuple +from typing import List, Optional, Tuple class Interval: @@ -10,9 +10,9 @@ class Interval: def __init__(self, start: int, end: Optional[int] = None, freq: int = 1, number_type=None): """ Args: - start (int): the start of the interval (inclusive) - end (int): the end of the interval (inclusive) - freq (int): the frequency or weight of the interval + start: the start of the interval (inclusive) + end: the end of the 
interval (inclusive) + freq: the frequency or weight of the interval """ self.start = start self.end = end if end is not None else start @@ -98,13 +98,13 @@ def __getitem__(self, index): raise IndexError('index input accessor is out of bounds: 1 or 2 only', index) @classmethod - def overlaps(cls, first, other): + def overlaps(cls, first: 'Interval', other: 'Interval') -> bool: """ checks if two intervals have any portion of their given ranges in common Args: - first (Interval): an interval to be compared - other (Interval): an interval to be compared + first: an interval to be compared + other: an interval to be compared Example: >>> Interval.overlaps(Interval(1, 4), Interval(5, 7)) diff --git a/src/mavis/main.py b/src/mavis/main.py index 5c7c8ecd..c7e94bbe 100644 --- a/src/mavis/main.py +++ b/src/mavis/main.py @@ -6,7 +6,7 @@ import platform import sys import time -from typing import Dict +from typing import Dict, List, Optional from mavis_config import validate_config from mavis_config.constants import SUBCOMMAND @@ -160,13 +160,13 @@ def create_parser(argv): return parser, parser.parse_args(argv) -def main(argv=None): +def main(argv: Optional[List[str]] = None): """ sets up the parser and checks the validity of command line args loads reference files and redirects into subcommand main functions Args: - argv (list): List of arguments, defaults to command line arguments + argv: List of arguments, defaults to command line arguments """ if argv is None: # need to do at run time or patching will not behave as expected argv = sys.argv[1:] diff --git a/src/mavis/types.py b/src/mavis/types.py index 23e3de15..080c69ac 100644 --- a/src/mavis/types.py +++ b/src/mavis/types.py @@ -2,10 +2,13 @@ Helper classes for type hints """ -from typing import Dict, List, Tuple +from typing import TYPE_CHECKING, Dict, List, Tuple from Bio.SeqRecord import SeqRecord -ReferenceGenome = Dict[str, SeqRecord] +if TYPE_CHECKING: + from .annotate.genomic import Gene +ReferenceGenome = 
Dict[str, SeqRecord] +ReferenceAnnotations = Dict[str, List['Gene']] CigarTuples = List[Tuple[int, int]] diff --git a/src/mavis/util.py b/src/mavis/util.py index 7009cb48..d4d34508 100644 --- a/src/mavis/util.py +++ b/src/mavis/util.py @@ -4,7 +4,7 @@ import os import re import time -from typing import Any, Callable, Dict, List, Set +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set import pandas as pd from mavis_config import bash_expands @@ -25,6 +25,9 @@ from .error import InvalidRearrangement from .interval import Interval +if TYPE_CHECKING: + from mavis.annotate.base import BioInterval + ENV_VAR_PREFIX = 'MAVIS_' logger = logging.getLogger('mavis') @@ -143,13 +146,15 @@ def mkdirp(dirname): return dirname -def filter_on_overlap(bpps, regions_by_reference_name): +def filter_on_overlap( + bpps: List[BreakpointPair], regions_by_reference_name: Dict[str, List['BioInterval']] +): """ filter a set of breakpoint pairs based on overlap with a set of genomic regions Args: - bpps (List[mavis.breakpoint.BreakpointPair]): list of breakpoint pairs to be filtered - regions_by_reference_name (Dict[str,List[mavis.annotate.base.BioInterval]]): regions to filter against + bpps: list of breakpoint pairs to be filtered + regions_by_reference_name: regions to filter against """ logger.info(f'filtering from {len(bpps)} using overlaps with regions filter') failed = [] @@ -175,7 +180,9 @@ def filter_on_overlap(bpps, regions_by_reference_name): return passed, failed -def read_inputs(inputs, required_columns=[], **kwargs): +def read_inputs( + inputs: List[str], required_columns: List[str] = [], **kwargs +) -> List[BreakpointPair]: bpps = [] for finput in bash_expands(*inputs): @@ -237,18 +244,19 @@ def get_connected_components(adj_matrix): return components -def generate_complete_stamp(output_dir, prefix='MAVIS.', start_time=None): +def generate_complete_stamp( + output_dir: str, prefix: str = 'MAVIS.', start_time: Optional[int] = None +) -> str: """ writes a 
complete stamp, optionally including the run time if start_time is given Args: - output_dir (str): path to the output dir the stamp should be written in - log (Callable): function to print logging messages to - prefix (str): prefix for the stamp name - start_time (int): the start time + output_dir: path to the output dir the stamp should be written in + prefix: prefix for the stamp name + start_time: the start time Return: - str: path to the complete stamp + path to the complete stamp Example: >>> generate_complete_stamp('some_output_dir') diff --git a/src/mavis/validate/evidence.py b/src/mavis/validate/evidence.py index 610c4206..cc89725f 100644 --- a/src/mavis/validate/evidence.py +++ b/src/mavis/validate/evidence.py @@ -75,9 +75,7 @@ def __init__(self, *pos, **kwargs): self.compatible_window1 = self.generate_window(compt_break1) self.compatible_window2 = self.generate_window(compt_break2) - def compute_fragment_size( - self, read: pysam.AlignedSegment, mate: Optional[pysam.AlignedSegment] = None - ): + def compute_fragment_size(self, read: pysam.AlignedSegment): return Interval(abs(read.template_length)) From f7038ee4a3824a63906c3dab7387b50ec58fb6c1 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 1 Feb 2022 14:07:34 -0800 Subject: [PATCH 105/137] Fix type checking conditional imports --- src/mavis/align.py | 2 +- src/mavis/annotate/variant.py | 10 ++++++---- src/mavis/illustrate/elements.py | 3 ++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/mavis/align.py b/src/mavis/align.py index 43e67be6..223afddb 100644 --- a/src/mavis/align.py +++ b/src/mavis/align.py @@ -388,7 +388,7 @@ def call_paired_read_event(read1, read2, is_stranded=False): def align_sequences( sequences: Dict[str, str], - input_bam_cache: BamCache, + input_bam_cache: 'BamCache', reference_genome: ReferenceGenome, aligner: str, aligner_reference: str, diff --git a/src/mavis/annotate/variant.py b/src/mavis/annotate/variant.py index aabb2488..dd92d84d 100644 --- 
a/src/mavis/annotate/variant.py +++ b/src/mavis/annotate/variant.py @@ -8,7 +8,7 @@ from ..constants import COLUMNS, GENE_PRODUCT_TYPE, PROTOCOL, STOP_AA, STRAND, SVTYPE from ..error import NotSpecifiedError from ..interval import Interval -from ..types import Annotations, ReferenceGenome +from ..types import ReferenceAnnotations, ReferenceGenome from ..util import logger from .fusion import FusionTranscript, determine_prime from .genomic import Gene, IntergenicRegion, PreTranscript, Transcript @@ -564,7 +564,7 @@ def overlapping_transcripts( def _gather_breakpoint_annotations( - ref_ann: Annotations, breakpoint: Breakpoint + ref_ann: ReferenceAnnotations, breakpoint: Breakpoint ) -> Tuple[ List[Union[PreTranscript, IntergenicRegion]], List[Union[PreTranscript, IntergenicRegion]] ]: @@ -657,7 +657,9 @@ def _gather_breakpoint_annotations( ) -def _gather_annotations(ref: Annotations, bp: BreakpointPair, proximity=None) -> List[Annotation]: +def _gather_annotations( + ref: ReferenceAnnotations, bp: BreakpointPair, proximity=None +) -> List[Annotation]: """ each annotation is defined by the annotations selected at the breakpoints the other annotations are given relative to this @@ -865,7 +867,7 @@ def choose_transcripts_by_priority(ann_list: List[Annotation]) -> List[Annotatio def annotate_events( bpps: List[BreakpointPair], - annotations: Annotations, + annotations: ReferenceAnnotations, reference_genome: ReferenceGenome, max_proximity: int = 5000, min_orf_size: int = 200, diff --git a/src/mavis/illustrate/elements.py b/src/mavis/illustrate/elements.py index ee435ca2..07ac4233 100644 --- a/src/mavis/illustrate/elements.py +++ b/src/mavis/illustrate/elements.py @@ -8,6 +8,7 @@ import svgwrite from ..annotate.variant import FusionTranscript +from ..breakpoint import Breakpoint from ..constants import CODON_SIZE, GIEMSA_STAIN, ORIENT, STRAND from ..error import DrawingFitError, NotSpecifiedError from ..interval import Interval, IntervalMapping @@ -23,7 +24,7 @@ if 
TYPE_CHECKING: from ..annotate.base import BioInterval from ..annotate.genomic import Exon, Gene - from ..breakpoint import Breakpoint + # draw gene level view # draw gene box HEX_WHITE = '#FFFFFF' From 256794c4aff344a9183e6b00ebbfde967e147599 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 1 Feb 2022 14:10:13 -0800 Subject: [PATCH 106/137] Do not change argument signatures --- src/mavis/annotate/fusion.py | 22 ++++++++++++++++++---- src/mavis/illustrate/elements.py | 1 + src/mavis/validate/evidence.py | 2 +- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/mavis/annotate/fusion.py b/src/mavis/annotate/fusion.py index 2666e1d9..0430753b 100644 --- a/src/mavis/annotate/fusion.py +++ b/src/mavis/annotate/fusion.py @@ -95,7 +95,14 @@ def map_region_to_genome(self, chr, interval_on_fusion, genome_interval, flipped self.mapping_to_chrs[Interval(interval_on_fusion[0], interval_on_fusion[1])] = chr @classmethod - def _build_single_gene_inversion(cls, ann, reference_genome: ReferenceGenome): + def _build_single_gene_inversion( + cls, + ann, + reference_genome: ReferenceGenome, + min_orf_size, + max_orf_cap, + min_domain_mapping_match, + ): """ builds a fusion transcript for a single gene inversion. Note that this is an incomplete fusion transcript and still requires translations and domain information to be added @@ -210,7 +217,14 @@ def _build_single_gene_inversion(cls, ann, reference_genome: ReferenceGenome): return fusion_pre_transcript @classmethod - def _build_single_gene_duplication(cls, ann, reference_genome): + def _build_single_gene_duplication( + cls, + ann, + reference_genome, + min_orf_size, + max_orf_cap, + min_domain_mapping_match, + ): """ builds a fusion transcript for a single gene duplication. 
Note that this is an incomplete fusion transcript and still requires translations and domain information to be added @@ -487,10 +501,10 @@ def build( pass return fusion_pre_transcript - def get_seq(self): + def get_seq(self, reference_genome=None, ignore_cache=False): return PreTranscript.get_seq(self) - def get_cdna_seq(self, splicing_pattern): + def get_cdna_seq(self, splicing_pattern, reference_genome=None, ignore_cache=False): """ Args: splicing_pattern (List[int]): the list of splicing positions diff --git a/src/mavis/illustrate/elements.py b/src/mavis/illustrate/elements.py index 07ac4233..92a68e0b 100644 --- a/src/mavis/illustrate/elements.py +++ b/src/mavis/illustrate/elements.py @@ -460,6 +460,7 @@ def draw_ustranscript( colors={}, mapping=None, masks=None, + reference_genome=None, ) -> svgwrite.container.Group: """ builds an svg group representing the transcript. Exons are drawn in a track with the splicing diff --git a/src/mavis/validate/evidence.py b/src/mavis/validate/evidence.py index cc89725f..db013e46 100644 --- a/src/mavis/validate/evidence.py +++ b/src/mavis/validate/evidence.py @@ -75,7 +75,7 @@ def __init__(self, *pos, **kwargs): self.compatible_window1 = self.generate_window(compt_break1) self.compatible_window2 = self.generate_window(compt_break2) - def compute_fragment_size(self, read: pysam.AlignedSegment): + def compute_fragment_size(self, read: pysam.AlignedSegment, mate=None): return Interval(abs(read.template_length)) From 5d81602aa5e4534b0a78eb747e4eb9dfe98e1240 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 1 Feb 2022 22:04:44 -0800 Subject: [PATCH 107/137] Add support for converting gtf/gff3 files resolves: #302 --- docs/inputs/reference.md | 20 + docs/migrating.md | 3 +- src/mavis/annotate/annotations_schema.json | 14 +- src/tools/convert_annotations_format.py | 470 ++++++++++++++++++ src/tools/migrate_mavis_annotations_2to3.py | 190 ------- .../data/Homo_sapiens.GRCh38.105.chr.kras.gtf | 186 +++++++ 
.../data/Homo_sapiens.GRCh38.105.kras.gff3 | 19 + .../tools/test_convert_annotations_format.py | 22 + 8 files changed, 728 insertions(+), 196 deletions(-) create mode 100644 src/tools/convert_annotations_format.py delete mode 100644 src/tools/migrate_mavis_annotations_2to3.py create mode 100644 tests/tools/data/Homo_sapiens.GRCh38.105.chr.kras.gtf create mode 100644 tests/tools/data/Homo_sapiens.GRCh38.105.kras.gff3 create mode 100644 tests/tools/test_convert_annotations_format.py diff --git a/docs/inputs/reference.md b/docs/inputs/reference.md index 5eff1cbb..854e1058 100644 --- a/docs/inputs/reference.md +++ b/docs/inputs/reference.md @@ -163,6 +163,26 @@ python tools/generate_ensembl_json.py -s human -r 75 -o ensembl_human_v75.json This will produce the JSON file required as input by MAVIS +### Conversion from Other Standard Formats + +If you have a GTF or GFF3 file, you can convert it to the MAVIS JSON format with the helper script provided in the tools folder + +```bash +python src/tools/convert_annotations_format.py \ + /path/to/gtf/file \ + --input_type gtf \ + output_mavis_annotations.json +``` + +or similarly for the GFF3 format + +```bash +python src/tools/convert_annotations_format.py \ + /path/to/gff3/file \ + --input_type gff3 \ + output_mavis_annotations.json +``` + ## DGV (Database of Genomic Variants) diff --git a/docs/migrating.md b/docs/migrating.md index 91fb0d4f..db9c76b2 100644 --- a/docs/migrating.md +++ b/docs/migrating.md @@ -25,7 +25,8 @@ MAVIS is now integrated with snakemake instead of handling its own scheduling MAVIS no longer supports the previously deprecated tab-delimited format of the annotations file.
If you are still using these files in your project we have provided a script to automatically convert them to the newer format in the tools directory ```bash -python src/tools/migrate_mavis_annotations_to_jsonl.py \ +python src/tools/convert_annotations_format.py \ /path/to/tab/file.tab \ + --input_type v2 \ /path/to/new/json/file.json ``` diff --git a/src/mavis/annotate/annotations_schema.json b/src/mavis/annotate/annotations_schema.json index 04d0cc50..83f1b501 100644 --- a/src/mavis/annotate/annotations_schema.json +++ b/src/mavis/annotate/annotations_schema.json @@ -1,6 +1,6 @@ { "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": false, + "additionalProperties": true, "properties": { "best_transcript_file": { "type": "string" @@ -13,6 +13,7 @@ }, "genes": { "items": { + "additionalProperties": true, "properties": { "aliases": { "default": [ @@ -50,6 +51,7 @@ "default": [ ], "items": { + "additionalProperties": true, "properties": { "aliases": { "default": [ @@ -61,25 +63,26 @@ "type": "array" }, "cdna_coding_end": { + "default": null, "minimum": 1, "type": [ "integer", "null" - ], - "default": null + ] }, "cdna_coding_start": { + "default": null, "minimum": 1, "type": [ "integer", "null" - ], - "default": null + ] }, "domains": { "default": [ ], "items": { + "additionalProperties": true, "properties": { "name": { "minLength": 1, @@ -116,6 +119,7 @@ "defualt": [ ], "items": { + "additionalProperties": true, "properties": { "end": { "minimum": 1, diff --git a/src/tools/convert_annotations_format.py b/src/tools/convert_annotations_format.py new file mode 100644 index 00000000..13dfb8e5 --- /dev/null +++ b/src/tools/convert_annotations_format.py @@ -0,0 +1,470 @@ +import argparse +import json +import logging +import re +from typing import Dict + +import pandas as pd +import pkg_resources +from snakemake.utils import validate as snakemake_validate + +PANDAS_DEFAULT_NA_VALUES = [ + '-1.#IND', + '1.#QNAN', + '1.#IND', + '-1.#QNAN', + 
'#N/A', + 'N/A', + 'NA', + '#NA', + 'NULL', + 'NaN', + '-NaN', + 'nan', + '-nan', +] + + +def convert_tab_to_json(filepath: str) -> Dict: + """ + given a file in the std input format (see below) reads and returns a list of genes (and sub-objects) + + +-----------------------+---------------------------+-----------------------------------------------------------+ + | column name | example | description | + +=======================+===========================+===========================================================+ + | ensembl_transcript_id | ENST000001 | | + +-----------------------+---------------------------+-----------------------------------------------------------+ + | ensembl_gene_id | ENSG000001 | | + +-----------------------+---------------------------+-----------------------------------------------------------+ + | strand | -1 | positive or negative 1 | + +-----------------------+---------------------------+-----------------------------------------------------------+ + | cdna_coding_start | 44 | where translation begins relative to the start of the cdna| + +-----------------------+---------------------------+-----------------------------------------------------------+ + | cdna_coding_end | 150 | where translation terminates | + +-----------------------+---------------------------+-----------------------------------------------------------+ + | genomic_exon_ranges | 100-201;334-412;779-830 | semi-colon delimited exon start/ends | + +-----------------------+---------------------------+-----------------------------------------------------------+ + | AA_domain_ranges | DBD:220-251,260-271 | semi-colon delimited list of domains | + +-----------------------+---------------------------+-----------------------------------------------------------+ + | hugo_names | KRAS | hugo gene name | + +-----------------------+---------------------------+-----------------------------------------------------------+ + + Args: + filepath (str): path to the input tab-delimited
file + + Returns: + Dict[str,List[Gene]]: a dictionary keyed by chromosome name with values of list of genes on the chromosome + + Warning: + does not load translations unless then start with 'M', end with '*' and have a length of multiple 3 + """ + + def parse_exon_list(row): + if pd.isnull(row): + return [] + exons = [] + for temp in re.split('[; ]', row): + try: + start, end = temp.split('-') + exons.append({'start': int(start), 'end': int(end)}) + except Exception as err: + logging.warning(f'exon error: {repr(temp)}, {repr(err)}') + return exons + + def parse_domain_list(row): + if pd.isnull(row): + return [] + domains = [] + for domain in row.split(';'): + try: + name, temp = domain.rsplit(':') + temp = temp.split(',') + temp = [x.split('-') for x in temp] + regions = [{'start': int(x), 'end': int(y)} for x, y in temp] + domains.append({'name': name, 'regions': regions}) + except Exception as err: + logging.warning(f'error in domain: {domain}, {row}, {repr(err)}') + return domains + + df = pd.read_csv( + filepath, + dtype={ + 'ensembl_gene_id': str, + 'ensembl_transcript_id': str, + 'chr': str, + 'cdna_coding_start': pd.Int64Dtype(), + 'cdna_coding_end': pd.Int64Dtype(), + 'AA_domain_ranges': str, + 'genomic_exon_ranges': str, + 'hugo_names': str, + 'transcript_genomic_start': pd.Int64Dtype(), + 'transcript_genomic_end': pd.Int64Dtype(), + 'best_ensembl_transcript_id': str, + 'gene_start': int, + 'gene_end': int, + }, + sep='\t', + comment='#', + ) + + for col in ['ensembl_gene_id', 'chr', 'ensembl_transcript_id', 'gene_start', 'gene_end']: + if col not in df: + raise KeyError(f'missing required column: {col}') + + for col, parser in [ + ('genomic_exon_ranges', parse_exon_list), + ('AA_domain_ranges', parse_domain_list), + ]: + if col in df: + df[col] = df[col].apply(parser) + + genes = {} + rows = df.where(df.notnull(), None).to_dict('records') + + for row in rows: + gene = { + 'chr': row['chr'], + 'start': int(row['gene_start']), + 'end': 
int(row['gene_end']), + 'name': row['ensembl_gene_id'], + 'strand': row['strand'], + 'aliases': row['hugo_names'].split(';') if row.get('hugo_names') else [], + 'transcripts': [], + } + if gene['strand'] in {'true', '1', '+', '+1', 'True', 1, True}: + gene['strand'] = '+' + elif gene['strand'] in {'false', '-1', '-', 'False', -1, False}: + gene['strand'] = '-' + if gene['name'] not in genes: + genes[gene['name']] = gene + else: + gene = genes[gene['name']] + is_best_transcript = ( + row.get('best_ensembl_transcript_id', row['ensembl_transcript_id']) + == row['ensembl_transcript_id'] + ) + transcript = { + 'is_best_transcript': is_best_transcript, + 'name': row['ensembl_transcript_id'], + 'exons': row.get('genomic_exon_ranges', []), + 'domains': row.get('AA_domain_ranges', []), + 'start': row.get('transcript_genomic_start'), + 'end': row.get('transcript_genomic_end'), + 'cdna_coding_start': row.get('cdna_coding_start'), + 'cdna_coding_end': row.get('cdna_coding_end'), + 'aliases': [], + } + for int_value in ['start', 'end', 'cdna_coding_start', 'cdna_coding_end']: + if transcript.get(int_value) is not None: + transcript[int_value] = int(transcript[int_value]) + gene['transcripts'].append(transcript) + + return {'genes': list(genes.values())} + + +def convert_pandas_gff_to_mavis(df) -> Dict: + df['parent_type'] = df.Parent.str.split(':').str[0] + genelike_features = {'gene', 'ncRNA_gene', 'biological_region', 'pseudogene'} + + def pull_alias_terms(row): + aliases = [] + if row['Name']: + aliases.append(row['Name']) + if row['Alias']: + aliases.extend(row['Alias'].split(',')) + return aliases + + genes_by_id = {} + for row in df[df.type.isin(genelike_features)].to_dict('records'): + genes_by_id[row['feature_id']] = { + 'start': row['start'], + 'end': row['end'], + 'chr': row['seqid'], + 'aliases': pull_alias_terms(row), + 'strand': row['strand'], + 'transcripts': [], + 'name': row['feature_id'] + '.' 
+ row['version'], + } + logging.info(f'loaded {len(genes_by_id)} genes') + + transcripts_by_id = {} + + for row in df[df.parent_type == 'gene'].to_dict('records'): + for parent in row['Parent'].split(','): + gene_id = parent.split(':')[1] + if gene_id not in genes_by_id: + raise KeyError( + f'cannot find gene ({gene_id}) skipping transcript ({row["feature_id"]})' + ) + feature_id = row['feature_id'] + transcript = { + 'name': feature_id + '.' + row['version'], + 'start': row['start'], + 'end': row['end'], + 'aliases': pull_alias_terms(row), + 'domains': [], + 'exons': [], + 'cdna_coding_start': None, + 'cdna_coding_end': None, + } + genes_by_id[gene_id]['transcripts'].append(transcript) + transcripts_by_id[feature_id] = transcript + + logging.info(f'loaded {len(transcripts_by_id)} transcripts') + # now cds + cds_count = 0 + for row in df[df.type == 'CDS'].to_dict('records'): + for parent in row['Parent'].split(','): + transcript_id = parent.split(':')[1] + if transcript_id not in transcripts_by_id: + raise KeyError( + f'failed to find parent transcript ({transcript_id}) skipping cds ({row["feature_id"]})' + ) + transcripts_by_id[transcript_id].update( + {'cdna_coding_start': row['start'], 'cdna_coding_end': row['end']} + ) + cds_count += 1 + logging.info(f'loaded {cds_count} cds regions') + # exons + exons_count = 0 + for row in df[df.type == 'exon'].to_dict('records'): + for parent in row['Parent'].split(','): + transcript_id = parent.split(':')[1] + if transcript_id not in transcripts_by_id: + raise KeyError( + f'failed to find parent transcript ({transcript_id}) skipping exon ({row["feature_id"]})' + ) + transcripts_by_id[transcript_id]['exons'].append( + { + 'start': row['start'], + 'end': row['end'], + 'name': row['feature_id'] + '.' 
+ row['version'], + } + ) + exons_count += 1 + + logging.info(f'loaded {exons_count} exons') + + result = {'genes': list(genes_by_id.values())} + try: + snakemake_validate( + result, pkg_resources.resource_filename('mavis.annotate', 'annotations_schema.json') + ) + except Exception as err: + short_msg = '. '.join( + [line for line in str(err).split('\n') if line.strip()][:3] + ) # these can get super long + raise AssertionError(short_msg) + return result + + +def convert_gff3_to_mavis(filename: str, no_alt) -> Dict: + """ + Convert an input gff3 file to the JSON format accepted by MAVIS + """ + df = pd.read_csv( + filename, + sep='\t', + dtype={ + 'seqid': str, + 'source': str, + 'type': str, + 'start': int, + 'end': int, + 'score': str, + 'strand': str, + 'phase': str, + 'attributes': str, + }, + index_col=False, + header=None, + comment='#', + na_values=['.'] + PANDAS_DEFAULT_NA_VALUES, + names=['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'], + ) + if no_alt: + df = df[~df.seqid.str.startswith('GL')] + df = df[~df.seqid.str.startswith('KI')] + df['row_index'] = df.index + + skip_types = { + 'five_prime_UTR', + 'five_prime_UTR', + } + df = df[~df.type.isin(skip_types)] + + attribute_columns = [ + 'ID', + 'Name', + 'Alias', + 'Parent', + 'Target', + 'Gap', + 'Derives_from', + 'Note', + 'DBxref', + 'Ontology_term', + 'rank', + 'version', + 'exon_id', + ] + + def split_attributes(row): + result = {} + for attr in row.attributes.split(';'): + name, value = attr.split('=') + result[name] = value + return [row.row_index] + [result.get(c, '') for c in attribute_columns] + + prev_size = df.shape[0] + attrs_df = pd.DataFrame( + df.apply(split_attributes, axis=1).tolist(), + columns=['row_index'] + attribute_columns, + ) + assert prev_size == attrs_df.shape[0] + df = df.merge(attrs_df, on=['row_index']) + + assert prev_size == df.shape[0] + + df['feature_id'] = df['ID'].apply(lambda id: id.split(':')[1] if ':' in id else '') + 
df.loc[(df.feature_id == '') & (df.type == 'exon'), 'feature_id'] = df.exon_id + df = df[df.feature_id != ''] + df['strand'] = df.strand.fillna('?') + return convert_pandas_gff_to_mavis(df) + + +def convert_gff2_to_mavis(filename: str, no_alt) -> Dict: + """ + Convert an input gff2/gtf file to the JSON format accepted by MAVIS + """ + df = pd.read_csv( + filename, + sep='\t', + dtype={ + 'seqname': str, + 'source': str, + 'feature': str, + 'start': int, + 'end': int, + 'score': str, + 'strand': str, + 'frame': str, + 'attribute': str, + }, + index_col=False, + header=None, + comment='#', + na_values=['.'] + PANDAS_DEFAULT_NA_VALUES, + names=[ + 'seqname', + 'source', + 'feature', + 'start', + 'end', + 'score', + 'strand', + 'frame', + 'attribute', + ], + ).rename( + columns={'feature': 'type', 'seqname': 'seqid', 'frame': 'phase', 'attribute': 'attributes'} + ) # match gff3 names + df['row_index'] = df.index + + if no_alt: + df = df[~df.seqid.str.startswith('GL')] + df = df[~df.seqid.str.startswith('KI')] + + skip_types = { + 'five_prime_utr', + 'five_prime_utr', + } + df = df[~df.type.isin(skip_types)] + + attribute_columns = [ + 'gene_id', + 'gene_version', + 'gene_name', + 'transcript_id', + 'transcript_version', + 'transcript_name', + 'exon_id', + 'exon_version', + ] + + def split_attributes(row): + result = {} + for attr in row.attributes.split(';'): + if not attr: + continue + m = re.match(r'^\s*([^"]+)\s+"(.*)"$', attr) + if not m: + raise KeyError(f'attributes do not follow expected pattern: {attr}') + result[m.group(1)] = m.group(2) + return [row.row_index] + [result.get(c, '') for c in attribute_columns] + + prev_size = df.shape[0] + attrs_df = pd.DataFrame( + df.apply(split_attributes, axis=1).tolist(), + columns=['row_index'] + attribute_columns, + ) + assert prev_size == attrs_df.shape[0] + df = df.merge(attrs_df, on=['row_index']) + assert prev_size == df.shape[0] + + df['Alias'] = '' + df['feature_id'] = '' + df.loc[df.type == 'exon', 'feature_id'] = 
df.exon_id + df.loc[df.type == 'gene', 'feature_id'] = df.gene_id + df.loc[df.type == 'transcript', 'feature_id'] = df.transcript_id + + df['Name'] = '' + df.loc[df.type == 'gene', 'Name'] = df.gene_name + df.loc[df.type == 'transcript', 'Name'] = df.transcript_name + df['strand'] = df.strand.fillna('?') + + df['Parent'] = '' + df.loc[(df.type == 'transcript') & (df.gene_id != ''), 'Parent'] = 'gene:' + df.gene_id + df.loc[(df.type == 'exon') & (df.transcript_id != ''), 'Parent'] = ( + 'transcript:' + df.transcript_id + ) + df.loc[(df.type == 'CDS') & (df.transcript_id != ''), 'Parent'] = ( + 'transcript:' + df.transcript_id + ) + + df['version'] = '' + df.loc[df.type == 'transcript', 'version'] = df.transcript_version + df.loc[df.type == 'exon', 'version'] = df.exon_version + df.loc[df.type == 'gene', 'version'] = df.gene_version + + df['strand'] = df.strand.fillna('?') + return convert_pandas_gff_to_mavis(df) + + +if __name__ == '__main__': + logging.basicConfig(format='{message}', style='{', level=logging.INFO) + parser = argparse.ArgumentParser() + parser.add_argument( + 'input', help='path to the tab-delimated mavis v2 style reference annotations file' + ) + parser.add_argument('--input_type', default='v2', choices=['v2', 'gff3', 'gtf']) + parser.add_argument('output', help='path to the JSON output file') + parser.add_argument( + '--keep_alt', + help='do not filter out chromosome/seqid names starting with GL or KI', + action='store_true', + default=False, + ) + + args = parser.parse_args() + + if args.input_type == 'v2': + annotations = convert_tab_to_json(args.input) + elif args.input_type == 'gtf': + annotations = convert_gff2_to_mavis(args.input, not args.keep_alt) + else: + annotations = convert_gff3_to_mavis(args.input, not args.keep_alt) + + logging.info(f'writing: {args.output}') + with open(args.output, 'w') as fh: + fh.write(json.dumps(annotations, sort_keys=True)) diff --git a/src/tools/migrate_mavis_annotations_2to3.py 
b/src/tools/migrate_mavis_annotations_2to3.py deleted file mode 100644 index 0fe9d39d..00000000 --- a/src/tools/migrate_mavis_annotations_2to3.py +++ /dev/null @@ -1,190 +0,0 @@ -import argparse -import json -import logging -import re -from typing import Dict - -import pandas as pd - - -def convert_tab_to_json(filepath: str) -> Dict: - """ - given a file in the std input format (see below) reads and return a list of genes (and sub-objects) - - +-----------------------+---------------------------+-----------------------------------------------------------+ - | column name | example | description | - +=======================+===========================+===========================================================+ - | ensembl_transcript_id | ENST000001 | | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | ensembl_gene_id | ENSG000001 | | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | strand | -1 | positive or negative 1 | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | cdna_coding_start | 44 | where translation begins relative to the start of the cdna| - +-----------------------+---------------------------+-----------------------------------------------------------+ - | cdna_coding_end | 150 | where translation terminates | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | genomic_exon_ranges | 100-201;334-412;779-830 | semi-colon demitited exon start/ends | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | AA_domain_ranges | DBD:220-251,260-271 | semi-colon delimited list of domains | - +-----------------------+---------------------------+-----------------------------------------------------------+ - | hugo_names | KRAS 
| hugo gene name | - +-----------------------+---------------------------+-----------------------------------------------------------+ - - Args: - filepath (str): path to the input tab-delimited file - - Returns: - Dict[str,List[Gene]]: a dictionary keyed by chromosome name with values of list of genes on the chromosome - - Warning: - does not load translations unless then start with 'M', end with '*' and have a length of multiple 3 - """ - - def parse_exon_list(row): - if pd.isnull(row): - return [] - exons = [] - for temp in re.split('[; ]', row): - try: - start, end = temp.split('-') - exons.append({'start': int(start), 'end': int(end)}) - except Exception as err: - logging.warning(f'exon error: {repr(temp)}, {repr(err)}') - return exons - - def parse_domain_list(row): - if pd.isnull(row): - return [] - domains = [] - for domain in row.split(';'): - try: - name, temp = domain.rsplit(':') - temp = temp.split(',') - temp = [x.split('-') for x in temp] - regions = [{'start': int(x), 'end': int(y)} for x, y in temp] - domains.append({'name': name, 'regions': regions}) - except Exception as err: - logging.warning(f'error in domain: {domain}, {row}, {repr(err)}') - return domains - - df = pd.read_csv( - filepath, - dtype={ - 'ensembl_gene_id': str, - 'ensembl_transcript_id': str, - 'chr': str, - 'cdna_coding_start': pd.Int64Dtype(), - 'cdna_coding_end': pd.Int64Dtype(), - 'AA_domain_ranges': str, - 'genomic_exon_ranges': str, - 'hugo_names': str, - 'transcript_genomic_start': pd.Int64Dtype(), - 'transcript_genomic_end': pd.Int64Dtype(), - 'best_ensembl_transcript_id': str, - 'gene_start': int, - 'gene_end': int, - }, - sep='\t', - comment='#', - ) - - for col in ['ensembl_gene_id', 'chr', 'ensembl_transcript_id', 'gene_start', 'gene_end']: - if col not in df: - raise KeyError(f'missing required column: {col}') - - for col, parser in [ - ('genomic_exon_ranges', parse_exon_list), - ('AA_domain_ranges', parse_domain_list), - ]: - if col in df: - df[col] = 
df[col].apply(parser) - - genes = {} - rows = df.where(df.notnull(), None).to_dict('records') - - for row in rows: - gene = { - 'chr': row['chr'], - 'start': int(row['gene_start']), - 'end': int(row['gene_end']), - 'name': row['ensembl_gene_id'], - 'strand': row['strand'], - 'aliases': row['hugo_names'].split(';') if row.get('hugo_names') else [], - 'transcripts': [], - } - if gene['strand'] in {'true', '1', '+', '+1', 'True', 1, True}: - gene['strand'] = '+' - elif gene['strand'] in {'false', '-1', '-', 'False', -1, False}: - gene['strand'] = '-' - if gene['name'] not in genes: - genes[gene['name']] = gene - else: - gene = genes[gene['name']] - is_best_transcript = ( - row.get('best_ensembl_transcript_id', row['ensembl_transcript_id']) - == row['ensembl_transcript_id'] - ) - transcript = { - 'is_best_transcript': is_best_transcript, - 'name': row['ensembl_transcript_id'], - 'exons': row.get('genomic_exon_ranges', []), - 'domains': row.get('AA_domain_ranges', []), - 'start': row.get('transcript_genomic_start'), - 'end': row.get('transcript_genomic_end'), - 'cdna_coding_start': row.get('cdna_coding_start'), - 'cdna_coding_end': row.get('cdna_coding_end'), - 'aliases': [], - } - for int_value in ['start', 'end', 'cdna_coding_start', 'cdna_coding_end']: - if transcript.get(int_value) is not None: - transcript[int_value] = int(transcript[int_value]) - gene['transcripts'].append(transcript) - - return {'genes': list(genes.values())} - - -if __name__ == '__main__': - logging.basicConfig(**{'format': '{message}', 'style': '{', 'level': logging.INFO}) - parser = argparse.ArgumentParser() - parser.add_argument( - 'input', help='path to the tab-delimated mavis v2 style reference annotations file' - ) - parser.add_argument('output', help='path to the JSON output file') - - args = parser.parse_args() - - annotations = convert_tab_to_json(args.input) - - rows = [] - logging.info(f'writing: {args.output}') - if args.output_format == 'jsonl': - with open(args.output, 'w') as fh: 
- for gene in annotations['genes']: - fh.write(json.dumps(gene, sort_keys=True) + '\n') - elif args.output_format == 'json': - with open(args.output, 'w') as fh: - fh.write(json.dumps(annotations, sort_keys=True)) - else: - transcripts = [] - - for gene in annotations['genes']: - meta = {**gene} - del meta['transcripts'] - if gene['transcripts']: - for transcript in gene['transcripts']: - transcripts.append( - {**meta, **{f'transcript.{k}': v for k, v in transcript.items()}} - ) - else: - transcripts.append(meta) - df = pd.json_normalize(transcripts, max_level=1) - json_cols = [ - 'aliases', - 'transcript.aliases', - 'transcript.exons', - 'transcript.domains', - ] - for col in json_cols: - df[col] = df[col].apply(json.dumps) - df.to_csv(args.output, index=False, sep='\t') diff --git a/tests/tools/data/Homo_sapiens.GRCh38.105.chr.kras.gtf b/tests/tools/data/Homo_sapiens.GRCh38.105.chr.kras.gtf new file mode 100644 index 00000000..ce1f904f --- /dev/null +++ b/tests/tools/data/Homo_sapiens.GRCh38.105.chr.kras.gtf @@ -0,0 +1,186 @@ +6 havana gene 54770583 54771134 . + . gene_id "ENSG00000220635"; gene_version "2"; gene_name "KRASP1"; gene_source "havana"; gene_biotype "processed_pseudogene"; +6 havana transcript 54770583 54771134 . + . gene_id "ENSG00000220635"; gene_version "2"; transcript_id "ENST00000407852"; transcript_version "2"; gene_name "KRASP1"; gene_source "havana"; gene_biotype "processed_pseudogene"; transcript_name "KRASP1-201"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; tag "basic"; transcript_support_level "NA"; +6 havana exon 54770583 54771134 . + . 
gene_id "ENSG00000220635"; gene_version "2"; transcript_id "ENST00000407852"; transcript_version "2"; exon_number "1"; gene_name "KRASP1"; gene_source "havana"; gene_biotype "processed_pseudogene"; transcript_name "KRASP1-201"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; exon_id "ENSE00001550689"; exon_version "2"; tag "basic"; transcript_support_level "NA"; +12 ensembl_havana gene 25205246 25250936 . - . gene_id "ENSG00000133703"; gene_version "14"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; +12 havana transcript 25205246 25225773 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690406"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-211"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; tag "cds_start_NF"; tag "mRNA_start_NF"; +12 havana exon 25225614 25225773 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690406"; transcript_version "1"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-211"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00001644818"; exon_version "1"; tag "cds_start_NF"; tag "mRNA_start_NF"; +12 havana CDS 25225614 25225773 . - 1 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690406"; transcript_version "1"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-211"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; protein_id "ENSP00000509798"; protein_version "1"; tag "cds_start_NF"; tag "mRNA_start_NF"; +12 havana exon 25213114 25213206 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690406"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-211"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00003927570"; exon_version "1"; tag "cds_start_NF"; tag "mRNA_start_NF"; +12 havana stop_codon 25213204 25213206 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690406"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-211"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; tag "cds_start_NF"; tag "mRNA_start_NF"; +12 havana exon 25205246 25209911 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690406"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-211"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00002477035"; exon_version "3"; tag "cds_start_NF"; tag "mRNA_start_NF"; +12 havana three_prime_utr 25213114 25213203 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690406"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-211"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; tag "cds_start_NF"; tag "mRNA_start_NF"; +12 havana three_prime_utr 25205246 25209911 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690406"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-211"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; tag "cds_start_NF"; tag "mRNA_start_NF"; +12 ensembl_havana transcript 25205246 25250929 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana exon 25250751 25250929 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; exon_id "ENSE00003903543"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana exon 25245274 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; exon_id "ENSE00000936617"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana CDS 25245274 25245384 . 
- 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; protein_id "ENSP00000256078"; protein_version "5"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana start_codon 25245382 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana exon 25227234 25227412 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; exon_id "ENSE00001719809"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana CDS 25227234 25227412 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; protein_id "ENSP00000256078"; protein_version "5"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana exon 25225614 25225773 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; exon_id "ENSE00001644818"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana CDS 25225614 25225773 . - 1 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; protein_id "ENSP00000256078"; protein_version "5"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana exon 25215437 25215560 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; exon_id "ENSE00001189807"; exon_version "5"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana CDS 25215444 25215560 . 
- 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; protein_id "ENSP00000256078"; protein_version "5"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana stop_codon 25215441 25215443 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana exon 25205246 25209911 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "6"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; exon_id "ENSE00002477035"; exon_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana five_prime_utr 25250751 25250929 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana five_prime_utr 25245385 25245395 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana three_prime_utr 25215437 25215440 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana three_prime_utr 25205246 25209911 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; +12 ensembl_havana transcript 25205246 25250929 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 ensembl_havana exon 25250751 25250929 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00003903543"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 ensembl_havana exon 25245274 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00000936617"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 ensembl_havana CDS 25245274 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000308495"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 ensembl_havana start_codon 25245382 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 ensembl_havana exon 25227234 25227412 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00001719809"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 ensembl_havana CDS 25227234 25227412 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000308495"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 ensembl_havana exon 25225614 25225773 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00001644818"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 ensembl_havana CDS 25225614 25225773 . 
- 1 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000308495"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 ensembl_havana exon 25205246 25209911 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00002456976"; exon_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 ensembl_havana CDS 25209798 25209911 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000308495"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 ensembl_havana stop_codon 25209795 25209797 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 ensembl_havana five_prime_utr 25250751 25250929 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 ensembl_havana five_prime_utr 25245385 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 ensembl_havana three_prime_utr 25205246 25209794 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 havana transcript 25205250 25250908 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana exon 25250751 25250908 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00003928105"; exon_version "1"; +12 havana exon 25245274 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00000936617"; exon_version "1"; +12 havana CDS 25245274 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; protein_id "ENSP00000510431"; protein_version "1"; +12 havana start_codon 25245382 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana exon 25230483 25230621 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00003930732"; exon_version "1"; +12 havana CDS 25230568 25230621 . 
- 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; protein_id "ENSP00000510431"; protein_version "1"; +12 havana stop_codon 25230565 25230567 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana exon 25227234 25227412 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00003930847"; exon_version "1"; +12 havana exon 25225614 25225773 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00003937476"; exon_version "1"; +12 havana exon 25205250 25209911 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; exon_number "6"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00003934058"; exon_version "1"; +12 havana five_prime_utr 25250751 25250908 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana five_prime_utr 25245385 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana three_prime_utr 25230483 25230564 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana three_prime_utr 25227234 25227412 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana three_prime_utr 25225614 25225773 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana three_prime_utr 25205250 25209911 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686877"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-206"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana transcript 25205258 25250935 . 
- . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000685328"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-205"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; +12 havana exon 25250764 25250935 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000685328"; transcript_version "1"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-205"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00003934964"; exon_version "1"; tag "basic"; +12 havana exon 25245274 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000685328"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-205"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00000936617"; exon_version "1"; tag "basic"; +12 havana CDS 25245274 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000685328"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-205"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000508921"; protein_version "1"; tag "basic"; +12 havana start_codon 25245382 25245384 . 
- 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000685328"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-205"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; +12 havana exon 25227234 25227412 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000685328"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-205"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00001719809"; exon_version "1"; tag "basic"; +12 havana CDS 25227234 25227412 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000685328"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-205"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000508921"; protein_version "1"; tag "basic"; +12 havana exon 25225614 25225773 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000685328"; transcript_version "1"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-205"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00001644818"; exon_version "1"; tag "basic"; +12 havana CDS 25225614 25225773 . 
- 1 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000685328"; transcript_version "1"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-205"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000508921"; protein_version "1"; tag "basic"; +12 havana exon 25205258 25209911 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000685328"; transcript_version "1"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-205"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00003924510"; exon_version "1"; tag "basic"; +12 havana CDS 25209798 25209911 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000685328"; transcript_version "1"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-205"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000508921"; protein_version "1"; tag "basic"; +12 havana stop_codon 25209795 25209797 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000685328"; transcript_version "1"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-205"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; +12 havana five_prime_utr 25250764 25250935 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000685328"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-205"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; +12 havana five_prime_utr 25245385 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000685328"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-205"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; +12 havana three_prime_utr 25205258 25209794 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000685328"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-205"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; +12 havana transcript 25205260 25250899 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000693229"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-214"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana exon 25250751 25250899 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000693229"; transcript_version "1"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-214"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003938559"; exon_version "1"; tag "basic"; +12 havana exon 25245274 25245395 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000693229"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-214"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00000936617"; exon_version "1"; tag "basic"; +12 havana CDS 25245274 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000693229"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-214"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000509223"; protein_version "1"; tag "basic"; +12 havana start_codon 25245382 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000693229"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-214"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana exon 25227234 25227337 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000693229"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-214"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003923411"; exon_version "1"; tag "basic"; +12 havana CDS 25227234 25227337 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000693229"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-214"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000509223"; protein_version "1"; tag "basic"; +12 havana exon 25225614 25225773 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000693229"; transcript_version "1"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-214"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00001644818"; exon_version "1"; tag "basic"; +12 havana CDS 25225614 25225773 . - 1 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000693229"; transcript_version "1"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-214"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000509223"; protein_version "1"; tag "basic"; +12 havana exon 25205260 25209911 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000693229"; transcript_version "1"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-214"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003927775"; exon_version "1"; tag "basic"; +12 havana CDS 25209798 25209911 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000693229"; transcript_version "1"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-214"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000509223"; protein_version "1"; tag "basic"; +12 havana stop_codon 25209795 25209797 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000693229"; transcript_version "1"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-214"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana five_prime_utr 25250751 25250899 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000693229"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-214"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana five_prime_utr 25245385 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000693229"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-214"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana three_prime_utr 25205260 25209794 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000693229"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-214"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana transcript 25205270 25250927 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000687356"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-208"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana exon 25250751 25250927 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000687356"; transcript_version "1"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-208"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00003930705"; exon_version "1"; +12 havana exon 25245274 25245395 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000687356"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-208"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00000936617"; exon_version "1"; +12 havana CDS 25245274 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000687356"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-208"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; protein_id "ENSP00000510511"; protein_version "1"; +12 havana start_codon 25245382 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000687356"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-208"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana exon 25225614 25225773 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000687356"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-208"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00003930939"; exon_version "1"; +12 havana CDS 25225765 25225773 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000687356"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-208"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; protein_id "ENSP00000510511"; protein_version "1"; +12 havana stop_codon 25225762 25225764 . 
- 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000687356"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-208"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana exon 25205270 25209911 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000687356"; transcript_version "1"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-208"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00003933328"; exon_version "1"; +12 havana five_prime_utr 25250751 25250927 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000687356"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-208"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana five_prime_utr 25245385 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000687356"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-208"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana three_prime_utr 25225614 25225761 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000687356"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-208"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana three_prime_utr 25205270 25209911 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000687356"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-208"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana transcript 25205343 25250917 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000692768"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-213"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana exon 25250751 25250917 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000692768"; transcript_version "1"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-213"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003923448"; exon_version "1"; tag "basic"; +12 havana exon 25227234 25227412 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000692768"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-213"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003923061"; exon_version "1"; tag "basic"; +12 havana CDS 25227234 25227325 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000692768"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-213"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000510254"; protein_version "1"; tag "basic"; +12 havana start_codon 25227323 25227325 . 
- 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000692768"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-213"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana exon 25225614 25225773 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000692768"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-213"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00001644818"; exon_version "1"; tag "basic"; +12 havana CDS 25225614 25225773 . - 1 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000692768"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-213"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000510254"; protein_version "1"; tag "basic"; +12 havana exon 25205343 25209911 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000692768"; transcript_version "1"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-213"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003925822"; exon_version "1"; tag "basic"; +12 havana CDS 25209798 25209911 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000692768"; transcript_version "1"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-213"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000510254"; protein_version "1"; tag "basic"; +12 havana stop_codon 25209795 25209797 . 
- 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000692768"; transcript_version "1"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-213"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana five_prime_utr 25250751 25250917 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000692768"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-213"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana five_prime_utr 25227326 25227412 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000692768"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-213"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana three_prime_utr 25205343 25209794 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000692768"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-213"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana transcript 25206933 25250444 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688940"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-210"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; +12 havana exon 25250255 25250444 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688940"; transcript_version "1"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-210"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00003932539"; exon_version "1"; tag "basic"; +12 havana exon 25245274 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688940"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-210"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00000936617"; exon_version "1"; tag "basic"; +12 havana CDS 25245274 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688940"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-210"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000509238"; protein_version "1"; tag "basic"; +12 havana start_codon 25245382 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688940"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-210"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; +12 havana exon 25227234 25227412 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688940"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-210"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00001719809"; exon_version "1"; tag "basic"; +12 havana CDS 25227234 25227412 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688940"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-210"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000509238"; protein_version "1"; tag "basic"; +12 havana exon 25225614 25225773 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688940"; transcript_version "1"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-210"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00001644818"; exon_version "1"; tag "basic"; +12 havana CDS 25225614 25225773 . - 1 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688940"; transcript_version "1"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-210"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000509238"; protein_version "1"; tag "basic"; +12 havana exon 25206933 25209911 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688940"; transcript_version "1"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-210"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00003930148"; exon_version "1"; tag "basic"; +12 havana CDS 25209798 25209911 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688940"; transcript_version "1"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-210"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000509238"; protein_version "1"; tag "basic"; +12 havana stop_codon 25209795 25209797 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688940"; transcript_version "1"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-210"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; +12 havana five_prime_utr 25250255 25250444 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688940"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-210"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; +12 havana five_prime_utr 25245385 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688940"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-210"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; +12 havana three_prime_utr 25206933 25209794 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688940"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-210"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; +12 havana transcript 25207948 25250929 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana exon 25250751 25250929 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00003903543"; exon_version "1"; +12 havana exon 25245274 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00000936617"; exon_version "1"; +12 havana CDS 25245274 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; protein_id "ENSP00000508568"; protein_version "1"; +12 havana start_codon 25245382 25245384 . 
- 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana exon 25228775 25228891 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00003925179"; exon_version "1"; +12 havana CDS 25228850 25228891 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; protein_id "ENSP00000508568"; protein_version "1"; +12 havana stop_codon 25228847 25228849 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana exon 25227234 25227412 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00003930847"; exon_version "1"; +12 havana exon 25225614 25225773 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00003937476"; exon_version "1"; +12 havana exon 25207948 25209911 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; exon_number "6"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; exon_id "ENSE00003935620"; exon_version "1"; +12 havana five_prime_utr 25250751 25250929 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana five_prime_utr 25245385 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana three_prime_utr 25228775 25228846 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana three_prime_utr 25227234 25227412 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana three_prime_utr 25225614 25225773 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana three_prime_utr 25207948 25209911 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000690804"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-212"; transcript_source "havana"; transcript_biotype "nonsense_mediated_decay"; +12 havana transcript 25209178 25250936 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000557334"; transcript_version "6"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5 (assigned to previous version 5)"; +12 havana exon 25250751 25250936 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000557334"; transcript_version "6"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00002446502"; exon_version "1"; tag "basic"; transcript_support_level "5 (assigned to previous version 5)"; +12 havana exon 25245274 25245395 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000557334"; transcript_version "6"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00000936617"; exon_version "1"; tag "basic"; transcript_support_level "5 (assigned to previous version 5)"; +12 havana CDS 25245274 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000557334"; transcript_version "6"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000452512"; protein_version "1"; tag "basic"; transcript_support_level "5 (assigned to previous version 5)"; +12 havana start_codon 25245382 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000557334"; transcript_version "6"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5 (assigned to previous version 5)"; +12 havana exon 25209178 25209911 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000557334"; transcript_version "6"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00002464674"; exon_version "2"; tag "basic"; transcript_support_level "5 (assigned to previous version 5)"; +12 havana CDS 25209798 25209911 . 
- 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000557334"; transcript_version "6"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000452512"; protein_version "1"; tag "basic"; transcript_support_level "5 (assigned to previous version 5)"; +12 havana stop_codon 25209795 25209797 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000557334"; transcript_version "6"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5 (assigned to previous version 5)"; +12 havana five_prime_utr 25250751 25250936 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000557334"; transcript_version "6"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5 (assigned to previous version 5)"; +12 havana five_prime_utr 25245385 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000557334"; transcript_version "6"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5 (assigned to previous version 5)"; +12 havana three_prime_utr 25209178 25209794 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000557334"; transcript_version "6"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5 (assigned to previous version 5)"; +12 havana transcript 25209673 25227997 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688228"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-209"; transcript_source "havana"; transcript_biotype "retained_intron"; +12 havana exon 25227234 25227997 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688228"; transcript_version "1"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-209"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003935871"; exon_version "1"; +12 havana exon 25225614 25225773 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688228"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-209"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003937476"; exon_version "1"; +12 havana exon 25209673 25209911 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000688228"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-209"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003925173"; exon_version "1"; +12 havana transcript 25232558 25250929 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686969"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-207"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana exon 25250764 25250929 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686969"; transcript_version "1"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-207"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00002530521"; exon_version "1"; tag "basic"; +12 havana exon 25245274 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686969"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-207"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00000936617"; exon_version "1"; tag "basic"; +12 havana CDS 25245274 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686969"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-207"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000510479"; protein_version "1"; tag "basic"; +12 havana start_codon 25245382 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686969"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-207"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana exon 25232558 25235226 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686969"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-207"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003927408"; exon_version "1"; tag "basic"; +12 havana CDS 25235209 25235226 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686969"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-207"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000510479"; protein_version "1"; tag "basic"; +12 havana stop_codon 25235206 25235208 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686969"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-207"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana five_prime_utr 25250764 25250929 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686969"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-207"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana five_prime_utr 25245385 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686969"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-207"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana three_prime_utr 25232558 25235205 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000686969"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-207"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; +12 havana transcript 25232591 25250929 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000556131"; transcript_version "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 1)"; +12 havana exon 25250751 25250929 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000556131"; transcript_version "2"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003903543"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 1)"; +12 havana exon 25245274 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000556131"; transcript_version "2"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00000936617"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 1)"; +12 havana CDS 25245274 25245384 . 
- 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000556131"; transcript_version "2"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000451856"; protein_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 1)"; +12 havana start_codon 25245382 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000556131"; transcript_version "2"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 1)"; +12 havana exon 25232591 25235226 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000556131"; transcript_version "2"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00002478081"; exon_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 1)"; +12 havana CDS 25235209 25235226 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000556131"; transcript_version "2"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000451856"; protein_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 1)"; +12 havana stop_codon 25235206 25235208 . 
- 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000556131"; transcript_version "2"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 1)"; +12 havana five_prime_utr 25250751 25250929 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000556131"; transcript_version "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 1)"; +12 havana five_prime_utr 25245385 25245395 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000556131"; transcript_version "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 1)"; +12 havana three_prime_utr 25232591 25235205 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000556131"; transcript_version "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 1)"; diff --git a/tests/tools/data/Homo_sapiens.GRCh38.105.kras.gff3 b/tests/tools/data/Homo_sapiens.GRCh38.105.kras.gff3 new file mode 100644 index 00000000..be16e852 --- /dev/null +++ b/tests/tools/data/Homo_sapiens.GRCh38.105.kras.gff3 @@ -0,0 +1,19 @@ +12 ensembl_havana gene 25205246 25250936 . - . 
ID=gene:ENSG00000133703;Name=KRAS;biotype=protein_coding;description=KRAS proto-oncogene%2C GTPase [Source:HGNC Symbol%3BAcc:HGNC:6407];gene_id=ENSG00000133703;logic_name=ensembl_havana_gene_homo_sapiens;version=14 +12 havana mRNA 25205246 25225773 . - . ID=transcript:ENST00000690406;Parent=gene:ENSG00000133703;Name=KRAS-211;biotype=nonsense_mediated_decay;transcript_id=ENST00000690406;version=1 +12 ensembl_havana mRNA 25205246 25250929 . - . ID=transcript:ENST00000256078;Parent=gene:ENSG00000133703;Name=KRAS-201;biotype=protein_coding;ccdsid=CCDS8703.1;tag=basic;transcript_id=ENST00000256078;transcript_support_level=1 (assigned to previous version 8);version=10 +12 ensembl_havana mRNA 25205246 25250929 . - . ID=transcript:ENST00000311936;Parent=gene:ENSG00000133703;Name=KRAS-202;biotype=protein_coding;ccdsid=CCDS8702.1;tag=basic;transcript_id=ENST00000311936;transcript_support_level=1 (assigned to previous version 7);version=8 +12 havana mRNA 25205250 25250908 . - . ID=transcript:ENST00000686877;Parent=gene:ENSG00000133703;Name=KRAS-206;biotype=nonsense_mediated_decay;transcript_id=ENST00000686877;version=1 +12 havana mRNA 25205258 25250935 . - . ID=transcript:ENST00000685328;Parent=gene:ENSG00000133703;Name=KRAS-205;biotype=protein_coding;ccdsid=CCDS8702.1;tag=basic;transcript_id=ENST00000685328;version=1 +12 havana mRNA 25205260 25250899 . - . ID=transcript:ENST00000693229;Parent=gene:ENSG00000133703;Name=KRAS-214;biotype=protein_coding;tag=basic;transcript_id=ENST00000693229;version=1 +12 havana mRNA 25205270 25250927 . - . ID=transcript:ENST00000687356;Parent=gene:ENSG00000133703;Name=KRAS-208;biotype=nonsense_mediated_decay;transcript_id=ENST00000687356;version=1 +12 havana mRNA 25205343 25250917 . - . ID=transcript:ENST00000692768;Parent=gene:ENSG00000133703;Name=KRAS-213;biotype=protein_coding;tag=basic;transcript_id=ENST00000692768;version=1 +12 havana mRNA 25206933 25250444 . - . 
ID=transcript:ENST00000688940;Parent=gene:ENSG00000133703;Name=KRAS-210;biotype=protein_coding;ccdsid=CCDS8702.1;tag=basic;transcript_id=ENST00000688940;version=1 +12 havana mRNA 25207948 25250929 . - . ID=transcript:ENST00000690804;Parent=gene:ENSG00000133703;Name=KRAS-212;biotype=nonsense_mediated_decay;transcript_id=ENST00000690804;version=1 +12 havana mRNA 25209178 25250936 . - . ID=transcript:ENST00000557334;Parent=gene:ENSG00000133703;Name=KRAS-204;biotype=protein_coding;tag=basic;transcript_id=ENST00000557334;transcript_support_level=5 (assigned to previous version 5);version=6 +12 havana lnc_RNA 25209673 25227997 . - . ID=transcript:ENST00000688228;Parent=gene:ENSG00000133703;Name=KRAS-209;biotype=retained_intron;transcript_id=ENST00000688228;version=1 +12 havana mRNA 25232558 25250929 . - . ID=transcript:ENST00000686969;Parent=gene:ENSG00000133703;Name=KRAS-207;biotype=protein_coding;tag=basic;transcript_id=ENST00000686969;version=1 +12 havana mRNA 25232591 25250929 . - . ID=transcript:ENST00000556131;Parent=gene:ENSG00000133703;Name=KRAS-203;biotype=protein_coding;tag=basic;transcript_id=ENST00000556131;transcript_support_level=1 (assigned to previous version 1);version=2 +12 havana ncRNA_gene 25210652 25211233 . + . ID=gene:ENSG00000274987;biotype=lncRNA;description=novel transcript%2C antisense to KRAS;gene_id=ENSG00000274987;logic_name=havana_homo_sapiens;version=1 +12 havana ncRNA_gene 25225103 25225665 . + . ID=gene:ENSG00000275197;biotype=lncRNA;description=novel transcript%2C antisense to KRAS;gene_id=ENSG00000275197;logic_name=havana_homo_sapiens;version=1 +6 havana pseudogene 54770583 54771134 . + . ID=gene:ENSG00000220635;Name=KRASP1;biotype=processed_pseudogene;description=KRAS proto-oncogene%2C GTPase pseudogene 1 [Source:HGNC Symbol%3BAcc:HGNC:6406];gene_id=ENSG00000220635;logic_name=havana_homo_sapiens;version=2 +6 havana pseudogenic_transcript 54770583 54771134 . + . 
ID=transcript:ENST00000407852;Parent=gene:ENSG00000220635;Name=KRASP1-201;biotype=processed_pseudogene;tag=basic;transcript_id=ENST00000407852;transcript_support_level=NA;version=2 diff --git a/tests/tools/test_convert_annotations_format.py b/tests/tools/test_convert_annotations_format.py new file mode 100644 index 00000000..0f837b30 --- /dev/null +++ b/tests/tools/test_convert_annotations_format.py @@ -0,0 +1,22 @@ +import os + +from tools.convert_annotations_format import convert_gff2_to_mavis, convert_gff3_to_mavis + + +def test_load_gff3(): + input = os.path.join(os.path.dirname(__file__), 'data', 'Homo_sapiens.GRCh38.105.chr.kras.gtf') + data = convert_gff2_to_mavis(input, False) + assert len(data['genes']) == 2 + assert sum([len(g['transcripts']) for g in data['genes']]) == 15 + exons = 0 + for gene in data['genes']: + for transcript in gene['transcripts']: + exons += len(transcript['exons']) + assert exons == 62 + + +def test_load_gtf(): + input = os.path.join(os.path.dirname(__file__), 'data', 'Homo_sapiens.GRCh38.105.kras.gff3') + data = convert_gff3_to_mavis(input, False) + assert len(data['genes']) == 4 + assert sum([len(g['transcripts']) for g in data['genes']]) == 15 From 0c0f0516b1cfa89f587cc2fde746d68ba41a10d2 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 2 Feb 2022 10:18:52 -0800 Subject: [PATCH 108/137] Add example with semi colon in attribute --- src/tools/convert_annotations_format.py | 21 ++++++++++++++----- .../data/Homo_sapiens.GRCh38.105.chr.kras.gtf | 2 +- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/tools/convert_annotations_format.py b/src/tools/convert_annotations_format.py index 13dfb8e5..fd3c2b3b 100644 --- a/src/tools/convert_annotations_format.py +++ b/src/tools/convert_annotations_format.py @@ -165,6 +165,7 @@ def parse_domain_list(row): def convert_pandas_gff_to_mavis(df) -> Dict: df['parent_type'] = df.Parent.str.split(':').str[0] genelike_features = {'gene', 'ncRNA_gene', 'biological_region', 
'pseudogene'} + consumed = set() def pull_alias_terms(row): aliases = [] @@ -185,6 +186,7 @@ def pull_alias_terms(row): 'transcripts': [], 'name': row['feature_id'] + '.' + row['version'], } + consumed.add(row['row_index']) logging.info(f'loaded {len(genes_by_id)} genes') transcripts_by_id = {} @@ -209,6 +211,7 @@ def pull_alias_terms(row): } genes_by_id[gene_id]['transcripts'].append(transcript) transcripts_by_id[feature_id] = transcript + consumed.add(row['row_index']) logging.info(f'loaded {len(transcripts_by_id)} transcripts') # now cds @@ -218,12 +221,13 @@ def pull_alias_terms(row): transcript_id = parent.split(':')[1] if transcript_id not in transcripts_by_id: raise KeyError( - f'failed to find parent transcript ({transcript_id}) skipping cds ({row["feature_id"]})' + f'failed to find parent transcript ({transcript_id}) skipping cds on line ({row["row_index"] + 1})' ) transcripts_by_id[transcript_id].update( {'cdna_coding_start': row['start'], 'cdna_coding_end': row['end']} ) cds_count += 1 + consumed.add(row['row_index']) logging.info(f'loaded {cds_count} cds regions') # exons exons_count = 0 @@ -232,7 +236,7 @@ def pull_alias_terms(row): transcript_id = parent.split(':')[1] if transcript_id not in transcripts_by_id: raise KeyError( - f'failed to find parent transcript ({transcript_id}) skipping exon ({row["feature_id"]})' + f'failed to find parent transcript ({transcript_id}) skipping exon ({row["feature_id"]}) on line {row["row_index"] + 1}' ) transcripts_by_id[transcript_id]['exons'].append( { @@ -242,9 +246,16 @@ def pull_alias_terms(row): } ) exons_count += 1 + consumed.add(row['row_index']) logging.info(f'loaded {exons_count} exons') + ignored_df = df[~df.row_index.isin(consumed)] + if ignored_df.shape[0]: + logging.warning( + f'Ignored {ignored_df.shape[0]} rows that did not match the expected types: {ignored_df.type.unique()}' + ) + result = {'genes': list(genes_by_id.values())} try: snakemake_validate( @@ -282,10 +293,10 @@ def 
convert_gff3_to_mavis(filename: str, no_alt) -> Dict: na_values=['.'] + PANDAS_DEFAULT_NA_VALUES, names=['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'], ) + df['row_index'] = df.index if no_alt: df = df[~df.seqid.str.startswith('GL')] df = df[~df.seqid.str.startswith('KI')] - df['row_index'] = df.index skip_types = { 'five_prime_UTR', @@ -394,10 +405,10 @@ def convert_gff2_to_mavis(filename: str, no_alt) -> Dict: def split_attributes(row): result = {} - for attr in row.attributes.split(';'): + for attr in row.attributes.split('";'): if not attr: continue - m = re.match(r'^\s*([^"]+)\s+"(.*)"$', attr) + m = re.match(r'^\s*([^"]+)\s+"(.*)"?$', attr) if not m: raise KeyError(f'attributes do not follow expected pattern: {attr}') result[m.group(1)] = m.group(2) diff --git a/tests/tools/data/Homo_sapiens.GRCh38.105.chr.kras.gtf b/tests/tools/data/Homo_sapiens.GRCh38.105.chr.kras.gtf index ce1f904f..1246b11c 100644 --- a/tests/tools/data/Homo_sapiens.GRCh38.105.chr.kras.gtf +++ b/tests/tools/data/Homo_sapiens.GRCh38.105.chr.kras.gtf @@ -28,7 +28,7 @@ 12 ensembl_havana three_prime_utr 25215437 25215440 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; 12 ensembl_havana three_prime_utr 25205246 25209911 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000256078"; transcript_version "10"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; 12 ensembl_havana transcript 25205246 25250929 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -12 ensembl_havana exon 25250751 25250929 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00003903543"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +12 ensembl_havana exon 25250751 25250929 . - . gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00003903543"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)";note "some note with; a semi-colon" 12 ensembl_havana exon 25245274 25245395 . - . 
gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00000936617"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; 12 ensembl_havana CDS 25245274 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000308495"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; 12 ensembl_havana start_codon 25245382 25245384 . 
- 0 gene_id "ENSG00000133703"; gene_version "14"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; From 2987227f5e3c5f5839d3ac86fd24bbf559b845eb Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Wed, 2 Feb 2022 15:04:35 -0800 Subject: [PATCH 109/137] revert old changes --- src/mavis/tools/vcf.py | 7 ++++-- tests/unit/test_tools_vcf.py | 43 ++++++++++++++++++++++++++++-------- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index ff4c0cb0..f756df43 100644 --- a/src/mavis/tools/vcf.py +++ b/src/mavis/tools/vcf.py @@ -156,8 +156,6 @@ def convert_record(record: VcfRecordType) -> List[Dict]: if info.get('SVTYPE') == 'BND': chr2, end, orient1, orient2, ref, alt = parse_bnd_alt(alt) - if end == 0: - end = 1 # telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 std_row[COLUMNS.break1_orientation] = orient1 std_row[COLUMNS.break2_orientation] = orient2 std_row[COLUMNS.untemplated_seq] = alt @@ -204,6 +202,11 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1], } ) + if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: + # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 + std_row.update({'break1_position_end': 1}) + if std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1: + std_row.update({'break2_position_end': 1}) if 'SVTYPE' in info: std_row[COLUMNS.event_type] = info['SVTYPE'] diff --git a/tests/unit/test_tools_vcf.py b/tests/unit/test_tools_vcf.py index c4eac443..2036e656 100644 --- 
a/tests/unit/test_tools_vcf.py +++ b/tests/unit/test_tools_vcf.py @@ -11,10 +11,10 @@ def test_read_vcf(): def test_convert_record(): - variant = VcfRecordType( - 1, - 0, - 'chr14_KI270722v1_random', + variant_imprecise = VcfRecordType( + id='mock-BND-imprecise', + pos=0, + chrom='chr14_KI270722v1_random', alts=['N[chr17_GL000205v2_random:0['], ref='N', info=VcfInfoType( @@ -29,8 +29,33 @@ def test_convert_record(): AF="1", ), ) - records = convert_record(variant) - assert len(records) == 1 - record = records[0] - assert record.get('break2_position_end') == 1 - assert record.get('break2_chromosome') == 'chr17_GL000205v2_random' + variant_precise = VcfRecordType( + id='mock-BND-precise', + pos=0, + chrom='chr14_KI270722v1_random', + alts=[']chrUn_GL000216v2:142821]N'], + ref='N', + info=VcfInfoType( + IMPRECISE=False, + SVMETHOD="Snifflesv1.0.11", + SVTYPE="BND", + SUPTYPE="SR", + SVLEN="0", + STRANDS="+-", + RE="5", + REF_strand="0,0", + AF="1", + ), + ) + imprecise_records = convert_record(variant_imprecise) + assert len(imprecise_records) == 1 + imprecise_records = imprecise_records[0] + assert imprecise_records.get('break1_position_end') == 1 + + precise_records = convert_record(variant_precise) + assert len(precise_records) == 1 + precise_records = precise_records[0] + assert precise_records.get('break1_position_end') == 1 + + assert precise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' + assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' From 7c939e38c7db9739ab719f7d1b5751d44d9a6a5e Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Sat, 5 Feb 2022 20:55:13 -0800 Subject: [PATCH 110/137] Support transform of gff and gtf files --- src/tools/convert_annotations_format.py | 628 +- tests/data/example_genes.json | 8038 +---------------- .../data/Homo_sapiens.GRCh38.105.kras.gff3 | 19 - .../tools/data/Homo_sapiens.GRCh38.kras.gff3 | 163 + .../data/Homo_sapiens.GRCh38.kras.gff3.json | 1 + ....kras.gtf => 
Homo_sapiens.GRCh38.kras.gtf} | 0 .../data/Homo_sapiens.GRCh38.kras.gtf.json | 1 + tests/tools/data/K02718.1.gff3 | 24 + tests/tools/data/K02718.1.gff3.json | 243 + tests/tools/data/K02718.1.gtf | 32 + tests/tools/data/K02718.1.gtf.json | 188 + tests/tools/data/example_genes.v2.json | 7700 ++++++++++++++++ tests/tools/data/example_genes.v3.json | 1 + .../tools/test_convert_annotations_format.py | 71 +- 14 files changed, 8912 insertions(+), 8197 deletions(-) delete mode 100644 tests/tools/data/Homo_sapiens.GRCh38.105.kras.gff3 create mode 100644 tests/tools/data/Homo_sapiens.GRCh38.kras.gff3 create mode 100644 tests/tools/data/Homo_sapiens.GRCh38.kras.gff3.json rename tests/tools/data/{Homo_sapiens.GRCh38.105.chr.kras.gtf => Homo_sapiens.GRCh38.kras.gtf} (100%) create mode 100644 tests/tools/data/Homo_sapiens.GRCh38.kras.gtf.json create mode 100644 tests/tools/data/K02718.1.gff3 create mode 100644 tests/tools/data/K02718.1.gff3.json create mode 100644 tests/tools/data/K02718.1.gtf create mode 100644 tests/tools/data/K02718.1.gtf.json create mode 100644 tests/tools/data/example_genes.v2.json create mode 100644 tests/tools/data/example_genes.v3.json diff --git a/src/tools/convert_annotations_format.py b/src/tools/convert_annotations_format.py index fd3c2b3b..28ca9dac 100644 --- a/src/tools/convert_annotations_format.py +++ b/src/tools/convert_annotations_format.py @@ -2,11 +2,14 @@ import json import logging import re -from typing import Dict +import traceback +from typing import Dict, Tuple import pandas as pd -import pkg_resources -from snakemake.utils import validate as snakemake_validate +from mavis.annotate.file_io import parse_annotations_json + +# pd.set_option('display.width', 250) +pd.options.display.width = 0 PANDAS_DEFAULT_NA_VALUES = [ '-1.#IND', @@ -25,6 +28,85 @@ ] +GFF_GENELIKE_FEATURES = { + 'gene', + 'ncRNA_gene', + 'biological_region', + 'pseudogene', + 'enhancer', + 'promoter', + 'region', + 'protein_binding_site', +} +GFF_RNALIKE_FEATURES = { + 
'rna', + 'mRNA', + 'lncRNA', + 'transcript', + 'lnc_RNA', + 'pseudogenic_transcript', + 'snRNA', + 'miRNA', + 'unconfirmed_transcript', + 'ncRNA', + 'snoRNA', + 'scRNA', +} +GFF_ALL_FEATURES = GFF_GENELIKE_FEATURES | GFF_RNALIKE_FEATURES | {'CDS', 'exon'} +GFF_ID_DELIMITER = '_' +GFF_ATTRS = [ + 'Alias', + 'bound_moiety', + 'DBxref', + 'Derives_from', + 'exon_id', + 'exon_number', + 'exon_version', + 'function', + 'Gap', + 'gene_id', + 'gene_name', + 'gene_version', + 'ID', + 'Name', + 'Note', + 'old-name', + 'Ontology_term', + 'Parent', + 'product', + 'protein_id', + 'protein_version', + 'rank', + 'standard_name', + 'Target', + 'transcript_id', + 'transcript_name', + 'transcript_version', + 'version', +] +GFF_KEY_COLS = ['feature_id', 'type', 'seqid', 'strand'] + + +def agg_strings_unique(series): + series = series.fillna('') + return ';'.join([s for s in series.astype(str).unique()]) + + +def strip_empty_fields(input_obj): + """Remove all empty string fields from some dictionary object to reduce the size""" + + if isinstance(input_obj, dict): + result = {} + for k, v in input_obj.items(): + if v == '' or (isinstance(v, list) and not len(v)): + continue + result[k] = strip_empty_fields(v) + return result + elif isinstance(input_obj, list): + return [strip_empty_fields(v) for v in input_obj] + return input_obj + + def convert_tab_to_json(filepath: str) -> Dict: """ given a file in the std input format (see below) reads and return a list of genes (and sub-objects) @@ -42,7 +124,7 @@ def convert_tab_to_json(filepath: str) -> Dict: +-----------------------+---------------------------+-----------------------------------------------------------+ | cdna_coding_end | 150 | where translation terminates | +-----------------------+---------------------------+-----------------------------------------------------------+ - | genomic_exon_ranges | 100-201;334-412;779-830 | semi-colon demitited exon start/ends | + | genomic_exon_ranges | 100-201;334-412;779-830 | semi-colon 
delimited exon start/ends | +-----------------------+---------------------------+-----------------------------------------------------------+ | AA_domain_ranges | DBD:220-251,260-271 | semi-colon delimited list of domains | +-----------------------+---------------------------+-----------------------------------------------------------+ @@ -162,117 +244,403 @@ def parse_domain_list(row): return {'genes': list(genes.values())} +def strip_id_field(feature_id) -> Tuple[str, str]: + """ + Remove type prefix from ID if applicable + """ + prefix_map = {k: k for k in ['gene', 'transcript', 'cds', 'exon']} + prefix_map.update({k: 'gene' for k in GFF_GENELIKE_FEATURES}) + prefix_map.update({k: 'transcript' for k in GFF_RNALIKE_FEATURES}) + if feature_id: + for prefix in prefix_map: + if feature_id.lower().startswith(prefix): + return prefix_map.get(prefix, prefix), feature_id[len(prefix) + 1 :] + return '', feature_id + + +def parse_gff_id(row): + """ + Get the unique ID of the current row/feature + """ + _, feature_id = strip_id_field(row.ID if 'ID' in row else '') + + if not feature_id: + if row.type == 'exon' and 'exon_id' in row: + return row.exon_id + elif row.type == 'gene' and 'gene_id' in row: + return row.gene_id + elif row.type == 'transcript' and 'transcript_id' in row: + return row.transcript_id + elif row.type.lower() == 'cds' and 'protein_id' in row: + return row.protein_id + return feature_id + + +def pull_alias_terms(row): + aliases = [] + for field in ['Name', 'standard_name', 'old-name']: + if row[field] and not pd.isnull(row[field]): + aliases.extend(row[field].split(';')) + if row.Alias and not pd.isnull(row.Alias): + aliases.extend(row.Alias.split(',')) + return [a for a in aliases if a != row.feature_id] + + +class NumberedFeatureGenerator: + def __init__(self): + self.counter = 0 + + def __call__(self, features, parent_id, prefix='-T'): + result = f'{parent_id}{prefix}{self.counter}' + while result in features: + self.counter += 1 + result = 
f'{parent_id}{prefix}{self.counter}' + return result + + +def split_col_into_rows(df, col, delimiter=',', new_col=None): + """ + Given some string column in a dataframe, split the column by the delimiter and for each resulting value duplicate the existing row + """ + if not new_col: + new_col = col + new_df = df.copy().reset_index() + + s = new_df[col].str.split(delimiter).apply(pd.Series, 1).stack() + s.index = s.index.droplevel(-1) + s.name = new_col + + if new_col == col: + new_df = new_df.drop(columns=[new_col]) + return new_df.merge(s, left_index=True, right_index=True) + + +def print_marker(df, links_df=None): + stack = traceback.extract_stack(limit=2)[0] + print(f'{stack.filename}:{stack.lineno} {stack.name}') + print(df.shape, links_df.shape if links_df is not None else '') + print(df.groupby(['type']).agg({'feature_id': 'count', 'feature_id': 'unique'}).reset_index()) + + +def fix_dangling_parent_reference(nodes_df, links_df): + """ + Insert a pseudo element for any parents referenced by an element that do not already have their own line/definition + + Returns the elements to be added to the node definitions + """ + dangling_refs = links_df.rename( + { + 'parent_id': 'feature_id', + 'parent_type': 'type', + 'feature_id': 'child_id', + 'type': 'child_type', + } + ).merge(nodes_df[GFF_KEY_COLS], how='left', indicator=True) + dangling_refs = dangling_refs[dangling_refs._merge == 'left_only'] + # now join back to its children to create coordinates that are the interval covering all connected children + dangling_refs = dangling_refs.merge( + nodes_df[GFF_KEY_COLS + ['start', 'end', 'row_index']].rename( + columns={'feature_id': 'child_id', 'type': 'child_type'} + ) + ) + dangling_refs = ( + dangling_refs.groupby(GFF_KEY_COLS) + .agg( + { + 'start': 'min', + 'end': 'max', + 'row_index': agg_strings_unique, + } + ) + .reset_index() + ) + if dangling_refs.shape[0]: + logging.warning(f'Inserting {dangling_refs.shape[0]} missing parent element definitions') + + 
return pd.concat([nodes_df, dangling_refs]).reset_index(drop=True), links_df + + +def fix_orphan_elements(nodes_df, links_df): + """ + When there are non-gene elements that do not have a parent assigned to them, connect them to a + inserted 'mock' gene instead + """ + links_df = links_df.copy() + + links_df['_orphan'] = False + links_df.loc[ + (links_df.parent_id == '') & (links_df.type.isin({'CDS', 'exon'})), '_orphan' + ] = True + links_df.loc[links_df._orphan, 'parent_id'] = 'G' + GFF_ID_DELIMITER + links_df.feature_id + links_df.loc[links_df._orphan, 'parent_type'] = 'gene' + + new_genes_df = ( + links_df[links_df._orphan] + .merge(nodes_df[GFF_KEY_COLS + ['start', 'end']]) + .rename( + columns={ + 'feature_id': 'child_id', + 'type': 'child_type', + 'parent_id': 'feature_id', + 'parent_type': 'type', + } + ) + ) + new_genes_df = ( + new_genes_df.groupby(GFF_KEY_COLS) + .agg({'start': 'min', 'end': 'max', 'row_index': agg_strings_unique}) + .reset_index() + ) + + links_df = links_df.drop(columns=['_orphan']) + if new_genes_df.shape[0]: + logging.warning( + f'Inserting {new_genes_df.shape[0]} new genes to connect to orphan elements' + ) + return pd.concat([nodes_df, new_genes_df]).reset_index(drop=True), links_df + + +def insert_missing_transcripts(nodes_df, links_df): + """ + For any cds elements with a direct parent gene, create a transcript and link them through that instead + """ + direct_links_df = links_df[(links_df.parent_type == 'gene') & (links_df.type != 'transcript')] + rest_links_df = links_df[(links_df.parent_type != 'gene') | (links_df.type == 'transcript')] + + src_transcript_df = direct_links_df.copy() + src_transcript_df['feature_id'] = src_transcript_df.parent_id + GFF_ID_DELIMITER + 'T' + src_transcript_df['type'] = 'transcript' + + tgt_transcript_df = direct_links_df.copy() + tgt_transcript_df['parent_id'] = tgt_transcript_df.parent_id + GFF_ID_DELIMITER + 'T' + tgt_transcript_df['parent_type'] = 'transcript' + + links_df = 
pd.concat([rest_links_df, src_transcript_df, tgt_transcript_df]).reset_index( + drop=True + ) + + if direct_links_df.shape[0]: + logging.warning( + f'Inserting {direct_links_df.shape[0]} transcripts between lower element to gene connections' + ) + + return fix_dangling_parent_reference(nodes_df, links_df) + + +def validate_gff_coordinates(nodes_df, links_df): + """ + Check that all child elements have coordinates within the coordinates of their parent elements + """ + df = links_df.merge(nodes_df[GFF_KEY_COLS + ['start', 'end']]).merge( + nodes_df[GFF_KEY_COLS + ['start', 'end']].rename( + columns={ + 'feature_id': 'parent_id', + 'type': 'parent_type', + 'start': 'parent_start', + 'end': 'parent_end', + } + ) + ) + df['error'] = False + df.loc[(df.parent_start > df.start) | (df.parent_end < df.end), 'error'] = True + + errors = df[df.error] + if errors.shape[0]: + for _, row in errors.iterrows(): + logging.debug( + f'{row.feature_id} ({row.start}-{row.end}) is not within its parent element {row.parent_id} ({row.parent_start}-{row.parent_end})' + ) + raise ValueError(f'{errors.shape[0]} entries with impossible coordinates') + + def convert_pandas_gff_to_mavis(df) -> Dict: - df['parent_type'] = df.Parent.str.split(':').str[0] - genelike_features = {'gene', 'ncRNA_gene', 'biological_region', 'pseudogene'} - consumed = set() - - def pull_alias_terms(row): - aliases = [] - if row['Name']: - aliases.append(row['Name']) - if row['Alias']: - aliases.extend(row['Alias'].split(',')) - return aliases + df['error'] = '' + df.loc[~df.type.isin(GFF_ALL_FEATURES), 'error'] = 'unrecognized type ' + df.type + df = split_col_into_rows(df, 'Parent', ',') + # simplify the type + df['biotype'] = df.type.fillna('') + + def simplify_type(t): + if t in GFF_GENELIKE_FEATURES: + return 'gene' + elif t in GFF_RNALIKE_FEATURES: + return 'transcript' + return t + + df['type'] = df.type.apply(simplify_type).fillna('') + df['parent_type'] = ( + df.Parent.apply(lambda x: 
strip_id_field(x)[0]).fillna('').apply(simplify_type) + ) + df['parent_id'] = df.Parent.apply(lambda x: strip_id_field(x)[1]).fillna('') + df.loc[df.type == 'gene', 'parent_type'] = 'seq' + df.loc[df.type == 'gene', 'parent_id'] = df.seqid + + if df[df.error != ''].shape[0]: + logging.warning( + f'dropping {df[df.error != ""].shape[0]} features that did not match an expected type: {df[df.error != ""].type.unique()}' + ) + df = df[df.error == ''] + + if df[df.feature_id == ''].shape[0]: + logging.warning(f'dropping {df[df.feature_id == ""].shape[0]} rows for missing ID') + df = df[df.feature_id != ''] + df['regions'] = df.start.astype(str) + '-' + df.end.astype(str) + + # use the feature key to group elements that are discontinuous + links_df = ( + df.sort_values(['seqid', 'start']) + .groupby(GFF_KEY_COLS + ['parent_type', 'parent_id']) + .agg({'row_index': agg_strings_unique}) + .reset_index() + ) + nodes_df = ( + df.sort_values(['seqid', 'start']) + .groupby(GFF_KEY_COLS) + .agg( + { + 'start': 'min', + 'end': 'max', + 'regions': agg_strings_unique, + 'version': agg_strings_unique, + 'Note': agg_strings_unique, + 'Name': agg_strings_unique, + 'Alias': agg_strings_unique, + 'biotype': agg_strings_unique, + 'exon_number': agg_strings_unique, + 'row_index': agg_strings_unique, + 'source': agg_strings_unique, + 'standard_name': agg_strings_unique, + 'old-name': agg_strings_unique, + } + ) + .reset_index() + ) + nodes_df, links_df = fix_dangling_parent_reference(nodes_df, links_df) + nodes_df, links_df = fix_orphan_elements(nodes_df, links_df) + nodes_df, links_df = insert_missing_transcripts(nodes_df, links_df) + validate_gff_coordinates(nodes_df, links_df) + + df = nodes_df.merge(links_df, how='outer', on=GFF_KEY_COLS).fillna('') + + def feature_key(row, parent=False): + if not parent: + return tuple([row[c] for c in ['feature_id', 'type', 'seqid', 'strand']]) + else: + return tuple([row[c] for c in ['parent_id', 'parent_type', 'seqid', 'strand']]) genes_by_id = {} 
- for row in df[df.type.isin(genelike_features)].to_dict('records'): - genes_by_id[row['feature_id']] = { - 'start': row['start'], - 'end': row['end'], - 'chr': row['seqid'], + for _, row in df[df.type == 'gene'].iterrows(): + genes_by_id[feature_key(row)] = { + 'start': row.start, + 'end': row.end, + 'chr': row.seqid, 'aliases': pull_alias_terms(row), - 'strand': row['strand'], + 'strand': row.strand, 'transcripts': [], - 'name': row['feature_id'] + '.' + row['version'], + 'name': row.feature_id, + 'version': row.version, + 'biotype': row.biotype, + 'note': row.Note, } - consumed.add(row['row_index']) logging.info(f'loaded {len(genes_by_id)} genes') transcripts_by_id = {} + df = df.fillna('') - for row in df[df.parent_type == 'gene'].to_dict('records'): - for parent in row['Parent'].split(','): - gene_id = parent.split(':')[1] - if gene_id not in genes_by_id: - raise KeyError( - f'cannot find gene ({gene_id}) skipping transcript ({row["feature_id"]})' - ) - feature_id = row['feature_id'] - transcript = { - 'name': feature_id + '.' 
+ row['version'], - 'start': row['start'], - 'end': row['end'], - 'aliases': pull_alias_terms(row), - 'domains': [], - 'exons': [], - 'cdna_coding_start': None, - 'cdna_coding_end': None, - } - genes_by_id[gene_id]['transcripts'].append(transcript) - transcripts_by_id[feature_id] = transcript - consumed.add(row['row_index']) + for _, row in df[df.type == 'transcript'].iterrows(): + parent_key = feature_key(row, True) + if parent_key not in genes_by_id: + raise KeyError( + f'cannot find gene ({row.parent_id}) skipping feature ({row.feature_id}) on line ({row.row_index})' + ) + feature_id = row.feature_id + transcript = { + 'name': feature_id, + 'start': row.start, + 'end': row.end, + 'aliases': pull_alias_terms(row), + 'domains': [], + 'exons': [], + 'version': row.version, + 'note': row.Note, + 'biotype': row.biotype, + } + genes_by_id[parent_key]['transcripts'].append(transcript) + transcripts_by_id[feature_key(row)] = transcript - logging.info(f'loaded {len(transcripts_by_id)} transcripts') # now cds - cds_count = 0 - for row in df[df.type == 'CDS'].to_dict('records'): - for parent in row['Parent'].split(','): - transcript_id = parent.split(':')[1] - if transcript_id not in transcripts_by_id: - raise KeyError( - f'failed to find parent transcript ({transcript_id}) skipping cds on line ({row["row_index"] + 1})' - ) - transcripts_by_id[transcript_id].update( - {'cdna_coding_start': row['start'], 'cdna_coding_end': row['end']} + cds_by_id = {} + for _, row in df[df.type == 'CDS'].iterrows(): + parent_key = feature_key(row, True) + if parent_key not in transcripts_by_id: + print(row) + raise KeyError( + f'failed to find parent transcript ({row.parent_id}) skipping cds ({row.feature_id}) on line ({row.row_index})' ) - cds_count += 1 - consumed.add(row['row_index']) - logging.info(f'loaded {cds_count} cds regions') + parent = transcripts_by_id[parent_key] + parent.setdefault('translations', []) + cds = { + 'start': row.start, + 'end': row.end, + 'name': row.feature_id, 
+ 'aliases': pull_alias_terms(row), + 'version': row.version, + 'note': row.Note, + 'biotype': row.biotype, + } + parent['translations'].append(cds) + cds_by_id[feature_key(row)] = cds + + logging.info(f'loaded {len(transcripts_by_id)} transcripts') + logging.info(f'loaded {len(cds_by_id)} cds regions') # exons - exons_count = 0 - for row in df[df.type == 'exon'].to_dict('records'): - for parent in row['Parent'].split(','): - transcript_id = parent.split(':')[1] - if transcript_id not in transcripts_by_id: - raise KeyError( - f'failed to find parent transcript ({transcript_id}) skipping exon ({row["feature_id"]}) on line {row["row_index"] + 1}' - ) - transcripts_by_id[transcript_id]['exons'].append( - { - 'start': row['start'], - 'end': row['end'], - 'name': row['feature_id'] + '.' + row['version'], - } + exons_by_id = {} + + for _, row in df[df.type == 'exon'].iterrows(): + parent_key = feature_key(row, True) + if parent_key not in transcripts_by_id: + raise KeyError( + f'failed to find parent transcript ({row.parent_id}) skipping exon ({row["feature_id"]}) index={row["row_index"]}' ) - exons_count += 1 - consumed.add(row['row_index']) + exon = { + 'start': row.start, + 'end': row.end, + 'name': row.feature_id, + 'version': row.version, + 'number': row.exon_number, + } + transcripts_by_id[parent_key]['exons'].append(exon) + exons_by_id[feature_key(row)] = exon - logging.info(f'loaded {exons_count} exons') + logging.info(f'loaded {len(exons_by_id)} exons') - ignored_df = df[~df.row_index.isin(consumed)] + ignored_df = df[~df.type.isin({'exon', 'CDS', 'transcript', 'gene'})] if ignored_df.shape[0]: logging.warning( f'Ignored {ignored_df.shape[0]} rows that did not match the expected types: {ignored_df.type.unique()}' ) - result = {'genes': list(genes_by_id.values())} + result = strip_empty_fields({'genes': list(genes_by_id.values())}) + try: - snakemake_validate( - result, pkg_resources.resource_filename('mavis.annotate', 'annotations_schema.json') - ) + 
parse_annotations_json(result) except Exception as err: short_msg = '. '.join( [line for line in str(err).split('\n') if line.strip()][:3] ) # these can get super long + with open('tmp_out.json', 'w') as fh: + fh.write(json.dumps(result, sort_keys=True, indent=' ')) raise AssertionError(short_msg) + # re-strip (mavis adds defaults) + result = strip_empty_fields({'genes': list(genes_by_id.values())}) return result -def convert_gff3_to_mavis(filename: str, no_alt) -> Dict: +def convert_gff3_to_mavis(filename: str, no_alt=False) -> Dict: """ Convert an input gff3 file to the JSON format accepted by MAVIS """ + logging.info(f'reading: {filename}') df = pd.read_csv( filename, sep='\t', @@ -304,50 +672,36 @@ def convert_gff3_to_mavis(filename: str, no_alt) -> Dict: } df = df[~df.type.isin(skip_types)] - attribute_columns = [ - 'ID', - 'Name', - 'Alias', - 'Parent', - 'Target', - 'Gap', - 'Derives_from', - 'Note', - 'DBxref', - 'Ontology_term', - 'rank', - 'version', - 'exon_id', - ] - def split_attributes(row): result = {} for attr in row.attributes.split(';'): name, value = attr.split('=') result[name] = value - return [row.row_index] + [result.get(c, '') for c in attribute_columns] + return [row.row_index] + [result.get(c, '') for c in GFF_ATTRS] prev_size = df.shape[0] attrs_df = pd.DataFrame( df.apply(split_attributes, axis=1).tolist(), - columns=['row_index'] + attribute_columns, + columns=['row_index'] + GFF_ATTRS, ) assert prev_size == attrs_df.shape[0] df = df.merge(attrs_df, on=['row_index']) + df = df.drop(columns=['attributes']) assert prev_size == df.shape[0] - df['feature_id'] = df['ID'].apply(lambda id: id.split(':')[1] if ':' in id else '') + df['feature_id'] = df.apply(parse_gff_id, axis=1) df.loc[(df.feature_id == '') & (df.type == 'exon'), 'feature_id'] = df.exon_id df = df[df.feature_id != ''] - df['strand'] = df.strand.fillna('?') + df['strand'] = df.strand.fillna('') return convert_pandas_gff_to_mavis(df) -def convert_gff2_to_mavis(filename: str, 
no_alt) -> Dict: +def convert_gff2_to_mavis(filename: str, no_alt=False) -> Dict: """ Convert an input gff2/gtf file to the JSON format accepted by MAVIS """ + logging.info(f'reading: {filename}') df = pd.read_csv( filename, sep='\t', @@ -392,47 +746,34 @@ def convert_gff2_to_mavis(filename: str, no_alt) -> Dict: } df = df[~df.type.isin(skip_types)] - attribute_columns = [ - 'gene_id', - 'gene_version', - 'gene_name', - 'transcript_id', - 'transcript_version', - 'transcript_name', - 'exon_id', - 'exon_version', - ] - def split_attributes(row): result = {} for attr in row.attributes.split('";'): - if not attr: + if not attr.strip(): continue m = re.match(r'^\s*([^"]+)\s+"(.*)"?$', attr) if not m: raise KeyError(f'attributes do not follow expected pattern: {attr}') result[m.group(1)] = m.group(2) - return [row.row_index] + [result.get(c, '') for c in attribute_columns] + return [row.row_index] + [result.get(c, '') for c in GFF_ATTRS] prev_size = df.shape[0] attrs_df = pd.DataFrame( df.apply(split_attributes, axis=1).tolist(), - columns=['row_index'] + attribute_columns, + columns=['row_index'] + GFF_ATTRS, ) assert prev_size == attrs_df.shape[0] df = df.merge(attrs_df, on=['row_index']) assert prev_size == df.shape[0] + df = df.drop(columns=['attributes']) df['Alias'] = '' - df['feature_id'] = '' - df.loc[df.type == 'exon', 'feature_id'] = df.exon_id - df.loc[df.type == 'gene', 'feature_id'] = df.gene_id - df.loc[df.type == 'transcript', 'feature_id'] = df.transcript_id + df['feature_id'] = df.apply(parse_gff_id, axis=1) df['Name'] = '' df.loc[df.type == 'gene', 'Name'] = df.gene_name df.loc[df.type == 'transcript', 'Name'] = df.transcript_name - df['strand'] = df.strand.fillna('?') + df['strand'] = df.strand.fillna('') df['Parent'] = '' df.loc[(df.type == 'transcript') & (df.gene_id != ''), 'Parent'] = 'gene:' + df.gene_id @@ -442,23 +783,53 @@ def split_attributes(row): df.loc[(df.type == 'CDS') & (df.transcript_id != ''), 'Parent'] = ( 'transcript:' + 
df.transcript_id ) + df.loc[ + (df.type == 'CDS') & df.Parent.str.startswith('transcript:unassigned_transcript_'), 'Parent' + ] = '' + df.loc[(df.type == 'CDS') & (df.Parent == '') & (df.gene_id != ''), 'Parent'] = ( + 'gene:' + df.gene_id + ) df['version'] = '' df.loc[df.type == 'transcript', 'version'] = df.transcript_version df.loc[df.type == 'exon', 'version'] = df.exon_version df.loc[df.type == 'gene', 'version'] = df.gene_version + df.loc[df.type == 'CDS', 'version'] = df.protein_version - df['strand'] = df.strand.fillna('?') + df['strand'] = df.strand.fillna('') return convert_pandas_gff_to_mavis(df) +def convert_mavis_json_2to3(filename): + logging.info(f'loading: {filename}') + with open(filename, 'r') as fh: + content = json.load(fh) + + # move translations into sep object + for gene in content['genes']: + for transcript in gene.get('transcripts', []): + if any(transcript.get(k) for k in ['cdna_coding_start', 'cdna_coding_end', 'domains']): + transcript['translations'] = [ + { + 'cdna_coding_start': transcript['cdna_coding_start'], + 'cdna_coding_end': transcript['cdna_coding_end'], + 'domains': transcript['domains'], + } + ] + del transcript['domains'] + del transcript['cdna_coding_start'] + del transcript['cdna_coding_end'] + parse_annotations_json(content) + content = strip_empty_fields(content) + return content + + if __name__ == '__main__': - logging.basicConfig(format='{message}', style='{', level=logging.INFO) parser = argparse.ArgumentParser() parser.add_argument( 'input', help='path to the tab-delimated mavis v2 style reference annotations file' ) - parser.add_argument('--input_type', default='v2', choices=['v2', 'gff3', 'gtf']) + parser.add_argument('--input_type', default='v2', choices=['v2-tab', 'v2-json', 'gff3', 'gtf']) parser.add_argument('output', help='path to the JSON output file') parser.add_argument( '--keep_alt', @@ -466,11 +837,18 @@ def split_attributes(row): action='store_true', default=False, ) + parser.add_argument( + 
'--log_level', choices=['INFO', 'DEBUG', 'WARNING', 'ERROR'], default='INFO' + ) args = parser.parse_args() - if args.input_type == 'v2': + logging.basicConfig(format='{message}', style='{', level=logging.getLevelName(args.log_level)) + + if args.input_type == 'v2-tab': annotations = convert_tab_to_json(args.input) + elif args.input_type == 'v2-json': + annotations = convert_mavis_json_2to3(args.input) elif args.input_type == 'gtf': annotations = convert_gff2_to_mavis(args.input, not args.keep_alt) else: diff --git a/tests/data/example_genes.json b/tests/data/example_genes.json index f1a6cf8e..470ac202 100644 --- a/tests/data/example_genes.json +++ b/tests/data/example_genes.json @@ -1,8037 +1 @@ -{ - "genes": [ - { - "aliases": [ - "EGFR" - ], - "chr": "7", - "end": 55324313, - "name": "ENSG00000146648", - "start": 55086714, - "strand": "+", - "transcripts": [ - { - "cdna_coding_end": 3533, - "cdna_coding_start": 258, - "domains": [ - { - "name": "PIRSF000619", - "regions": [ - { - "end": 1090, - "start": 1 - } - ] - }, - { - "name": "PF07714", - "regions": [ - { - "end": 920, - "start": 669 - } - ] - }, - { - "name": "SSF52058", - "regions": [ - { - "end": 191, - "start": 28 - }, - { - "end": 475, - "start": 283 - } - ] - }, - { - "name": "PF00757", - "regions": [ - { - "end": 293, - "start": 141 - } - ] - }, - { - "name": "PS50011", - "regions": [ - { - "end": 934, - "start": 667 - } - ] - }, - { - "name": "PS50311", - "regions": [ - { - "end": 219, - "start": 145 - } - ] - }, - { - "name": "SSF57184", - "regions": [ - { - "end": 290, - "start": 142 - }, - { - "end": 593, - "start": 460 - } - ] - }, - { - "name": "PR00109", - "regions": [ - { - "end": 758, - "start": 745 - }, - { - "end": 800, - "start": 782 - }, - { - "end": 841, - "start": 831 - }, - { - "end": 872, - "start": 850 - }, - { - "end": 916, - "start": 894 - } - ] - }, - { - "name": "SSF56112", - "regions": [ - { - "end": 975, - "start": 651 - } - ] - }, - { - "name": "PF01030", - "regions": [ - { 
- "end": 141, - "start": 57 - }, - { - "end": 435, - "start": 316 - } - ] - }, - { - "name": "SM00220", - "regions": [ - { - "end": 924, - "start": 667 - } - ] - }, - { - "name": "SM00261", - "regions": [ - { - "end": 225, - "start": 183 - }, - { - "end": 502, - "start": 451 - }, - { - "end": 556, - "start": 507 - } - ] - }, - { - "name": "SM00219", - "regions": [ - { - "end": 923, - "start": 667 - } - ] - }, - { - "name": "PF00069", - "regions": [ - { - "end": 919, - "start": 667 - } - ] - } - ], - "end": 55270769, - "exons": [ - { - "end": 55087058, - "name": null, - "start": 55086714 - }, - { - "end": 55210130, - "name": null, - "start": 55209979 - }, - { - "end": 55211181, - "name": null, - "start": 55210998 - }, - { - "end": 55219055, - "name": null, - "start": 55218987 - }, - { - "end": 55220357, - "name": null, - "start": 55220239 - }, - { - "end": 55221845, - "name": null, - "start": 55221704 - }, - { - "end": 55223639, - "name": null, - "start": 55223523 - }, - { - "end": 55224352, - "name": null, - "start": 55224226 - }, - { - "end": 55224525, - "name": null, - "start": 55224452 - }, - { - "end": 55225446, - "name": null, - "start": 55225356 - }, - { - "end": 55228031, - "name": null, - "start": 55227832 - }, - { - "end": 55229324, - "name": null, - "start": 55229192 - }, - { - "end": 55231516, - "name": null, - "start": 55231426 - }, - { - "end": 55233130, - "name": null, - "start": 55232973 - }, - { - "end": 55238906, - "name": null, - "start": 55238868 - }, - { - "end": 55240817, - "name": null, - "start": 55240676 - }, - { - "end": 55241736, - "name": null, - "start": 55241614 - }, - { - "end": 55242513, - "name": null, - "start": 55242415 - }, - { - "end": 55249171, - "name": null, - "start": 55248986 - }, - { - "end": 55259567, - "name": null, - "start": 55259412 - }, - { - "end": 55260534, - "name": null, - "start": 55260459 - }, - { - "end": 55266556, - "name": null, - "start": 55266410 - }, - { - "end": 55268106, - "name": null, - "start": 
55268009 - }, - { - "end": 55269048, - "name": null, - "start": 55268881 - }, - { - "end": 55269475, - "name": null, - "start": 55269428 - }, - { - "end": 55270769, - "name": null, - "start": 55270210 - } - ], - "is_best_transcript": false, - "name": "ENST00000455089", - "start": 55086714 - }, - { - "cdna_coding_end": 2133, - "cdna_coding_start": 247, - "domains": [ - { - "name": "PS50311", - "regions": [ - { - "end": 264, - "start": 187 - } - ] - }, - { - "name": "SSF57184", - "regions": [ - { - "end": 339, - "start": 182 - }, - { - "end": 624, - "start": 505 - } - ] - }, - { - "name": "SSF52058", - "regions": [ - { - "end": 211, - "start": 29 - }, - { - "end": 520, - "start": 328 - } - ] - }, - { - "name": "PF00757", - "regions": [ - { - "end": 338, - "start": 185 - } - ] - }, - { - "name": "SM00261", - "regions": [ - { - "end": 270, - "start": 228 - }, - { - "end": 547, - "start": 496 - }, - { - "end": 601, - "start": 552 - } - ] - }, - { - "name": "PF01030", - "regions": [ - { - "end": 167, - "start": 57 - }, - { - "end": 480, - "start": 361 - } - ] - } - ], - "end": 55236328, - "exons": [ - { - "end": 55087058, - "name": null, - "start": 55086725 - }, - { - "end": 55210130, - "name": null, - "start": 55209979 - }, - { - "end": 55211181, - "name": null, - "start": 55210998 - }, - { - "end": 55214433, - "name": null, - "start": 55214299 - }, - { - "end": 55219055, - "name": null, - "start": 55218987 - }, - { - "end": 55220357, - "name": null, - "start": 55220239 - }, - { - "end": 55221845, - "name": null, - "start": 55221704 - }, - { - "end": 55223639, - "name": null, - "start": 55223523 - }, - { - "end": 55224352, - "name": null, - "start": 55224226 - }, - { - "end": 55224525, - "name": null, - "start": 55224452 - }, - { - "end": 55225446, - "name": null, - "start": 55225356 - }, - { - "end": 55228031, - "name": null, - "start": 55227832 - }, - { - "end": 55229324, - "name": null, - "start": 55229192 - }, - { - "end": 55231516, - "name": null, - "start": 
55231426 - }, - { - "end": 55233130, - "name": null, - "start": 55232973 - }, - { - "end": 55236328, - "name": null, - "start": 55236216 - } - ], - "is_best_transcript": false, - "name": "ENST00000342916", - "start": 55086725 - }, - { - "cdna_coding_end": 2363, - "cdna_coding_start": 246, - "domains": [ - { - "name": "SSF57184", - "regions": [ - { - "end": 339, - "start": 182 - }, - { - "end": 624, - "start": 505 - } - ] - }, - { - "name": "PS50311", - "regions": [ - { - "end": 264, - "start": 187 - } - ] - }, - { - "name": "PF00757", - "regions": [ - { - "end": 338, - "start": 185 - } - ] - }, - { - "name": "SSF52058", - "regions": [ - { - "end": 211, - "start": 29 - }, - { - "end": 520, - "start": 328 - } - ] - }, - { - "name": "SM00261", - "regions": [ - { - "end": 270, - "start": 228 - }, - { - "end": 547, - "start": 496 - }, - { - "end": 601, - "start": 552 - } - ] - }, - { - "name": "PF01030", - "regions": [ - { - "end": 167, - "start": 57 - }, - { - "end": 480, - "start": 361 - } - ] - } - ], - "end": 55238738, - "exons": [ - { - "end": 55087058, - "name": null, - "start": 55086726 - }, - { - "end": 55210130, - "name": null, - "start": 55209979 - }, - { - "end": 55211181, - "name": null, - "start": 55210998 - }, - { - "end": 55214433, - "name": null, - "start": 55214299 - }, - { - "end": 55219055, - "name": null, - "start": 55218987 - }, - { - "end": 55220357, - "name": null, - "start": 55220239 - }, - { - "end": 55221845, - "name": null, - "start": 55221704 - }, - { - "end": 55223639, - "name": null, - "start": 55223523 - }, - { - "end": 55224352, - "name": null, - "start": 55224226 - }, - { - "end": 55224525, - "name": null, - "start": 55224452 - }, - { - "end": 55225446, - "name": null, - "start": 55225356 - }, - { - "end": 55228031, - "name": null, - "start": 55227832 - }, - { - "end": 55229324, - "name": null, - "start": 55229192 - }, - { - "end": 55231516, - "name": null, - "start": 55231426 - }, - { - "end": 55233130, - "name": null, - "start": 
55232973 - }, - { - "end": 55238738, - "name": null, - "start": 55238000 - } - ], - "is_best_transcript": false, - "name": "ENST00000344576", - "start": 55086726 - }, - { - "cdna_coding_end": 1462, - "cdna_coding_start": 245, - "domains": [ - { - "name": "SSF57184", - "regions": [ - { - "end": 339, - "start": 182 - } - ] - }, - { - "name": "PS50311", - "regions": [ - { - "end": 264, - "start": 187 - } - ] - }, - { - "name": "PF00757", - "regions": [ - { - "end": 338, - "start": 185 - } - ] - }, - { - "name": "SSF52058", - "regions": [ - { - "end": 211, - "start": 29 - }, - { - "end": 403, - "start": 328 - } - ] - }, - { - "name": "SM00261", - "regions": [ - { - "end": 270, - "start": 228 - } - ] - }, - { - "name": "PF01030", - "regions": [ - { - "end": 167, - "start": 57 - } - ] - } - ], - "end": 55224644, - "exons": [ - { - "end": 55087058, - "name": null, - "start": 55086727 - }, - { - "end": 55210130, - "name": null, - "start": 55209979 - }, - { - "end": 55211181, - "name": null, - "start": 55210998 - }, - { - "end": 55214433, - "name": null, - "start": 55214299 - }, - { - "end": 55219055, - "name": null, - "start": 55218987 - }, - { - "end": 55220357, - "name": null, - "start": 55220239 - }, - { - "end": 55221845, - "name": null, - "start": 55221704 - }, - { - "end": 55223639, - "name": null, - "start": 55223523 - }, - { - "end": 55224352, - "name": null, - "start": 55224226 - }, - { - "end": 55224644, - "name": null, - "start": 55224452 - } - ], - "is_best_transcript": false, - "name": "ENST00000420316", - "start": 55086727 - }, - { - "cdna_coding_end": 3810, - "cdna_coding_start": 178, - "domains": [ - { - "name": "SM00220", - "regions": [ - { - "end": 969, - "start": 712 - } - ] - }, - { - "name": "PF01030", - "regions": [ - { - "end": 167, - "start": 57 - }, - { - "end": 480, - "start": 361 - } - ] - }, - { - "name": "SSF56112", - "regions": [ - { - "end": 1020, - "start": 696 - } - ] - }, - { - "name": "PF00069", - "regions": [ - { - "end": 964, - "start": 
712 - } - ] - }, - { - "name": "SM00219", - "regions": [ - { - "end": 968, - "start": 712 - } - ] - }, - { - "name": "SM00261", - "regions": [ - { - "end": 270, - "start": 228 - }, - { - "end": 547, - "start": 496 - }, - { - "end": 601, - "start": 552 - } - ] - }, - { - "name": "PF00757", - "regions": [ - { - "end": 338, - "start": 185 - } - ] - }, - { - "name": "SSF52058", - "regions": [ - { - "end": 211, - "start": 29 - }, - { - "end": 520, - "start": 328 - } - ] - }, - { - "name": "PF07714", - "regions": [ - { - "end": 965, - "start": 714 - } - ] - }, - { - "name": "PIRSF000619", - "regions": [ - { - "end": 1210, - "start": 1 - } - ] - }, - { - "name": "PR00109", - "regions": [ - { - "end": 803, - "start": 790 - }, - { - "end": 845, - "start": 827 - }, - { - "end": 886, - "start": 876 - }, - { - "end": 917, - "start": 895 - }, - { - "end": 961, - "start": 939 - } - ] - }, - { - "name": "SSF57184", - "regions": [ - { - "end": 339, - "start": 182 - }, - { - "end": 638, - "start": 505 - } - ] - }, - { - "name": "PS50311", - "regions": [ - { - "end": 264, - "start": 187 - } - ] - }, - { - "name": "PS50011", - "regions": [ - { - "end": 979, - "start": 712 - } - ] - } - ], - "end": 55279321, - "exons": [ - { - "end": 55087058, - "name": null, - "start": 55086794 - }, - { - "end": 55210130, - "name": null, - "start": 55209979 - }, - { - "end": 55211181, - "name": null, - "start": 55210998 - }, - { - "end": 55214433, - "name": null, - "start": 55214299 - }, - { - "end": 55219055, - "name": null, - "start": 55218987 - }, - { - "end": 55220357, - "name": null, - "start": 55220239 - }, - { - "end": 55221845, - "name": null, - "start": 55221704 - }, - { - "end": 55223639, - "name": null, - "start": 55223523 - }, - { - "end": 55224352, - "name": null, - "start": 55224226 - }, - { - "end": 55224525, - "name": null, - "start": 55224452 - }, - { - "end": 55225446, - "name": null, - "start": 55225356 - }, - { - "end": 55228031, - "name": null, - "start": 55227832 - }, - { - 
"end": 55229324, - "name": null, - "start": 55229192 - }, - { - "end": 55231516, - "name": null, - "start": 55231426 - }, - { - "end": 55233130, - "name": null, - "start": 55232973 - }, - { - "end": 55238906, - "name": null, - "start": 55238868 - }, - { - "end": 55240817, - "name": null, - "start": 55240676 - }, - { - "end": 55241736, - "name": null, - "start": 55241614 - }, - { - "end": 55242513, - "name": null, - "start": 55242415 - }, - { - "end": 55249171, - "name": null, - "start": 55248986 - }, - { - "end": 55259567, - "name": null, - "start": 55259412 - }, - { - "end": 55260534, - "name": null, - "start": 55260459 - }, - { - "end": 55266556, - "name": null, - "start": 55266410 - }, - { - "end": 55268106, - "name": null, - "start": 55268009 - }, - { - "end": 55269048, - "name": null, - "start": 55268881 - }, - { - "end": 55269475, - "name": null, - "start": 55269428 - }, - { - "end": 55270318, - "name": null, - "start": 55270210 - }, - { - "end": 55279321, - "name": null, - "start": 55272949 - } - ], - "is_best_transcript": true, - "name": "ENST00000275493", - "start": 55086794 - }, - { - "cdna_coding_end": 2134, - "cdna_coding_start": 161, - "domains": [ - { - "name": "PF01030", - "regions": [ - { - "end": 167, - "start": 57 - }, - { - "end": 480, - "start": 361 - } - ] - }, - { - "name": "SM00261", - "regions": [ - { - "end": 270, - "start": 228 - }, - { - "end": 547, - "start": 496 - }, - { - "end": 601, - "start": 552 - }, - { - "end": 653, - "start": 614 - } - ] - }, - { - "name": "SSF52058", - "regions": [ - { - "end": 211, - "start": 29 - }, - { - "end": 520, - "start": 328 - } - ] - }, - { - "name": "PF00757", - "regions": [ - { - "end": 338, - "start": 185 - } - ] - }, - { - "name": "PS50311", - "regions": [ - { - "end": 264, - "start": 187 - } - ] - }, - { - "name": "SSF57184", - "regions": [ - { - "end": 339, - "start": 182 - }, - { - "end": 638, - "start": 505 - } - ] - } - ], - "end": 55324313, - "exons": [ - { - "end": 55087058, - "name": null, 
- "start": 55086811 - }, - { - "end": 55210130, - "name": null, - "start": 55209979 - }, - { - "end": 55211181, - "name": null, - "start": 55210998 - }, - { - "end": 55214433, - "name": null, - "start": 55214299 - }, - { - "end": 55219055, - "name": null, - "start": 55218987 - }, - { - "end": 55220357, - "name": null, - "start": 55220239 - }, - { - "end": 55221845, - "name": null, - "start": 55221704 - }, - { - "end": 55223639, - "name": null, - "start": 55223523 - }, - { - "end": 55224352, - "name": null, - "start": 55224226 - }, - { - "end": 55224525, - "name": null, - "start": 55224452 - }, - { - "end": 55225446, - "name": null, - "start": 55225356 - }, - { - "end": 55228031, - "name": null, - "start": 55227832 - }, - { - "end": 55229324, - "name": null, - "start": 55229192 - }, - { - "end": 55231516, - "name": null, - "start": 55231426 - }, - { - "end": 55233130, - "name": null, - "start": 55232973 - }, - { - "end": 55238906, - "name": null, - "start": 55238868 - }, - { - "end": 55240621, - "name": null, - "start": 55240539 - }, - { - "end": 55324313, - "name": null, - "start": 55323947 - } - ], - "is_best_transcript": false, - "name": "ENST00000442591", - "start": 55086811 - }, - { - "cdna_coding_end": 691, - "cdna_coding_start": 308, - "domains": [ - { - "name": "SSF52058", - "regions": [ - { - "end": 127, - "start": 1 - } - ] - }, - { - "name": "PF01030", - "regions": [ - { - "end": 114, - "start": 4 - } - ] - } - ], - "end": 55214417, - "exons": [ - { - "end": 55177651, - "name": null, - "start": 55177416 - }, - { - "end": 55210130, - "name": null, - "start": 55209979 - }, - { - "end": 55211181, - "name": null, - "start": 55210998 - }, - { - "end": 55214417, - "name": null, - "start": 55214299 - } - ], - "is_best_transcript": false, - "name": "ENST00000450046", - "start": 55177416 - }, - { - "cdna_coding_end": 3657, - "cdna_coding_start": 184, - "domains": [ - { - "name": "SM00261", - "regions": [ - { - "end": 217, - "start": 175 - }, - { - "end": 494, - 
"start": 443 - }, - { - "end": 548, - "start": 499 - } - ] - }, - { - "name": "PF00069", - "regions": [ - { - "end": 911, - "start": 659 - } - ] - }, - { - "name": "SM00219", - "regions": [ - { - "end": 915, - "start": 659 - } - ] - }, - { - "name": "SSF56112", - "regions": [ - { - "end": 967, - "start": 643 - } - ] - }, - { - "name": "SM00220", - "regions": [ - { - "end": 916, - "start": 659 - } - ] - }, - { - "name": "PF01030", - "regions": [ - { - "end": 114, - "start": 4 - }, - { - "end": 427, - "start": 308 - } - ] - }, - { - "name": "PS50311", - "regions": [ - { - "end": 211, - "start": 134 - } - ] - }, - { - "name": "PS50011", - "regions": [ - { - "end": 926, - "start": 659 - } - ] - }, - { - "name": "PR00109", - "regions": [ - { - "end": 750, - "start": 737 - }, - { - "end": 792, - "start": 774 - }, - { - "end": 833, - "start": 823 - }, - { - "end": 864, - "start": 842 - }, - { - "end": 908, - "start": 886 - } - ] - }, - { - "name": "SSF57184", - "regions": [ - { - "end": 286, - "start": 129 - }, - { - "end": 585, - "start": 452 - } - ] - }, - { - "name": "PIRSF000619", - "regions": [ - { - "end": 1157, - "start": 1 - } - ] - }, - { - "name": "PF07714", - "regions": [ - { - "end": 912, - "start": 661 - } - ] - }, - { - "name": "SSF52058", - "regions": [ - { - "end": 158, - "start": 1 - }, - { - "end": 467, - "start": 275 - } - ] - }, - { - "name": "PF00757", - "regions": [ - { - "end": 285, - "start": 132 - } - ] - } - ], - "end": 55273591, - "exons": [ - { - "end": 55177651, - "name": null, - "start": 55177540 - }, - { - "end": 55210130, - "name": null, - "start": 55209979 - }, - { - "end": 55211181, - "name": null, - "start": 55210998 - }, - { - "end": 55214433, - "name": null, - "start": 55214299 - }, - { - "end": 55219055, - "name": null, - "start": 55218987 - }, - { - "end": 55220357, - "name": null, - "start": 55220239 - }, - { - "end": 55221845, - "name": null, - "start": 55221704 - }, - { - "end": 55223639, - "name": null, - "start": 55223523 - }, - 
{ - "end": 55224352, - "name": null, - "start": 55224226 - }, - { - "end": 55224525, - "name": null, - "start": 55224452 - }, - { - "end": 55225446, - "name": null, - "start": 55225356 - }, - { - "end": 55228031, - "name": null, - "start": 55227832 - }, - { - "end": 55229324, - "name": null, - "start": 55229192 - }, - { - "end": 55231516, - "name": null, - "start": 55231426 - }, - { - "end": 55233130, - "name": null, - "start": 55232973 - }, - { - "end": 55238906, - "name": null, - "start": 55238868 - }, - { - "end": 55240817, - "name": null, - "start": 55240676 - }, - { - "end": 55241736, - "name": null, - "start": 55241614 - }, - { - "end": 55242513, - "name": null, - "start": 55242415 - }, - { - "end": 55249171, - "name": null, - "start": 55248986 - }, - { - "end": 55259567, - "name": null, - "start": 55259412 - }, - { - "end": 55260534, - "name": null, - "start": 55260459 - }, - { - "end": 55266556, - "name": null, - "start": 55266410 - }, - { - "end": 55268106, - "name": null, - "start": 55268009 - }, - { - "end": 55269048, - "name": null, - "start": 55268881 - }, - { - "end": 55269475, - "name": null, - "start": 55269428 - }, - { - "end": 55270318, - "name": null, - "start": 55270210 - }, - { - "end": 55273591, - "name": null, - "start": 55272949 - } - ], - "is_best_transcript": false, - "name": "ENST00000454757", - "start": 55177540 - } - ] - }, - { - "aliases": [ - "DSTYK" - ], - "chr": "1", - "end": 205180727, - "name": "ENSG00000133059", - "start": 205111632, - "strand": "-", - "transcripts": [ - { - "cdna_coding_end": 1831, - "cdna_coding_start": 65, - "domains": [ - { - "name": "SM00220", - "regions": [ - { - "end": 565, - "start": 337 - } - ] - }, - { - "name": "SSF56112", - "regions": [ - { - "end": 585, - "start": 452 - } - ] - }, - { - "name": "PF00069", - "regions": [ - { - "end": 556, - "start": 451 - } - ] - }, - { - "name": "PF07714", - "regions": [ - { - "end": 558, - "start": 471 - } - ] - }, - { - "name": "PS50011", - "regions": [ - { - 
"end": 565, - "start": 312 - } - ] - } - ], - "end": 205180727, - "exons": [ - { - "end": 205116873, - "name": null, - "start": 205111632 - }, - { - "end": 205117467, - "name": null, - "start": 205117333 - }, - { - "end": 205119898, - "name": null, - "start": 205119808 - }, - { - "end": 205133083, - "name": null, - "start": 205133055 - }, - { - "end": 205138960, - "name": null, - "start": 205138291 - }, - { - "end": 205156934, - "name": null, - "start": 205156546 - }, - { - "end": 205180727, - "name": null, - "start": 205180399 - } - ], - "is_best_transcript": false, - "name": "ENST00000367160", - "start": 205111632 - }, - { - "cdna_coding_end": 2686, - "cdna_coding_start": 32, - "domains": [ - { - "name": "PF07714", - "regions": [ - { - "end": 820, - "start": 654 - } - ] - }, - { - "name": "PS50011", - "regions": [ - { - "end": 884, - "start": 652 - } - ] - }, - { - "name": "SSF56112", - "regions": [ - { - "end": 853, - "start": 627 - } - ] - }, - { - "name": "SM00220", - "regions": [ - { - "end": 861, - "start": 652 - } - ] - }, - { - "name": "PF00069", - "regions": [ - { - "end": 824, - "start": 654 - } - ] - }, - { - "name": "SM00219", - "regions": [ - { - "end": 861, - "start": 652 - } - ] - } - ], - "end": 205180694, - "exons": [ - { - "end": 205116873, - "name": null, - "start": 205111633 - }, - { - "end": 205119922, - "name": null, - "start": 205119808 - }, - { - "end": 205126514, - "name": null, - "start": 205126401 - }, - { - "end": 205128807, - "name": null, - "start": 205128675 - }, - { - "end": 205129398, - "name": null, - "start": 205129242 - }, - { - "end": 205130515, - "name": null, - "start": 205130386 - }, - { - "end": 205131340, - "name": null, - "start": 205131164 - }, - { - "end": 205132134, - "name": null, - "start": 205132051 - }, - { - "end": 205133083, - "name": null, - "start": 205132851 - }, - { - "end": 205138960, - "name": null, - "start": 205138291 - }, - { - "end": 205156934, - "name": null, - "start": 205156546 - }, - { - "end": 
205180694, - "name": null, - "start": 205180399 - } - ], - "is_best_transcript": false, - "name": "ENST00000367161", - "start": 205111633 - }, - { - "cdna_coding_end": 2821, - "cdna_coding_start": 32, - "domains": [ - { - "name": "PF07714", - "regions": [ - { - "end": 899, - "start": 654 - } - ] - }, - { - "name": "PS50011", - "regions": [ - { - "end": 906, - "start": 652 - } - ] - }, - { - "name": "SSF56112", - "regions": [ - { - "end": 897, - "start": 638 - } - ] - }, - { - "name": "SM00220", - "regions": [ - { - "end": 906, - "start": 652 - } - ] - }, - { - "name": "SM00219", - "regions": [ - { - "end": 906, - "start": 652 - } - ] - }, - { - "name": "PF00069", - "regions": [ - { - "end": 897, - "start": 654 - } - ] - } - ], - "end": 205180694, - "exons": [ - { - "end": 205116873, - "name": null, - "start": 205111633 - }, - { - "end": 205117467, - "name": null, - "start": 205117333 - }, - { - "end": 205119922, - "name": null, - "start": 205119808 - }, - { - "end": 205126514, - "name": null, - "start": 205126401 - }, - { - "end": 205128807, - "name": null, - "start": 205128675 - }, - { - "end": 205129398, - "name": null, - "start": 205129242 - }, - { - "end": 205130515, - "name": null, - "start": 205130386 - }, - { - "end": 205131340, - "name": null, - "start": 205131164 - }, - { - "end": 205132134, - "name": null, - "start": 205132051 - }, - { - "end": 205133083, - "name": null, - "start": 205132851 - }, - { - "end": 205138960, - "name": null, - "start": 205138291 - }, - { - "end": 205156934, - "name": null, - "start": 205156546 - }, - { - "end": 205180694, - "name": null, - "start": 205180399 - } - ], - "is_best_transcript": true, - "name": "ENST00000367162", - "start": 205111633 - } - ] - }, - { - "aliases": [ - "NDUFA12" - ], - "chr": "12", - "end": 95397546, - "name": "ENSG00000184752", - "start": 95290831, - "strand": "-", - "transcripts": [ - { - "domains": [], - "end": 95397436, - "exons": [ - { - "end": 95291086, - "name": null, - "start": 95290831 - }, - 
{ - "end": 95318582, - "name": null, - "start": 95318422 - }, - { - "end": 95322039, - "name": null, - "start": 95321793 - }, - { - "end": 95396597, - "name": null, - "start": 95396515 - }, - { - "end": 95397436, - "name": null, - "start": 95397371 - } - ], - "is_best_transcript": false, - "name": "ENST00000552205", - "start": 95290831 - }, - { - "cdna_coding_end": 188, - "cdna_coding_start": 21, - "domains": [], - "end": 95397476, - "exons": [ - { - "end": 95365261, - "name": null, - "start": 95365108 - }, - { - "end": 95396597, - "name": null, - "start": 95396582 - }, - { - "end": 95397476, - "name": null, - "start": 95397371 - } - ], - "is_best_transcript": false, - "name": "ENST00000547157", - "start": 95365108 - }, - { - "cdna_coding_end": 144, - "cdna_coding_start": 1, - "domains": [ - { - "name": "PF05071", - "regions": [ - { - "end": 33, - "start": 12 - } - ] - } - ], - "end": 95397384, - "exons": [ - { - "end": 95365396, - "name": null, - "start": 95365109 - }, - { - "end": 95388033, - "name": null, - "start": 95387946 - }, - { - "end": 95390752, - "name": null, - "start": 95390680 - }, - { - "end": 95396597, - "name": null, - "start": 95396515 - }, - { - "end": 95397384, - "name": null, - "start": 95397371 - } - ], - "is_best_transcript": false, - "name": "ENST00000551991", - "start": 95365109 - }, - { - "cdna_coding_end": 528, - "cdna_coding_start": 91, - "domains": [ - { - "name": "PF05071", - "regions": [ - { - "end": 137, - "start": 36 - } - ] - } - ], - "end": 95397546, - "exons": [ - { - "end": 95365396, - "name": null, - "start": 95365109 - }, - { - "end": 95388033, - "name": null, - "start": 95387946 - }, - { - "end": 95396597, - "name": null, - "start": 95396515 - }, - { - "end": 95397546, - "name": null, - "start": 95397371 - } - ], - "is_best_transcript": true, - "name": "ENST00000327772", - "start": 95365109 - }, - { - "cdna_coding_end": 225, - "cdna_coding_start": 34, - "domains": [ - { - "name": "PF05071", - "regions": [ - { - "end": 53, - 
"start": 36 - } - ] - } - ], - "end": 95397489, - "exons": [ - { - "end": 95365396, - "name": null, - "start": 95365112 - }, - { - "end": 95396597, - "name": null, - "start": 95396515 - }, - { - "end": 95397489, - "name": null, - "start": 95397371 - } - ], - "is_best_transcript": false, - "name": "ENST00000547986", - "start": 95365112 - }, - { - "cdna_coding_end": 368, - "cdna_coding_start": 69, - "domains": [ - { - "name": "PF05071", - "regions": [ - { - "end": 87, - "start": 36 - } - ] - } - ], - "end": 95397524, - "exons": [ - { - "end": 95365396, - "name": null, - "start": 95365254 - }, - { - "end": 95366265, - "name": null, - "start": 95366171 - }, - { - "end": 95388033, - "name": null, - "start": 95387946 - }, - { - "end": 95396597, - "name": null, - "start": 95396515 - }, - { - "end": 95397524, - "name": null, - "start": 95397371 - } - ], - "is_best_transcript": false, - "name": "ENST00000546788", - "start": 95365254 - } - ] - }, - { - "aliases": [ - "FRMD6" - ], - "chr": "14", - "end": 52197445, - "name": "ENSG00000139926", - "start": 51955818, - "strand": "+", - "transcripts": [ - { - "cdna_coding_end": 2338, - "cdna_coding_start": 494, - "domains": [ - { - "name": "PF09379", - "regions": [ - { - "end": 109, - "start": 20 - } - ] - }, - { - "name": "PF09380", - "regions": [ - { - "end": 322, - "start": 237 - } - ] - }, - { - "name": "SSF50729", - "regions": [ - { - "end": 375, - "start": 219 - } - ] - }, - { - "name": "SM00295", - "regions": [ - { - "end": 226, - "start": 12 - } - ] - }, - { - "name": "PS50057", - "regions": [ - { - "end": 320, - "start": 16 - } - ] - }, - { - "name": "PF00373", - "regions": [ - { - "end": 226, - "start": 115 - } - ] - }, - { - "name": "SSF47031", - "regions": [ - { - "end": 218, - "start": 110 - } - ] - }, - { - "name": "SSF54236", - "regions": [ - { - "end": 110, - "start": 14 - } - ] - } - ], - "end": 52197177, - "exons": [ - { - "end": 51956138, - "name": null, - "start": 51955855 - }, - { - "end": 52037128, - "name": 
null, - "start": 52037066 - }, - { - "end": 52156653, - "name": null, - "start": 52156409 - }, - { - "end": 52164950, - "name": null, - "start": 52164860 - }, - { - "end": 52167853, - "name": null, - "start": 52167774 - }, - { - "end": 52169306, - "name": null, - "start": 52169230 - }, - { - "end": 52171653, - "name": null, - "start": 52171467 - }, - { - "end": 52174951, - "name": null, - "start": 52174796 - }, - { - "end": 52178314, - "name": null, - "start": 52178249 - }, - { - "end": 52179269, - "name": null, - "start": 52179201 - }, - { - "end": 52182217, - "name": null, - "start": 52182043 - }, - { - "end": 52187108, - "name": null, - "start": 52186773 - }, - { - "end": 52188798, - "name": null, - "start": 52188667 - }, - { - "end": 52192588, - "name": null, - "start": 52192497 - }, - { - "end": 52197177, - "name": null, - "start": 52194463 - } - ], - "is_best_transcript": false, - "name": "ENST00000356218", - "start": 51955855 - }, - { - "cdna_coding_end": 2130, - "cdna_coding_start": 286, - "domains": [ - { - "name": "PF00373", - "regions": [ - { - "end": 226, - "start": 115 - } - ] - }, - { - "name": "SSF47031", - "regions": [ - { - "end": 218, - "start": 110 - } - ] - }, - { - "name": "SSF54236", - "regions": [ - { - "end": 110, - "start": 14 - } - ] - }, - { - "name": "PS50057", - "regions": [ - { - "end": 320, - "start": 16 - } - ] - }, - { - "name": "SM00295", - "regions": [ - { - "end": 226, - "start": 12 - } - ] - }, - { - "name": "SSF50729", - "regions": [ - { - "end": 375, - "start": 219 - } - ] - }, - { - "name": "PF09380", - "regions": [ - { - "end": 322, - "start": 237 - } - ] - }, - { - "name": "PF09379", - "regions": [ - { - "end": 109, - "start": 20 - } - ] - } - ], - "end": 52197445, - "exons": [ - { - "end": 52118714, - "name": null, - "start": 52118576 - }, - { - "end": 52156653, - "name": null, - "start": 52156409 - }, - { - "end": 52164950, - "name": null, - "start": 52164860 - }, - { - "end": 52167853, - "name": null, - "start": 52167774 
- }, - { - "end": 52169306, - "name": null, - "start": 52169230 - }, - { - "end": 52171653, - "name": null, - "start": 52171467 - }, - { - "end": 52174951, - "name": null, - "start": 52174796 - }, - { - "end": 52178314, - "name": null, - "start": 52178249 - }, - { - "end": 52179269, - "name": null, - "start": 52179201 - }, - { - "end": 52182217, - "name": null, - "start": 52182043 - }, - { - "end": 52187108, - "name": null, - "start": 52186773 - }, - { - "end": 52188798, - "name": null, - "start": 52188667 - }, - { - "end": 52192588, - "name": null, - "start": 52192497 - }, - { - "end": 52197445, - "name": null, - "start": 52194463 - } - ], - "is_best_transcript": true, - "name": "ENST00000395718", - "start": 52118576 - }, - { - "cdna_coding_end": 2065, - "cdna_coding_start": 197, - "domains": [ - { - "name": "PF09380", - "regions": [ - { - "end": 330, - "start": 245 - } - ] - }, - { - "name": "PF09379", - "regions": [ - { - "end": 117, - "start": 20 - } - ] - }, - { - "name": "SSF47031", - "regions": [ - { - "end": 226, - "start": 118 - } - ] - }, - { - "name": "PF00373", - "regions": [ - { - "end": 234, - "start": 123 - } - ] - }, - { - "name": "SSF54236", - "regions": [ - { - "end": 118, - "start": 14 - } - ] - }, - { - "name": "PS50057", - "regions": [ - { - "end": 328, - "start": 16 - } - ] - }, - { - "name": "SM00295", - "regions": [ - { - "end": 234, - "start": 12 - } - ] - }, - { - "name": "SSF50729", - "regions": [ - { - "end": 383, - "start": 227 - } - ] - } - ], - "end": 52195654, - "exons": [ - { - "end": 52118714, - "name": null, - "start": 52118665 - }, - { - "end": 52156653, - "name": null, - "start": 52156409 - }, - { - "end": 52164950, - "name": null, - "start": 52164860 - }, - { - "end": 52167877, - "name": null, - "start": 52167774 - }, - { - "end": 52169306, - "name": null, - "start": 52169230 - }, - { - "end": 52171653, - "name": null, - "start": 52171467 - }, - { - "end": 52174951, - "name": null, - "start": 52174796 - }, - { - "end": 
52178314, - "name": null, - "start": 52178249 - }, - { - "end": 52179269, - "name": null, - "start": 52179201 - }, - { - "end": 52182217, - "name": null, - "start": 52182043 - }, - { - "end": 52187108, - "name": null, - "start": 52186773 - }, - { - "end": 52188798, - "name": null, - "start": 52188667 - }, - { - "end": 52192588, - "name": null, - "start": 52192497 - }, - { - "end": 52195654, - "name": null, - "start": 52194463 - } - ], - "is_best_transcript": false, - "name": "ENST00000344768", - "start": 52118665 - }, - { - "domains": [], - "end": 52164945, - "exons": [ - { - "end": 52118935, - "name": null, - "start": 52118698 - }, - { - "end": 52156653, - "name": null, - "start": 52156409 - }, - { - "end": 52164945, - "name": null, - "start": 52164860 - } - ], - "is_best_transcript": false, - "name": "ENST00000554778", - "start": 52118698 - }, - { - "domains": [], - "end": 52174806, - "exons": [ - { - "end": 52164950, - "name": null, - "start": 52164706 - }, - { - "end": 52167877, - "name": null, - "start": 52167774 - }, - { - "end": 52169306, - "name": null, - "start": 52169230 - }, - { - "end": 52171653, - "name": null, - "start": 52171467 - }, - { - "end": 52174806, - "name": null, - "start": 52174796 - } - ], - "is_best_transcript": false, - "name": "ENST00000555936", - "start": 52164706 - }, - { - "cdna_coding_end": 1775, - "cdna_coding_start": 138, - "domains": [ - { - "name": "SSF50729", - "regions": [ - { - "end": 306, - "start": 150 - } - ] - }, - { - "name": "PS50057", - "regions": [ - { - "end": 251, - "start": 1 - } - ] - }, - { - "name": "SSF54236", - "regions": [ - { - "end": 41, - "start": 1 - } - ] - }, - { - "name": "SSF47031", - "regions": [ - { - "end": 149, - "start": 41 - } - ] - }, - { - "name": "PF00373", - "regions": [ - { - "end": 157, - "start": 46 - } - ] - }, - { - "name": "PF09380", - "regions": [ - { - "end": 253, - "start": 168 - } - ] - } - ], - "end": 52197148, - "exons": [ - { - "end": 52164950, - "name": null, - "start": 
52164831 - }, - { - "end": 52167853, - "name": null, - "start": 52167774 - }, - { - "end": 52169306, - "name": null, - "start": 52169230 - }, - { - "end": 52171653, - "name": null, - "start": 52171467 - }, - { - "end": 52174951, - "name": null, - "start": 52174796 - }, - { - "end": 52178314, - "name": null, - "start": 52178249 - }, - { - "end": 52179269, - "name": null, - "start": 52179201 - }, - { - "end": 52182217, - "name": null, - "start": 52182043 - }, - { - "end": 52187108, - "name": null, - "start": 52186773 - }, - { - "end": 52188798, - "name": null, - "start": 52188667 - }, - { - "end": 52192588, - "name": null, - "start": 52192497 - }, - { - "end": 52197148, - "name": null, - "start": 52194463 - } - ], - "is_best_transcript": false, - "name": "ENST00000554167", - "start": 52164831 - }, - { - "cdna_coding_end": 390, - "cdna_coding_start": 1, - "domains": [ - { - "name": "PS50057", - "regions": [ - { - "end": 129, - "start": 1 - } - ] - }, - { - "name": "PF00373", - "regions": [ - { - "end": 124, - "start": 13 - } - ] - }, - { - "name": "SSF47031", - "regions": [ - { - "end": 116, - "start": 8 - } - ] - } - ], - "end": 52175062, - "exons": [ - { - "end": 52169306, - "name": null, - "start": 52169266 - }, - { - "end": 52171653, - "name": null, - "start": 52171467 - }, - { - "end": 52175062, - "name": null, - "start": 52174796 - } - ], - "is_best_transcript": false, - "name": "ENST00000557405", - "start": 52169266 - }, - { - "cdna_coding_end": 618, - "cdna_coding_start": 1, - "domains": [ - { - "name": "PF09380", - "regions": [ - { - "end": 60, - "start": 2 - } - ] - }, - { - "name": "PS50057", - "regions": [ - { - "end": 58, - "start": 1 - } - ] - }, - { - "name": "SSF50729", - "regions": [ - { - "end": 113, - "start": 2 - } - ] - } - ], - "end": 52187243, - "exons": [ - { - "end": 52179269, - "name": null, - "start": 52179231 - }, - { - "end": 52182217, - "name": null, - "start": 52182043 - }, - { - "end": 52187243, - "name": null, - "start": 52186773 - } - 
], - "is_best_transcript": false, - "name": "ENST00000555197", - "start": 52179231 - }, - { - "cdna_coding_end": 573, - "cdna_coding_start": 145, - "domains": [], - "end": 52192513, - "exons": [ - { - "end": 52184066, - "name": null, - "start": 52183973 - }, - { - "end": 52187108, - "name": null, - "start": 52186773 - }, - { - "end": 52188798, - "name": null, - "start": 52188673 - }, - { - "end": 52192513, - "name": null, - "start": 52192497 - } - ], - "is_best_transcript": false, - "name": "ENST00000555703", - "start": 52183973 - }, - { - "cdna_coding_end": 939, - "cdna_coding_start": 145, - "domains": [], - "end": 52195487, - "exons": [ - { - "end": 52184066, - "name": null, - "start": 52183973 - }, - { - "end": 52187108, - "name": null, - "start": 52186773 - }, - { - "end": 52188798, - "name": null, - "start": 52188667 - }, - { - "end": 52192588, - "name": null, - "start": 52192497 - }, - { - "end": 52195487, - "name": null, - "start": 52194463 - } - ], - "is_best_transcript": false, - "name": "ENST00000553556", - "start": 52183973 - } - ] - }, - { - "aliases": [ - "PRKCB" - ], - "chr": "16", - "end": 24231932, - "name": "ENSG00000166501", - "start": 23847322, - "strand": "+", - "transcripts": [ - { - "cdna_coding_end": 2191, - "cdna_coding_start": 176, - "domains": [ - { - "name": "SM00239", - "regions": [ - { - "end": 275, - "start": 172 - } - ] - }, - { - "name": "PF07714", - "regions": [ - { - "end": 583, - "start": 344 - } - ] - }, - { - "name": "SSF49562", - "regions": [ - { - "end": 288, - "start": 157 - } - ] - }, - { - "name": "SM00109", - "regions": [ - { - "end": 86, - "start": 37 - }, - { - "end": 151, - "start": 102 - } - ] - }, - { - "name": "PS50011", - "regions": [ - { - "end": 600, - "start": 342 - } - ] - }, - { - "name": "PR00008", - "regions": [ - { - "end": 48, - "start": 34 - }, - { - "end": 59, - "start": 50 - }, - { - "end": 74, - "start": 63 - }, - { - "end": 152, - "start": 140 - } - ] - }, - { - "name": "PF00433", - "regions": [ - { - 
"end": 666, - "start": 623 - } - ] - }, - { - "name": "SM00220", - "regions": [ - { - "end": 600, - "start": 342 - } - ] - }, - { - "name": "PF00168", - "regions": [ - { - "end": 259, - "start": 175 - } - ] - }, - { - "name": "SSF57889", - "regions": [ - { - "end": 92, - "start": 6 - }, - { - "end": 157, - "start": 101 - } - ] - }, - { - "name": "PF00130", - "regions": [ - { - "end": 87, - "start": 37 - }, - { - "end": 153, - "start": 102 - } - ] - }, - { - "name": "PS50081", - "regions": [ - { - "end": 86, - "start": 36 - }, - { - "end": 151, - "start": 101 - } - ] - }, - { - "name": "SSF56112", - "regions": [ - { - "end": 627, - "start": 317 - } - ] - }, - { - "name": "PF00069", - "regions": [ - { - "end": 586, - "start": 343 - } - ] - }, - { - "name": "SM00219", - "regions": [ - { - "end": 576, - "start": 342 - } - ] - }, - { - "name": "PR00360", - "regions": [ - { - "end": 200, - "start": 188 - }, - { - "end": 230, - "start": 217 - }, - { - "end": 248, - "start": 240 - } - ] - }, - { - "name": "SM00133", - "regions": [ - { - "end": 664, - "start": 601 - } - ] - }, - { - "name": "PS50004", - "regions": [ - { - "end": 260, - "start": 173 - } - ] - }, - { - "name": "PIRSF000550", - "regions": [ - { - "end": 671, - "start": 1 - } - ] - } - ], - "end": 24231932, - "exons": [ - { - "end": 23847669, - "name": null, - "start": 23847322 - }, - { - "end": 23848727, - "name": null, - "start": 23848696 - }, - { - "end": 23999911, - "name": null, - "start": 23999829 - }, - { - "end": 24043568, - "name": null, - "start": 24043457 - }, - { - "end": 24046868, - "name": null, - "start": 24046740 - }, - { - "end": 24104268, - "name": null, - "start": 24104112 - }, - { - "end": 24105618, - "name": null, - "start": 24105484 - }, - { - "end": 24124390, - "name": null, - "start": 24124294 - }, - { - "end": 24135302, - "name": null, - "start": 24135156 - }, - { - "end": 24166178, - "name": null, - "start": 24166005 - }, - { - "end": 24183682, - "name": null, - "start": 24183591 - }, 
- { - "end": 24185901, - "name": null, - "start": 24185839 - }, - { - "end": 24192249, - "name": null, - "start": 24192111 - }, - { - "end": 24196512, - "name": null, - "start": 24196432 - }, - { - "end": 24196888, - "name": null, - "start": 24196781 - }, - { - "end": 24202551, - "name": null, - "start": 24202411 - }, - { - "end": 24231932, - "name": null, - "start": 24231282 - } - ], - "is_best_transcript": true, - "name": "ENST00000321728", - "start": 23847322 - }, - { - "cdna_coding_end": 2174, - "cdna_coding_start": 153, - "domains": [ - { - "name": "SM00133", - "regions": [ - { - "end": 663, - "start": 601 - } - ] - }, - { - "name": "PS50004", - "regions": [ - { - "end": 260, - "start": 173 - } - ] - }, - { - "name": "PIRSF000550", - "regions": [ - { - "end": 672, - "start": 1 - } - ] - }, - { - "name": "PF00069", - "regions": [ - { - "end": 586, - "start": 343 - } - ] - }, - { - "name": "PR00360", - "regions": [ - { - "end": 200, - "start": 188 - }, - { - "end": 230, - "start": 217 - }, - { - "end": 248, - "start": 240 - } - ] - }, - { - "name": "SM00219", - "regions": [ - { - "end": 576, - "start": 342 - } - ] - }, - { - "name": "PS50081", - "regions": [ - { - "end": 86, - "start": 36 - }, - { - "end": 151, - "start": 101 - } - ] - }, - { - "name": "SSF56112", - "regions": [ - { - "end": 627, - "start": 317 - } - ] - }, - { - "name": "SM00220", - "regions": [ - { - "end": 600, - "start": 342 - } - ] - }, - { - "name": "PF00433", - "regions": [ - { - "end": 664, - "start": 627 - } - ] - }, - { - "name": "PF00130", - "regions": [ - { - "end": 87, - "start": 37 - }, - { - "end": 153, - "start": 102 - } - ] - }, - { - "name": "PF00168", - "regions": [ - { - "end": 259, - "start": 175 - } - ] - }, - { - "name": "SSF57889", - "regions": [ - { - "end": 92, - "start": 6 - }, - { - "end": 157, - "start": 101 - } - ] - }, - { - "name": "PR00008", - "regions": [ - { - "end": 48, - "start": 34 - }, - { - "end": 59, - "start": 50 - }, - { - "end": 74, - "start": 63 - }, 
- { - "end": 152, - "start": 140 - } - ] - }, - { - "name": "PS50011", - "regions": [ - { - "end": 600, - "start": 342 - } - ] - }, - { - "name": "SM00109", - "regions": [ - { - "end": 86, - "start": 37 - }, - { - "end": 151, - "start": 102 - } - ] - }, - { - "name": "PF07714", - "regions": [ - { - "end": 583, - "start": 344 - } - ] - }, - { - "name": "SSF49562", - "regions": [ - { - "end": 288, - "start": 157 - } - ] - }, - { - "name": "SM00239", - "regions": [ - { - "end": 275, - "start": 172 - } - ] - } - ], - "end": 24231932, - "exons": [ - { - "end": 23847669, - "name": null, - "start": 23847345 - }, - { - "end": 23848727, - "name": null, - "start": 23848696 - }, - { - "end": 23999911, - "name": null, - "start": 23999829 - }, - { - "end": 24043568, - "name": null, - "start": 24043457 - }, - { - "end": 24046868, - "name": null, - "start": 24046740 - }, - { - "end": 24104268, - "name": null, - "start": 24104112 - }, - { - "end": 24105618, - "name": null, - "start": 24105484 - }, - { - "end": 24124390, - "name": null, - "start": 24124294 - }, - { - "end": 24135302, - "name": null, - "start": 24135156 - }, - { - "end": 24166178, - "name": null, - "start": 24166005 - }, - { - "end": 24183682, - "name": null, - "start": 24183591 - }, - { - "end": 24185901, - "name": null, - "start": 24185839 - }, - { - "end": 24192249, - "name": null, - "start": 24192111 - }, - { - "end": 24196512, - "name": null, - "start": 24196432 - }, - { - "end": 24196888, - "name": null, - "start": 24196781 - }, - { - "end": 24202551, - "name": null, - "start": 24202411 - }, - { - "end": 24231932, - "name": null, - "start": 24225979 - } - ], - "is_best_transcript": false, - "name": "ENST00000303531", - "start": 23847345 - }, - { - "cdna_coding_end": 268, - "cdna_coding_start": 95, - "domains": [ - { - "name": "PR00008", - "regions": [ - { - "end": 48, - "start": 34 - }, - { - "end": 57, - "start": 50 - } - ] - }, - { - "name": "PS50081", - "regions": [ - { - "end": 57, - "start": 36 - } - ] - 
}, - { - "name": "SSF57889", - "regions": [ - { - "end": 57, - "start": 6 - } - ] - } - ], - "end": 23880647, - "exons": [ - { - "end": 23847669, - "name": null, - "start": 23847403 - }, - { - "end": 23880647, - "name": null, - "start": 23880435 - } - ], - "is_best_transcript": false, - "name": "ENST00000498058", - "start": 23847403 - }, - { - "domains": [], - "end": 24124386, - "exons": [ - { - "end": 23848727, - "name": null, - "start": 23848544 - }, - { - "end": 24104268, - "name": null, - "start": 24104112 - }, - { - "end": 24105618, - "name": null, - "start": 24105484 - }, - { - "end": 24124386, - "name": null, - "start": 24124294 - } - ], - "is_best_transcript": false, - "name": "ENST00000498739", - "start": 23848544 - }, - { - "domains": [], - "end": 24192166, - "exons": [ - { - "end": 24163176, - "name": null, - "start": 24163006 - }, - { - "end": 24166178, - "name": null, - "start": 24166005 - }, - { - "end": 24183682, - "name": null, - "start": 24183591 - }, - { - "end": 24185901, - "name": null, - "start": 24185839 - }, - { - "end": 24192166, - "name": null, - "start": 24192111 - } - ], - "is_best_transcript": false, - "name": "ENST00000472066", - "start": 24163006 - }, - { - "domains": [], - "end": 24202909, - "exons": [ - { - "end": 24196888, - "name": null, - "start": 24196852 - }, - { - "end": 24202909, - "name": null, - "start": 24202411 - } - ], - "is_best_transcript": false, - "name": "ENST00000466124", - "start": 24196852 - } - ] - }, - { - "aliases": [ - "GIMAP4" - ], - "chr": "7", - "end": 150271041, - "name": "ENSG00000133574", - "start": 150264365, - "strand": "+", - "transcripts": [ - { - "cdna_coding_end": 1165, - "cdna_coding_start": 176, - "domains": [ - { - "name": "PF04548", - "regions": [ - { - "end": 238, - "start": 31 - } - ] - }, - { - "name": "SSF52540", - "regions": [ - { - "end": 288, - "start": 24 - } - ] - } - ], - "end": 150271041, - "exons": [ - { - "end": 150264525, - "name": null, - "start": 150264365 - }, - { - "end": 
150267047, - "name": null, - "start": 150266976 - }, - { - "end": 150271041, - "name": null, - "start": 150269217 - } - ], - "is_best_transcript": true, - "name": "ENST00000255945", - "start": 150264365 - }, - { - "cdna_coding_end": 1115, - "cdna_coding_start": 84, - "domains": [ - { - "name": "PF04548", - "regions": [ - { - "end": 252, - "start": 45 - } - ] - }, - { - "name": "SSF52540", - "regions": [ - { - "end": 302, - "start": 38 - } - ] - } - ], - "end": 150270602, - "exons": [ - { - "end": 150264525, - "name": null, - "start": 150264457 - }, - { - "end": 150267089, - "name": null, - "start": 150266976 - }, - { - "end": 150270602, - "name": null, - "start": 150269217 - } - ], - "is_best_transcript": false, - "name": "ENST00000461940", - "start": 150264457 - }, - { - "cdna_coding_end": 552, - "cdna_coding_start": 100, - "domains": [ - { - "name": "SSF52540", - "regions": [ - { - "end": 151, - "start": 38 - } - ] - }, - { - "name": "PF04548", - "regions": [ - { - "end": 151, - "start": 45 - } - ] - } - ], - "end": 150269569, - "exons": [ - { - "end": 150264608, - "name": null, - "start": 150264524 - }, - { - "end": 150267089, - "name": null, - "start": 150266976 - }, - { - "end": 150269569, - "name": null, - "start": 150269217 - } - ], - "is_best_transcript": false, - "name": "ENST00000479232", - "start": 150264524 - } - ] - }, - { - "aliases": [ - "IL7" - ], - "chr": "8", - "end": 79717758, - "name": "ENSG00000104432", - "start": 79587978, - "strand": "-", - "transcripts": [ - { - "cdna_coding_end": 1135, - "cdna_coding_start": 602, - "domains": [ - { - "name": "PIRSF001942", - "regions": [ - { - "end": 177, - "start": 1 - } - ] - }, - { - "name": "PR00435", - "regions": [ - { - "end": 25, - "start": 2 - }, - { - "end": 48, - "start": 26 - }, - { - "end": 77, - "start": 57 - }, - { - "end": 98, - "start": 78 - }, - { - "end": 118, - "start": 99 - }, - { - "end": 173, - "start": 151 - } - ] - }, - { - "name": "PF01415", - "regions": [ - { - "end": 173, - 
"start": 28 - } - ] - }, - { - "name": "SM00127", - "regions": [ - { - "end": 173, - "start": 27 - } - ] - } - ], - "end": 79717758, - "exons": [ - { - "end": 79646067, - "name": null, - "start": 79645007 - }, - { - "end": 79648762, - "name": null, - "start": 79648709 - }, - { - "end": 79650870, - "name": null, - "start": 79650739 - }, - { - "end": 79652317, - "name": null, - "start": 79652237 - }, - { - "end": 79710443, - "name": null, - "start": 79710307 - }, - { - "end": 79717758, - "name": null, - "start": 79717148 - } - ], - "is_best_transcript": true, - "name": "ENST00000263851", - "start": 79645007 - }, - { - "cdna_coding_end": 758, - "cdna_coding_start": 543, - "domains": [ - { - "name": "PR00435", - "regions": [ - { - "end": 25, - "start": 2 - }, - { - "end": 48, - "start": 26 - } - ] - }, - { - "name": "PF01415", - "regions": [ - { - "end": 54, - "start": 28 - } - ] - } - ], - "end": 79717699, - "exons": [ - { - "end": 79646063, - "name": null, - "start": 79645283 - }, - { - "end": 79648762, - "name": null, - "start": 79648709 - }, - { - "end": 79650870, - "name": null, - "start": 79650739 - }, - { - "end": 79652317, - "name": null, - "start": 79652237 - }, - { - "end": 79659331, - "name": null, - "start": 79659129 - }, - { - "end": 79710443, - "name": null, - "start": 79710307 - }, - { - "end": 79717699, - "name": null, - "start": 79717148 - } - ], - "is_best_transcript": false, - "name": "ENST00000518982", - "start": 79645283 - }, - { - "cdna_coding_end": 408, - "cdna_coding_start": 7, - "domains": [ - { - "name": "PF01415", - "regions": [ - { - "end": 77, - "start": 28 - }, - { - "end": 129, - "start": 91 - } - ] - }, - { - "name": "SM00127", - "regions": [ - { - "end": 129, - "start": 27 - } - ] - }, - { - "name": "PR00435", - "regions": [ - { - "end": 25, - "start": 2 - }, - { - "end": 48, - "start": 26 - }, - { - "end": 77, - "start": 57 - } - ] - }, - { - "name": "PIRSF001942", - "regions": [ - { - "end": 133, - "start": 1 - } - ] - } - ], - "end": 
79717163, - "exons": [ - { - "end": 79646067, - "name": null, - "start": 79645900 - }, - { - "end": 79648762, - "name": null, - "start": 79648709 - }, - { - "end": 79652317, - "name": null, - "start": 79652237 - }, - { - "end": 79710443, - "name": null, - "start": 79710307 - }, - { - "end": 79717163, - "name": null, - "start": 79717148 - } - ], - "is_best_transcript": false, - "name": "ENST00000520269", - "start": 79645900 - }, - { - "cdna_coding_end": 120, - "cdna_coding_start": 7, - "domains": [ - { - "name": "PR00435", - "regions": [ - { - "end": 25, - "start": 2 - }, - { - "end": 37, - "start": 26 - } - ] - } - ], - "end": 79717163, - "exons": [ - { - "end": 79646067, - "name": null, - "start": 79645900 - }, - { - "end": 79648762, - "name": null, - "start": 79648709 - }, - { - "end": 79652317, - "name": null, - "start": 79652237 - }, - { - "end": 79710443, - "name": null, - "start": 79710363 - }, - { - "end": 79717163, - "name": null, - "start": 79717148 - } - ], - "is_best_transcript": false, - "name": "ENST00000520215", - "start": 79645900 - }, - { - "cdna_coding_end": 643, - "cdna_coding_start": 530, - "domains": [ - { - "name": "PR00435", - "regions": [ - { - "end": 25, - "start": 2 - }, - { - "end": 37, - "start": 26 - } - ] - } - ], - "end": 79717686, - "exons": [ - { - "end": 79646067, - "name": null, - "start": 79645900 - }, - { - "end": 79648762, - "name": null, - "start": 79648709 - }, - { - "end": 79650870, - "name": null, - "start": 79650739 - }, - { - "end": 79652317, - "name": null, - "start": 79652237 - }, - { - "end": 79710443, - "name": null, - "start": 79710363 - }, - { - "end": 79717686, - "name": null, - "start": 79717148 - } - ], - "is_best_transcript": false, - "name": "ENST00000520317", - "start": 79645900 - }, - { - "cdna_coding_end": 195, - "cdna_coding_start": 1, - "domains": [ - { - "name": "SM00127", - "regions": [ - { - "end": 60, - "start": 1 - } - ] - }, - { - "name": "PF01415", - "regions": [ - { - "end": 60, - "start": 1 - } - ] 
- } - ], - "end": 79652311, - "exons": [ - { - "end": 79646067, - "name": null, - "start": 79645948 - }, - { - "end": 79652311, - "name": null, - "start": 79652237 - } - ], - "is_best_transcript": false, - "name": "ENST00000541183", - "start": 79645948 - }, - { - "cdna_coding_end": 817, - "cdna_coding_start": 602, - "domains": [ - { - "name": "PF01415", - "regions": [ - { - "end": 54, - "start": 28 - } - ] - }, - { - "name": "PR00435", - "regions": [ - { - "end": 25, - "start": 2 - }, - { - "end": 48, - "start": 26 - } - ] - } - ], - "end": 79717758, - "exons": [ - { - "end": 79659331, - "name": null, - "start": 79659263 - }, - { - "end": 79710443, - "name": null, - "start": 79710307 - }, - { - "end": 79717758, - "name": null, - "start": 79717148 - } - ], - "is_best_transcript": false, - "name": "ENST00000379113", - "start": 79659263 - } - ] - }, - { - "aliases": [ - "SVEP1" - ], - "chr": "9", - "end": 113342160, - "name": "ENSG00000165124", - "start": 113127531, - "strand": "-", - "transcripts": [ - { - "cdna_coding_end": 11053, - "cdna_coding_start": 338, - "domains": [ - { - "name": "SM00032", - "regions": [ - { - "end": 433, - "start": 378 - }, - { - "end": 493, - "start": 438 - }, - { - "end": 559, - "start": 498 - }, - { - "end": 787, - "start": 727 - }, - { - "end": 1685, - "start": 1631 - }, - { - "end": 1743, - "start": 1690 - }, - { - "end": 1842, - "start": 1789 - }, - { - "end": 1900, - "start": 1847 - }, - { - "end": 1958, - "start": 1905 - }, - { - "end": 2016, - "start": 1963 - }, - { - "end": 2078, - "start": 2021 - }, - { - "end": 2141, - "start": 2083 - }, - { - "end": 2199, - "start": 2146 - }, - { - "end": 2259, - "start": 2204 - }, - { - "end": 2318, - "start": 2264 - }, - { - "end": 2376, - "start": 2323 - }, - { - "end": 2435, - "start": 2381 - }, - { - "end": 2493, - "start": 2440 - }, - { - "end": 2551, - "start": 2498 - }, - { - "end": 2608, - "start": 2556 - }, - { - "end": 2712, - "start": 2654 - }, - { - "end": 2770, - "start": 2717 - 
}, - { - "end": 2828, - "start": 2775 - }, - { - "end": 2886, - "start": 2833 - }, - { - "end": 2944, - "start": 2891 - }, - { - "end": 3002, - "start": 2949 - }, - { - "end": 3059, - "start": 3007 - }, - { - "end": 3117, - "start": 3064 - }, - { - "end": 3176, - "start": 3122 - }, - { - "end": 3236, - "start": 3181 - }, - { - "end": 3294, - "start": 3241 - }, - { - "end": 3352, - "start": 3299 - }, - { - "end": 3411, - "start": 3357 - }, - { - "end": 3468, - "start": 3416 - } - ] - }, - { - "name": "PF02494", - "regions": [ - { - "end": 642, - "start": 561 - }, - { - "end": 721, - "start": 644 - } - ] - }, - { - "name": "PR00895", - "regions": [ - { - "end": 1530, - "start": 1512 - }, - { - "end": 1558, - "start": 1539 - }, - { - "end": 1592, - "start": 1559 - } - ] - }, - { - "name": "SSF57535", - "regions": [ - { - "end": 433, - "start": 374 - }, - { - "end": 493, - "start": 434 - }, - { - "end": 560, - "start": 494 - }, - { - "end": 790, - "start": 727 - }, - { - "end": 1746, - "start": 1626 - }, - { - "end": 1842, - "start": 1785 - }, - { - "end": 1900, - "start": 1843 - }, - { - "end": 1958, - "start": 1901 - }, - { - "end": 2016, - "start": 1959 - }, - { - "end": 2078, - "start": 2017 - }, - { - "end": 2199, - "start": 2081 - }, - { - "end": 2318, - "start": 2202 - }, - { - "end": 2377, - "start": 2321 - }, - { - "end": 2437, - "start": 2379 - }, - { - "end": 2551, - "start": 2438 - }, - { - "end": 2616, - "start": 2552 - }, - { - "end": 2712, - "start": 2643 - }, - { - "end": 2828, - "start": 2715 - }, - { - "end": 2886, - "start": 2829 - }, - { - "end": 2944, - "start": 2887 - }, - { - "end": 3117, - "start": 2945 - }, - { - "end": 3176, - "start": 3118 - }, - { - "end": 3229, - "start": 3177 - }, - { - "end": 3475, - "start": 3239 - } - ] - }, - { - "name": "SSF49899", - "regions": [ - { - "end": 1632, - "start": 1421 - } - ] - }, - { - "name": "SM00159", - "regions": [ - { - "end": 1627, - "start": 1420 - } - ] - }, - { - "name": "PF00354", - "regions": 
[ - { - "end": 1620, - "start": 1442 - } - ] - }, - { - "name": "PF07699", - "regions": [ - { - "end": 360, - "start": 310 - }, - { - "end": 1052, - "start": 1005 - }, - { - "end": 1106, - "start": 1059 - }, - { - "end": 1160, - "start": 1113 - } - ] - }, - { - "name": "PS50311", - "regions": [ - { - "end": 1409, - "start": 1197 - }, - { - "end": 3554, - "start": 3468 - } - ] - }, - { - "name": "PS50825", - "regions": [ - { - "end": 642, - "start": 560 - }, - { - "end": 724, - "start": 643 - } - ] - }, - { - "name": "PF00092", - "regions": [ - { - "end": 252, - "start": 84 - } - ] - }, - { - "name": "SSF57196", - "regions": [ - { - "end": 1267, - "start": 1189 - }, - { - "end": 1305, - "start": 1268 - }, - { - "end": 1342, - "start": 1306 - }, - { - "end": 1423, - "start": 1344 - }, - { - "end": 1786, - "start": 1735 - }, - { - "end": 3506, - "start": 3463 - }, - { - "end": 3535, - "start": 3507 - }, - { - "end": 3570, - "start": 3537 - } - ] - }, - { - "name": "PS50026", - "regions": [ - { - "end": 1229, - "start": 1193 - }, - { - "end": 1267, - "start": 1231 - }, - { - "end": 1305, - "start": 1269 - }, - { - "end": 1343, - "start": 1307 - }, - { - "end": 1381, - "start": 1345 - }, - { - "end": 1419, - "start": 1383 - }, - { - "end": 1784, - "start": 1745 - }, - { - "end": 3532, - "start": 3500 - }, - { - "end": 3564, - "start": 3533 - } - ] - }, - { - "name": "SM00181", - "regions": [ - { - "end": 1229, - "start": 1196 - }, - { - "end": 1267, - "start": 1234 - }, - { - "end": 1305, - "start": 1272 - }, - { - "end": 1343, - "start": 1310 - }, - { - "end": 1381, - "start": 1348 - }, - { - "end": 1419, - "start": 1386 - }, - { - "end": 1784, - "start": 1748 - }, - { - "end": 3500, - "start": 3471 - }, - { - "end": 3532, - "start": 3503 - }, - { - "end": 3564, - "start": 3535 - } - ] - }, - { - "name": "SM00179", - "regions": [ - { - "end": 1229, - "start": 1196 - }, - { - "end": 1267, - "start": 1231 - }, - { - "end": 1305, - "start": 1269 - }, - { - "end": 1343, - 
"start": 1307 - }, - { - "end": 1381, - "start": 1345 - }, - { - "end": 1419, - "start": 1383 - }, - { - "end": 1784, - "start": 1745 - }, - { - "end": 3532, - "start": 3504 - } - ] - }, - { - "name": "SSF57184", - "regions": [ - { - "end": 440, - "start": 269 - }, - { - "end": 1144, - "start": 988 - } - ] - }, - { - "name": "PF07645", - "regions": [ - { - "end": 1783, - "start": 1745 - } - ] - }, - { - "name": "PS50923", - "regions": [ - { - "end": 435, - "start": 376 - }, - { - "end": 495, - "start": 436 - }, - { - "end": 561, - "start": 496 - }, - { - "end": 789, - "start": 725 - }, - { - "end": 1687, - "start": 1629 - }, - { - "end": 1745, - "start": 1688 - }, - { - "end": 1844, - "start": 1787 - }, - { - "end": 1902, - "start": 1845 - }, - { - "end": 1960, - "start": 1903 - }, - { - "end": 2018, - "start": 1961 - }, - { - "end": 2080, - "start": 2019 - }, - { - "end": 2143, - "start": 2081 - }, - { - "end": 2201, - "start": 2144 - }, - { - "end": 2261, - "start": 2202 - }, - { - "end": 2320, - "start": 2262 - }, - { - "end": 2378, - "start": 2321 - }, - { - "end": 2437, - "start": 2379 - }, - { - "end": 2495, - "start": 2438 - }, - { - "end": 2553, - "start": 2496 - }, - { - "end": 2610, - "start": 2554 - }, - { - "end": 2714, - "start": 2663 - }, - { - "end": 2772, - "start": 2715 - }, - { - "end": 2830, - "start": 2773 - }, - { - "end": 2888, - "start": 2831 - }, - { - "end": 2946, - "start": 2889 - }, - { - "end": 3004, - "start": 2947 - }, - { - "end": 3061, - "start": 3005 - }, - { - "end": 3119, - "start": 3062 - }, - { - "end": 3178, - "start": 3120 - }, - { - "end": 3238, - "start": 3179 - }, - { - "end": 3296, - "start": 3239 - }, - { - "end": 3354, - "start": 3297 - }, - { - "end": 3413, - "start": 3355 - }, - { - "end": 3470, - "start": 3414 - } - ] - }, - { - "name": "SM00327", - "regions": [ - { - "end": 260, - "start": 81 - } - ] - }, - { - "name": "PF00008", - "regions": [ - { - "end": 1226, - "start": 1197 - }, - { - "end": 1265, - "start": 
1235 - }, - { - "end": 1302, - "start": 1273 - }, - { - "end": 1379, - "start": 1349 - }, - { - "end": 1417, - "start": 1387 - } - ] - }, - { - "name": "PS50234", - "regions": [ - { - "end": 264, - "start": 83 - } - ] - }, - { - "name": "PF07974", - "regions": [ - { - "end": 1266, - "start": 1235 - }, - { - "end": 3499, - "start": 3475 - }, - { - "end": 3531, - "start": 3507 - }, - { - "end": 3563, - "start": 3536 - } - ] - }, - { - "name": "SSF53300", - "regions": [ - { - "end": 262, - "start": 79 - } - ] - }, - { - "name": "PF00084", - "regions": [ - { - "end": 430, - "start": 378 - }, - { - "end": 493, - "start": 438 - }, - { - "end": 1685, - "start": 1628 - }, - { - "end": 1743, - "start": 1690 - }, - { - "end": 1842, - "start": 1789 - }, - { - "end": 1900, - "start": 1847 - }, - { - "end": 1958, - "start": 1905 - }, - { - "end": 2016, - "start": 1963 - }, - { - "end": 2078, - "start": 2021 - }, - { - "end": 2136, - "start": 2083 - }, - { - "end": 2199, - "start": 2146 - }, - { - "end": 2259, - "start": 2204 - }, - { - "end": 2318, - "start": 2264 - }, - { - "end": 2376, - "start": 2323 - }, - { - "end": 2435, - "start": 2381 - }, - { - "end": 2493, - "start": 2440 - }, - { - "end": 2551, - "start": 2498 - }, - { - "end": 2608, - "start": 2556 - }, - { - "end": 2712, - "start": 2667 - }, - { - "end": 2770, - "start": 2717 - }, - { - "end": 2828, - "start": 2775 - }, - { - "end": 2886, - "start": 2833 - }, - { - "end": 2944, - "start": 2891 - }, - { - "end": 3002, - "start": 2949 - }, - { - "end": 3059, - "start": 3007 - }, - { - "end": 3117, - "start": 3084 - }, - { - "end": 3172, - "start": 3122 - }, - { - "end": 3236, - "start": 3181 - }, - { - "end": 3290, - "start": 3241 - }, - { - "end": 3352, - "start": 3299 - }, - { - "end": 3411, - "start": 3357 - }, - { - "end": 3468, - "start": 3416 - } - ] - } - ], - "end": 113342160, - "exons": [ - { - "end": 113128840, - "name": null, - "start": 113127531 - }, - { - "end": 113132296, - "name": null, - "start": 
113132203 - }, - { - "end": 113137743, - "name": null, - "start": 113137648 - }, - { - "end": 113139646, - "name": null, - "start": 113139551 - }, - { - "end": 113141797, - "name": null, - "start": 113141627 - }, - { - "end": 113148354, - "name": null, - "start": 113148178 - }, - { - "end": 113149738, - "name": null, - "start": 113149565 - }, - { - "end": 113151867, - "name": null, - "start": 113151804 - }, - { - "end": 113163289, - "name": null, - "start": 113163134 - }, - { - "end": 113166832, - "name": null, - "start": 113166607 - }, - { - "end": 113171231, - "name": null, - "start": 113168440 - }, - { - "end": 113174015, - "name": null, - "start": 113173343 - }, - { - "end": 113190038, - "name": null, - "start": 113189871 - }, - { - "end": 113191614, - "name": null, - "start": 113191423 - }, - { - "end": 113192284, - "name": null, - "start": 113192200 - }, - { - "end": 113192730, - "name": null, - "start": 113192554 - }, - { - "end": 113194314, - "name": null, - "start": 113194195 - }, - { - "end": 113194915, - "name": null, - "start": 113194742 - }, - { - "end": 113196786, - "name": null, - "start": 113196616 - }, - { - "end": 113197644, - "name": null, - "start": 113197521 - }, - { - "end": 113198784, - "name": null, - "start": 113198660 - }, - { - "end": 113206000, - "name": null, - "start": 113205825 - }, - { - "end": 113208318, - "name": null, - "start": 113208117 - }, - { - "end": 113209337, - "name": null, - "start": 113209180 - }, - { - "end": 113212540, - "name": null, - "start": 113212339 - }, - { - "end": 113213682, - "name": null, - "start": 113213569 - }, - { - "end": 113217983, - "name": null, - "start": 113217870 - }, - { - "end": 113219632, - "name": null, - "start": 113219536 - }, - { - "end": 113220842, - "name": null, - "start": 113220751 - }, - { - "end": 113221393, - "name": null, - "start": 113221232 - }, - { - "end": 113228306, - "name": null, - "start": 113228145 - }, - { - "end": 113231381, - "name": null, - "start": 113231220 - }, - { 
- "end": 113233877, - "name": null, - "start": 113233644 - }, - { - "end": 113234603, - "name": null, - "start": 113234439 - }, - { - "end": 113238595, - "name": null, - "start": 113238484 - }, - { - "end": 113242036, - "name": null, - "start": 113241915 - }, - { - "end": 113243716, - "name": null, - "start": 113243522 - }, - { - "end": 113244772, - "name": null, - "start": 113244641 - }, - { - "end": 113245973, - "name": null, - "start": 113245866 - }, - { - "end": 113252059, - "name": null, - "start": 113251930 - }, - { - "end": 113259213, - "name": null, - "start": 113259095 - }, - { - "end": 113261518, - "name": null, - "start": 113261321 - }, - { - "end": 113265497, - "name": null, - "start": 113265318 - }, - { - "end": 113275385, - "name": null, - "start": 113275206 - }, - { - "end": 113276386, - "name": null, - "start": 113276228 - }, - { - "end": 113308571, - "name": null, - "start": 113308395 - }, - { - "end": 113312384, - "name": null, - "start": 113312129 - }, - { - "end": 113342160, - "name": null, - "start": 113341293 - } - ], - "is_best_transcript": true, - "name": "ENST00000401783", - "start": 113127531 - }, - { - "cdna_coding_end": 4909, - "cdna_coding_start": 416, - "domains": [ - { - "name": "PF00084", - "regions": [ - { - "end": 62, - "start": 9 - }, - { - "end": 125, - "start": 72 - }, - { - "end": 185, - "start": 130 - }, - { - "end": 244, - "start": 190 - }, - { - "end": 302, - "start": 249 - }, - { - "end": 361, - "start": 307 - }, - { - "end": 419, - "start": 366 - }, - { - "end": 477, - "start": 424 - }, - { - "end": 534, - "start": 482 - }, - { - "end": 638, - "start": 593 - }, - { - "end": 696, - "start": 643 - }, - { - "end": 754, - "start": 701 - }, - { - "end": 812, - "start": 759 - }, - { - "end": 870, - "start": 817 - }, - { - "end": 928, - "start": 875 - }, - { - "end": 985, - "start": 933 - }, - { - "end": 1043, - "start": 1010 - }, - { - "end": 1098, - "start": 1048 - }, - { - "end": 1162, - "start": 1107 - }, - { - "end": 1216, - 
"start": 1167 - }, - { - "end": 1278, - "start": 1225 - }, - { - "end": 1337, - "start": 1283 - }, - { - "end": 1394, - "start": 1342 - } - ] - }, - { - "name": "PF07974", - "regions": [ - { - "end": 1425, - "start": 1401 - }, - { - "end": 1457, - "start": 1433 - }, - { - "end": 1489, - "start": 1462 - } - ] - }, - { - "name": "PF00008", - "regions": [ - { - "end": 1456, - "start": 1427 - } - ] - }, - { - "name": "PS50923", - "regions": [ - { - "end": 69, - "start": 7 - }, - { - "end": 127, - "start": 70 - }, - { - "end": 187, - "start": 128 - }, - { - "end": 246, - "start": 188 - }, - { - "end": 304, - "start": 247 - }, - { - "end": 363, - "start": 305 - }, - { - "end": 421, - "start": 364 - }, - { - "end": 479, - "start": 422 - }, - { - "end": 536, - "start": 480 - }, - { - "end": 640, - "start": 589 - }, - { - "end": 698, - "start": 641 - }, - { - "end": 756, - "start": 699 - }, - { - "end": 814, - "start": 757 - }, - { - "end": 872, - "start": 815 - }, - { - "end": 930, - "start": 873 - }, - { - "end": 987, - "start": 931 - }, - { - "end": 1045, - "start": 988 - }, - { - "end": 1104, - "start": 1046 - }, - { - "end": 1164, - "start": 1105 - }, - { - "end": 1222, - "start": 1165 - }, - { - "end": 1280, - "start": 1223 - }, - { - "end": 1339, - "start": 1281 - }, - { - "end": 1396, - "start": 1340 - } - ] - }, - { - "name": "SM00181", - "regions": [ - { - "end": 1426, - "start": 1397 - }, - { - "end": 1458, - "start": 1429 - }, - { - "end": 1490, - "start": 1461 - } - ] - }, - { - "name": "SSF57196", - "regions": [ - { - "end": 1432, - "start": 1389 - }, - { - "end": 1461, - "start": 1433 - }, - { - "end": 1496, - "start": 1463 - } - ] - }, - { - "name": "PS50026", - "regions": [ - { - "end": 1458, - "start": 1426 - }, - { - "end": 1490, - "start": 1459 - } - ] - }, - { - "name": "PS50311", - "regions": [ - { - "end": 1480, - "start": 1394 - } - ] - }, - { - "name": "SSF57535", - "regions": [ - { - "end": 125, - "start": 7 - }, - { - "end": 244, - "start": 128 - 
}, - { - "end": 303, - "start": 247 - }, - { - "end": 363, - "start": 305 - }, - { - "end": 477, - "start": 364 - }, - { - "end": 542, - "start": 478 - }, - { - "end": 638, - "start": 569 - }, - { - "end": 754, - "start": 641 - }, - { - "end": 812, - "start": 755 - }, - { - "end": 870, - "start": 813 - }, - { - "end": 1043, - "start": 871 - }, - { - "end": 1102, - "start": 1044 - }, - { - "end": 1155, - "start": 1103 - }, - { - "end": 1401, - "start": 1165 - } - ] - }, - { - "name": "SM00032", - "regions": [ - { - "end": 67, - "start": 9 - }, - { - "end": 125, - "start": 72 - }, - { - "end": 185, - "start": 130 - }, - { - "end": 244, - "start": 190 - }, - { - "end": 302, - "start": 249 - }, - { - "end": 361, - "start": 307 - }, - { - "end": 419, - "start": 366 - }, - { - "end": 477, - "start": 424 - }, - { - "end": 534, - "start": 482 - }, - { - "end": 638, - "start": 580 - }, - { - "end": 696, - "start": 643 - }, - { - "end": 754, - "start": 701 - }, - { - "end": 812, - "start": 759 - }, - { - "end": 870, - "start": 817 - }, - { - "end": 928, - "start": 875 - }, - { - "end": 985, - "start": 933 - }, - { - "end": 1043, - "start": 990 - }, - { - "end": 1102, - "start": 1048 - }, - { - "end": 1162, - "start": 1107 - }, - { - "end": 1220, - "start": 1167 - }, - { - "end": 1278, - "start": 1225 - }, - { - "end": 1337, - "start": 1283 - }, - { - "end": 1394, - "start": 1342 - } - ] - } - ], - "end": 113190038, - "exons": [ - { - "end": 113128840, - "name": null, - "start": 113127536 - }, - { - "end": 113132296, - "name": null, - "start": 113132203 - }, - { - "end": 113137743, - "name": null, - "start": 113137648 - }, - { - "end": 113139646, - "name": null, - "start": 113139551 - }, - { - "end": 113141797, - "name": null, - "start": 113141627 - }, - { - "end": 113148354, - "name": null, - "start": 113148178 - }, - { - "end": 113149738, - "name": null, - "start": 113149565 - }, - { - "end": 113151867, - "name": null, - "start": 113151804 - }, - { - "end": 113163289, - 
"name": null, - "start": 113163134 - }, - { - "end": 113166832, - "name": null, - "start": 113166607 - }, - { - "end": 113171231, - "name": null, - "start": 113168440 - }, - { - "end": 113174015, - "name": null, - "start": 113173343 - }, - { - "end": 113190038, - "name": null, - "start": 113189871 - } - ], - "is_best_transcript": false, - "name": "ENST00000297826", - "start": 113127536 - }, - { - "cdna_coding_end": 10911, - "cdna_coding_start": 265, - "domains": [ - { - "name": "SSF57535", - "regions": [ - { - "end": 410, - "start": 351 - }, - { - "end": 470, - "start": 411 - }, - { - "end": 537, - "start": 471 - }, - { - "end": 767, - "start": 704 - }, - { - "end": 1723, - "start": 1603 - }, - { - "end": 1819, - "start": 1762 - }, - { - "end": 1877, - "start": 1820 - }, - { - "end": 1935, - "start": 1878 - }, - { - "end": 1993, - "start": 1936 - }, - { - "end": 2055, - "start": 1994 - }, - { - "end": 2176, - "start": 2058 - }, - { - "end": 2295, - "start": 2179 - }, - { - "end": 2354, - "start": 2298 - }, - { - "end": 2414, - "start": 2356 - }, - { - "end": 2528, - "start": 2415 - }, - { - "end": 2593, - "start": 2529 - }, - { - "end": 2689, - "start": 2620 - }, - { - "end": 2805, - "start": 2692 - }, - { - "end": 2863, - "start": 2806 - }, - { - "end": 2921, - "start": 2864 - }, - { - "end": 3094, - "start": 2922 - }, - { - "end": 3153, - "start": 3095 - }, - { - "end": 3206, - "start": 3154 - }, - { - "end": 3452, - "start": 3216 - } - ] - }, - { - "name": "SSF49899", - "regions": [ - { - "end": 1609, - "start": 1398 - } - ] - }, - { - "name": "SM00159", - "regions": [ - { - "end": 1604, - "start": 1397 - } - ] - }, - { - "name": "PF00354", - "regions": [ - { - "end": 1597, - "start": 1419 - } - ] - }, - { - "name": "PR00895", - "regions": [ - { - "end": 1507, - "start": 1489 - }, - { - "end": 1535, - "start": 1516 - }, - { - "end": 1569, - "start": 1536 - } - ] - }, - { - "name": "PF02494", - "regions": [ - { - "end": 619, - "start": 538 - }, - { - "end": 698, 
- "start": 621 - } - ] - }, - { - "name": "SM00032", - "regions": [ - { - "end": 410, - "start": 355 - }, - { - "end": 470, - "start": 415 - }, - { - "end": 536, - "start": 475 - }, - { - "end": 764, - "start": 704 - }, - { - "end": 1662, - "start": 1608 - }, - { - "end": 1720, - "start": 1667 - }, - { - "end": 1819, - "start": 1766 - }, - { - "end": 1877, - "start": 1824 - }, - { - "end": 1935, - "start": 1882 - }, - { - "end": 1993, - "start": 1940 - }, - { - "end": 2055, - "start": 1998 - }, - { - "end": 2118, - "start": 2060 - }, - { - "end": 2176, - "start": 2123 - }, - { - "end": 2236, - "start": 2181 - }, - { - "end": 2295, - "start": 2241 - }, - { - "end": 2353, - "start": 2300 - }, - { - "end": 2412, - "start": 2358 - }, - { - "end": 2470, - "start": 2417 - }, - { - "end": 2528, - "start": 2475 - }, - { - "end": 2585, - "start": 2533 - }, - { - "end": 2689, - "start": 2631 - }, - { - "end": 2747, - "start": 2694 - }, - { - "end": 2805, - "start": 2752 - }, - { - "end": 2863, - "start": 2810 - }, - { - "end": 2921, - "start": 2868 - }, - { - "end": 2979, - "start": 2926 - }, - { - "end": 3036, - "start": 2984 - }, - { - "end": 3094, - "start": 3041 - }, - { - "end": 3153, - "start": 3099 - }, - { - "end": 3213, - "start": 3158 - }, - { - "end": 3271, - "start": 3218 - }, - { - "end": 3329, - "start": 3276 - }, - { - "end": 3388, - "start": 3334 - }, - { - "end": 3445, - "start": 3393 - } - ] - }, - { - "name": "SM00179", - "regions": [ - { - "end": 1206, - "start": 1173 - }, - { - "end": 1244, - "start": 1208 - }, - { - "end": 1282, - "start": 1246 - }, - { - "end": 1320, - "start": 1284 - }, - { - "end": 1358, - "start": 1322 - }, - { - "end": 1396, - "start": 1360 - }, - { - "end": 1761, - "start": 1722 - }, - { - "end": 3509, - "start": 3481 - } - ] - }, - { - "name": "SSF57184", - "regions": [ - { - "end": 417, - "start": 246 - }, - { - "end": 1121, - "start": 965 - } - ] - }, - { - "name": "SSF57196", - "regions": [ - { - "end": 1244, - "start": 1166 - 
}, - { - "end": 1282, - "start": 1245 - }, - { - "end": 1319, - "start": 1283 - }, - { - "end": 1400, - "start": 1321 - }, - { - "end": 1763, - "start": 1712 - }, - { - "end": 3483, - "start": 3440 - }, - { - "end": 3512, - "start": 3484 - }, - { - "end": 3547, - "start": 3514 - } - ] - }, - { - "name": "PS50026", - "regions": [ - { - "end": 1206, - "start": 1170 - }, - { - "end": 1244, - "start": 1208 - }, - { - "end": 1282, - "start": 1246 - }, - { - "end": 1320, - "start": 1284 - }, - { - "end": 1358, - "start": 1322 - }, - { - "end": 1396, - "start": 1360 - }, - { - "end": 1761, - "start": 1722 - }, - { - "end": 3509, - "start": 3477 - }, - { - "end": 3541, - "start": 3510 - } - ] - }, - { - "name": "SM00181", - "regions": [ - { - "end": 1206, - "start": 1173 - }, - { - "end": 1244, - "start": 1211 - }, - { - "end": 1282, - "start": 1249 - }, - { - "end": 1320, - "start": 1287 - }, - { - "end": 1358, - "start": 1325 - }, - { - "end": 1396, - "start": 1363 - }, - { - "end": 1761, - "start": 1725 - }, - { - "end": 3477, - "start": 3448 - }, - { - "end": 3509, - "start": 3480 - }, - { - "end": 3541, - "start": 3512 - } - ] - }, - { - "name": "PF00092", - "regions": [ - { - "end": 229, - "start": 61 - } - ] - }, - { - "name": "PS50825", - "regions": [ - { - "end": 619, - "start": 537 - }, - { - "end": 701, - "start": 620 - } - ] - }, - { - "name": "PS50311", - "regions": [ - { - "end": 1386, - "start": 1174 - }, - { - "end": 3531, - "start": 3445 - } - ] - }, - { - "name": "PF07699", - "regions": [ - { - "end": 337, - "start": 287 - }, - { - "end": 1029, - "start": 982 - }, - { - "end": 1083, - "start": 1036 - }, - { - "end": 1137, - "start": 1090 - } - ] - }, - { - "name": "PF00008", - "regions": [ - { - "end": 1203, - "start": 1174 - }, - { - "end": 1242, - "start": 1212 - }, - { - "end": 1279, - "start": 1250 - }, - { - "end": 1356, - "start": 1326 - }, - { - "end": 1394, - "start": 1364 - } - ] - }, - { - "name": "SM00327", - "regions": [ - { - "end": 237, - 
"start": 58 - } - ] - }, - { - "name": "PS50923", - "regions": [ - { - "end": 412, - "start": 353 - }, - { - "end": 472, - "start": 413 - }, - { - "end": 538, - "start": 473 - }, - { - "end": 766, - "start": 702 - }, - { - "end": 1664, - "start": 1606 - }, - { - "end": 1722, - "start": 1665 - }, - { - "end": 1821, - "start": 1764 - }, - { - "end": 1879, - "start": 1822 - }, - { - "end": 1937, - "start": 1880 - }, - { - "end": 1995, - "start": 1938 - }, - { - "end": 2057, - "start": 1996 - }, - { - "end": 2120, - "start": 2058 - }, - { - "end": 2178, - "start": 2121 - }, - { - "end": 2238, - "start": 2179 - }, - { - "end": 2297, - "start": 2239 - }, - { - "end": 2355, - "start": 2298 - }, - { - "end": 2414, - "start": 2356 - }, - { - "end": 2472, - "start": 2415 - }, - { - "end": 2530, - "start": 2473 - }, - { - "end": 2587, - "start": 2531 - }, - { - "end": 2691, - "start": 2640 - }, - { - "end": 2749, - "start": 2692 - }, - { - "end": 2807, - "start": 2750 - }, - { - "end": 2865, - "start": 2808 - }, - { - "end": 2923, - "start": 2866 - }, - { - "end": 2981, - "start": 2924 - }, - { - "end": 3038, - "start": 2982 - }, - { - "end": 3096, - "start": 3039 - }, - { - "end": 3155, - "start": 3097 - }, - { - "end": 3215, - "start": 3156 - }, - { - "end": 3273, - "start": 3216 - }, - { - "end": 3331, - "start": 3274 - }, - { - "end": 3390, - "start": 3332 - }, - { - "end": 3447, - "start": 3391 - } - ] - }, - { - "name": "PF07645", - "regions": [ - { - "end": 1760, - "start": 1722 - } - ] - }, - { - "name": "SSF53300", - "regions": [ - { - "end": 239, - "start": 56 - } - ] - }, - { - "name": "PF00084", - "regions": [ - { - "end": 407, - "start": 355 - }, - { - "end": 470, - "start": 415 - }, - { - "end": 1662, - "start": 1605 - }, - { - "end": 1720, - "start": 1667 - }, - { - "end": 1819, - "start": 1766 - }, - { - "end": 1877, - "start": 1824 - }, - { - "end": 1935, - "start": 1882 - }, - { - "end": 1993, - "start": 1940 - }, - { - "end": 2055, - "start": 1998 - }, - { 
- "end": 2113, - "start": 2060 - }, - { - "end": 2176, - "start": 2123 - }, - { - "end": 2236, - "start": 2181 - }, - { - "end": 2295, - "start": 2241 - }, - { - "end": 2353, - "start": 2300 - }, - { - "end": 2412, - "start": 2358 - }, - { - "end": 2470, - "start": 2417 - }, - { - "end": 2528, - "start": 2475 - }, - { - "end": 2585, - "start": 2533 - }, - { - "end": 2689, - "start": 2644 - }, - { - "end": 2747, - "start": 2694 - }, - { - "end": 2805, - "start": 2752 - }, - { - "end": 2863, - "start": 2810 - }, - { - "end": 2921, - "start": 2868 - }, - { - "end": 2979, - "start": 2926 - }, - { - "end": 3036, - "start": 2984 - }, - { - "end": 3094, - "start": 3061 - }, - { - "end": 3149, - "start": 3099 - }, - { - "end": 3213, - "start": 3158 - }, - { - "end": 3267, - "start": 3218 - }, - { - "end": 3329, - "start": 3276 - }, - { - "end": 3388, - "start": 3334 - }, - { - "end": 3445, - "start": 3393 - } - ] - }, - { - "name": "PF07974", - "regions": [ - { - "end": 1243, - "start": 1212 - }, - { - "end": 3476, - "start": 3452 - }, - { - "end": 3508, - "start": 3484 - }, - { - "end": 3540, - "start": 3513 - } - ] - }, - { - "name": "PS50234", - "regions": [ - { - "end": 241, - "start": 60 - } - ] - } - ], - "end": 113342018, - "exons": [ - { - "end": 113128840, - "name": null, - "start": 113127536 - }, - { - "end": 113132296, - "name": null, - "start": 113132203 - }, - { - "end": 113137743, - "name": null, - "start": 113137648 - }, - { - "end": 113139646, - "name": null, - "start": 113139551 - }, - { - "end": 113141797, - "name": null, - "start": 113141627 - }, - { - "end": 113148354, - "name": null, - "start": 113148178 - }, - { - "end": 113149738, - "name": null, - "start": 113149565 - }, - { - "end": 113151867, - "name": null, - "start": 113151804 - }, - { - "end": 113163289, - "name": null, - "start": 113163134 - }, - { - "end": 113166832, - "name": null, - "start": 113166607 - }, - { - "end": 113171231, - "name": null, - "start": 113168440 - }, - { - "end": 
113174015, - "name": null, - "start": 113173343 - }, - { - "end": 113190038, - "name": null, - "start": 113189871 - }, - { - "end": 113191614, - "name": null, - "start": 113191423 - }, - { - "end": 113192284, - "name": null, - "start": 113192200 - }, - { - "end": 113192730, - "name": null, - "start": 113192554 - }, - { - "end": 113194314, - "name": null, - "start": 113194195 - }, - { - "end": 113194915, - "name": null, - "start": 113194742 - }, - { - "end": 113196786, - "name": null, - "start": 113196616 - }, - { - "end": 113197644, - "name": null, - "start": 113197521 - }, - { - "end": 113198784, - "name": null, - "start": 113198660 - }, - { - "end": 113206000, - "name": null, - "start": 113205825 - }, - { - "end": 113208318, - "name": null, - "start": 113208117 - }, - { - "end": 113209337, - "name": null, - "start": 113209180 - }, - { - "end": 113212540, - "name": null, - "start": 113212339 - }, - { - "end": 113213682, - "name": null, - "start": 113213569 - }, - { - "end": 113217983, - "name": null, - "start": 113217870 - }, - { - "end": 113219632, - "name": null, - "start": 113219536 - }, - { - "end": 113220842, - "name": null, - "start": 113220751 - }, - { - "end": 113221393, - "name": null, - "start": 113221232 - }, - { - "end": 113228306, - "name": null, - "start": 113228145 - }, - { - "end": 113231381, - "name": null, - "start": 113231220 - }, - { - "end": 113233877, - "name": null, - "start": 113233644 - }, - { - "end": 113234603, - "name": null, - "start": 113234439 - }, - { - "end": 113238595, - "name": null, - "start": 113238484 - }, - { - "end": 113242036, - "name": null, - "start": 113241915 - }, - { - "end": 113243716, - "name": null, - "start": 113243522 - }, - { - "end": 113244772, - "name": null, - "start": 113244641 - }, - { - "end": 113245973, - "name": null, - "start": 113245866 - }, - { - "end": 113252059, - "name": null, - "start": 113251930 - }, - { - "end": 113259213, - "name": null, - "start": 113259095 - }, - { - "end": 113261518, - 
"name": null, - "start": 113261321 - }, - { - "end": 113265497, - "name": null, - "start": 113265318 - }, - { - "end": 113275385, - "name": null, - "start": 113275206 - }, - { - "end": 113276386, - "name": null, - "start": 113276228 - }, - { - "end": 113308571, - "name": null, - "start": 113308395 - }, - { - "end": 113312384, - "name": null, - "start": 113312129 - }, - { - "end": 113342018, - "name": null, - "start": 113341293 - } - ], - "is_best_transcript": false, - "name": "ENST00000374469", - "start": 113127536 - }, - { - "cdna_coding_end": 4650, - "cdna_coding_start": 1, - "domains": [ - { - "name": "PS50825", - "regions": [ - { - "end": 642, - "start": 560 - }, - { - "end": 724, - "start": 643 - } - ] - }, - { - "name": "PF07699", - "regions": [ - { - "end": 360, - "start": 310 - }, - { - "end": 1052, - "start": 1005 - }, - { - "end": 1106, - "start": 1059 - }, - { - "end": 1160, - "start": 1113 - } - ] - }, - { - "name": "PS50311", - "regions": [ - { - "end": 1409, - "start": 1197 - } - ] - }, - { - "name": "SM00181", - "regions": [ - { - "end": 1229, - "start": 1196 - }, - { - "end": 1267, - "start": 1234 - }, - { - "end": 1305, - "start": 1272 - }, - { - "end": 1343, - "start": 1310 - }, - { - "end": 1381, - "start": 1348 - }, - { - "end": 1419, - "start": 1386 - } - ] - }, - { - "name": "SSF57196", - "regions": [ - { - "end": 1267, - "start": 1189 - }, - { - "end": 1305, - "start": 1268 - }, - { - "end": 1342, - "start": 1306 - }, - { - "end": 1423, - "start": 1344 - } - ] - }, - { - "name": "PS50026", - "regions": [ - { - "end": 1229, - "start": 1193 - }, - { - "end": 1267, - "start": 1231 - }, - { - "end": 1305, - "start": 1269 - }, - { - "end": 1343, - "start": 1307 - }, - { - "end": 1381, - "start": 1345 - }, - { - "end": 1419, - "start": 1383 - } - ] - }, - { - "name": "SSF57184", - "regions": [ - { - "end": 440, - "start": 269 - }, - { - "end": 1144, - "start": 988 - } - ] - }, - { - "name": "SM00179", - "regions": [ - { - "end": 1229, - "start": 
1196 - }, - { - "end": 1267, - "start": 1231 - }, - { - "end": 1305, - "start": 1269 - }, - { - "end": 1343, - "start": 1307 - }, - { - "end": 1381, - "start": 1345 - }, - { - "end": 1419, - "start": 1383 - } - ] - }, - { - "name": "PF00092", - "regions": [ - { - "end": 252, - "start": 84 - } - ] - }, - { - "name": "SM00032", - "regions": [ - { - "end": 433, - "start": 378 - }, - { - "end": 493, - "start": 438 - }, - { - "end": 559, - "start": 498 - }, - { - "end": 787, - "start": 727 - } - ] - }, - { - "name": "PF02494", - "regions": [ - { - "end": 642, - "start": 561 - }, - { - "end": 721, - "start": 644 - } - ] - }, - { - "name": "PR00010", - "regions": [ - { - "end": 1318, - "start": 1307 - }, - { - "end": 1364, - "start": 1357 - }, - { - "end": 1413, - "start": 1403 - }, - { - "end": 1420, - "start": 1414 - } - ] - }, - { - "name": "PF00354", - "regions": [ - { - "end": 1532, - "start": 1442 - } - ] - }, - { - "name": "SSF57535", - "regions": [ - { - "end": 433, - "start": 374 - }, - { - "end": 493, - "start": 434 - }, - { - "end": 560, - "start": 494 - }, - { - "end": 790, - "start": 727 - } - ] - }, - { - "name": "SSF49899", - "regions": [ - { - "end": 1547, - "start": 1421 - } - ] - }, - { - "name": "PS50234", - "regions": [ - { - "end": 264, - "start": 83 - } - ] - }, - { - "name": "SSF53300", - "regions": [ - { - "end": 262, - "start": 79 - } - ] - }, - { - "name": "PF00084", - "regions": [ - { - "end": 430, - "start": 378 - }, - { - "end": 493, - "start": 438 - } - ] - }, - { - "name": "PS50923", - "regions": [ - { - "end": 435, - "start": 376 - }, - { - "end": 495, - "start": 436 - }, - { - "end": 561, - "start": 496 - }, - { - "end": 789, - "start": 725 - } - ] - }, - { - "name": "PF07645", - "regions": [ - { - "end": 1262, - "start": 1231 - }, - { - "end": 1338, - "start": 1308 - } - ] - }, - { - "name": "PF00008", - "regions": [ - { - "end": 1226, - "start": 1197 - }, - { - "end": 1265, - "start": 1235 - }, - { - "end": 1302, - "start": 1273 - }, - { 
- "end": 1337, - "start": 1311 - }, - { - "end": 1379, - "start": 1349 - }, - { - "end": 1417, - "start": 1387 - } - ] - }, - { - "name": "SM00327", - "regions": [ - { - "end": 260, - "start": 81 - } - ] - } - ], - "end": 113341823, - "exons": [ - { - "end": 113206000, - "name": null, - "start": 113204759 - }, - { - "end": 113208318, - "name": null, - "start": 113208117 - }, - { - "end": 113209337, - "name": null, - "start": 113209180 - }, - { - "end": 113212540, - "name": null, - "start": 113212339 - }, - { - "end": 113213682, - "name": null, - "start": 113213569 - }, - { - "end": 113217983, - "name": null, - "start": 113217870 - }, - { - "end": 113219632, - "name": null, - "start": 113219536 - }, - { - "end": 113220399, - "name": null, - "start": 113220395 - }, - { - "end": 113220842, - "name": null, - "start": 113220756 - }, - { - "end": 113221393, - "name": null, - "start": 113221232 - }, - { - "end": 113228306, - "name": null, - "start": 113228145 - }, - { - "end": 113231381, - "name": null, - "start": 113231220 - }, - { - "end": 113233877, - "name": null, - "start": 113233644 - }, - { - "end": 113234603, - "name": null, - "start": 113234439 - }, - { - "end": 113238595, - "name": null, - "start": 113238484 - }, - { - "end": 113242036, - "name": null, - "start": 113241915 - }, - { - "end": 113243716, - "name": null, - "start": 113243522 - }, - { - "end": 113244772, - "name": null, - "start": 113244641 - }, - { - "end": 113245973, - "name": null, - "start": 113245866 - }, - { - "end": 113252059, - "name": null, - "start": 113251930 - }, - { - "end": 113259213, - "name": null, - "start": 113259095 - }, - { - "end": 113261518, - "name": null, - "start": 113261321 - }, - { - "end": 113265497, - "name": null, - "start": 113265318 - }, - { - "end": 113275385, - "name": null, - "start": 113275206 - }, - { - "end": 113276386, - "name": null, - "start": 113276228 - }, - { - "end": 113308571, - "name": null, - "start": 113308395 - }, - { - "end": 113312384, - "name": 
null, - "start": 113312129 - }, - { - "end": 113341823, - "name": null, - "start": 113341293 - } - ], - "is_best_transcript": false, - "name": "ENST00000302728", - "start": 113204759 - }, - { - "cdna_coding_end": 2944, - "cdna_coding_start": 407, - "domains": [ - { - "name": "PF02494", - "regions": [ - { - "end": 619, - "start": 538 - }, - { - "end": 698, - "start": 621 - } - ] - }, - { - "name": "SM00032", - "regions": [ - { - "end": 410, - "start": 355 - }, - { - "end": 470, - "start": 415 - }, - { - "end": 536, - "start": 475 - }, - { - "end": 764, - "start": 704 - } - ] - }, - { - "name": "SSF57535", - "regions": [ - { - "end": 410, - "start": 351 - }, - { - "end": 470, - "start": 411 - }, - { - "end": 537, - "start": 471 - }, - { - "end": 767, - "start": 704 - } - ] - }, - { - "name": "PF07699", - "regions": [ - { - "end": 337, - "start": 287 - } - ] - }, - { - "name": "PS50825", - "regions": [ - { - "end": 619, - "start": 537 - }, - { - "end": 701, - "start": 620 - } - ] - }, - { - "name": "PF00092", - "regions": [ - { - "end": 229, - "start": 61 - } - ] - }, - { - "name": "SSF57184", - "regions": [ - { - "end": 417, - "start": 246 - } - ] - }, - { - "name": "PS50923", - "regions": [ - { - "end": 412, - "start": 353 - }, - { - "end": 472, - "start": 413 - }, - { - "end": 538, - "start": 473 - }, - { - "end": 766, - "start": 702 - } - ] - }, - { - "name": "SM00327", - "regions": [ - { - "end": 237, - "start": 58 - } - ] - }, - { - "name": "PS50234", - "regions": [ - { - "end": 241, - "start": 60 - } - ] - }, - { - "name": "SSF53300", - "regions": [ - { - "end": 239, - "start": 56 - } - ] - }, - { - "name": "PF00084", - "regions": [ - { - "end": 407, - "start": 355 - }, - { - "end": 470, - "start": 415 - } - ] - } - ], - "end": 113342160, - "exons": [ - { - "end": 113238595, - "name": null, - "start": 113238163 - }, - { - "end": 113242036, - "name": null, - "start": 113241915 - }, - { - "end": 113243716, - "name": null, - "start": 113243522 - }, - { - "end": 
113244772, - "name": null, - "start": 113244641 - }, - { - "end": 113245973, - "name": null, - "start": 113245866 - }, - { - "end": 113252059, - "name": null, - "start": 113251930 - }, - { - "end": 113259213, - "name": null, - "start": 113259095 - }, - { - "end": 113261518, - "name": null, - "start": 113261321 - }, - { - "end": 113265497, - "name": null, - "start": 113265318 - }, - { - "end": 113275385, - "name": null, - "start": 113275206 - }, - { - "end": 113276386, - "name": null, - "start": 113276228 - }, - { - "end": 113308571, - "name": null, - "start": 113308395 - }, - { - "end": 113312384, - "name": null, - "start": 113312129 - }, - { - "end": 113342160, - "name": null, - "start": 113341293 - } - ], - "is_best_transcript": false, - "name": "ENST00000374461", - "start": 113238163 - } - ] - }, - { - "aliases": ["ARID1B"], - "chr": "6", - "strand": "+", - "name": "ENSG00000049618", - "end": 157530401, - "start": 157099063, - "transcripts": [{ - "name": "ENST00000346085", - "is_best_transcript": true, - "start": 157099063, - "end": 157529495, - "cdna_coding_end": 6751, - "cdna_coding_start": 2, - "exons": [ - {"start": 157099063, "end": 157100605}, - {"start": 157150361, "end": 157150555}, - {"start": 157192748, "end": 157192786}, - {"start": 157222510, "end": 157222659}, - {"start": 157256600, "end": 157256710}, - {"start": 157405796, "end": 157406039}, - {"start": 157431606, "end": 157431695}, - {"start": 157454162, "end": 157454341}, - {"start": 157469758, "end": 157470085}, - {"start": 157488174, "end": 157488319}, - {"start": 157495142, "end": 157495251}, - {"start": 157502103, "end": 157502312}, - {"start": 157505365, "end": 157505569}, - {"start": 157510776, "end": 157510914}, - {"start": 157511172, "end": 157511344}, - {"start": 157517299, "end": 157517449}, - {"start": 157519945, "end": 157520041}, - {"start": 157521839, "end": 157522622}, - {"start": 157525000, "end": 157525130}, - {"start": 157527301, "end": 157529495} - ], - "domains": [ - {"name": 
"PF12031", "regions": [{"start": 1939, "end": 2195}]}, - {"name": "PS50324", "regions": [{"start": 35, "end": 57}, {"start": 697, "end": 784}]}, - {"name": "PF01388", "regions": [{"start": 1065, "end": 1153}]}, - {"name": "PS50099", "regions": [{"start": 715, "end": 820}, {"start": 1472, "end": 1610}]}, - {"name": "SSF48371", "regions": [{"start": 2075, "end": 2220}]}, - {"name": "PS50316", "regions": [{"start": 81, "end": 104}]}, - {"name": "PS50322", "regions": [{"start": 107, "end": 131}, {"start": 574, "end": 646}]}, - {"name": "PS51011", "regions": [{"start": 1066, "end": 1157}]}, - {"name": "PS50310", "regions": [{"start": 2, "end": 47}, {"start": 329, "end": 493}]}, - {"name": "PS50315", "regions": [{"start": 141, "end": 401}]}, - {"name": "SSF46774", "regions": [{"start": 1049, "end": 1168}]}, - {"name": "SM00501", "regions": [{"start": 1067, "end": 1158}]} - ] - }] - } - ] -} +{"genes": [{"aliases": ["EGFR"], "chr": "7", "end": 55324313, "name": "ENSG00000146648", "start": 55086714, "strand": "+", "transcripts": [{"end": 55270769, "exons": [{"end": 55087058, "start": 55086714}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55238906, "start": 55238868}, {"end": 55240817, "start": 55240676}, {"end": 55241736, "start": 55241614}, {"end": 55242513, "start": 55242415}, {"end": 55249171, "start": 55248986}, {"end": 55259567, "start": 55259412}, {"end": 55260534, "start": 55260459}, {"end": 55266556, "start": 55266410}, {"end": 55268106, "start": 55268009}, {"end": 55269048, "start": 55268881}, {"end": 55269475, 
"start": 55269428}, {"end": 55270769, "start": 55270210}], "is_best_transcript": false, "name": "ENST00000455089", "start": 55086714, "translations": [{"cdna_coding_end": 3533, "cdna_coding_start": 258, "domains": [{"name": "PIRSF000619", "regions": [{"end": 1090, "start": 1}]}, {"name": "PF07714", "regions": [{"end": 920, "start": 669}]}, {"name": "SSF52058", "regions": [{"end": 191, "start": 28}, {"end": 475, "start": 283}]}, {"name": "PF00757", "regions": [{"end": 293, "start": 141}]}, {"name": "PS50011", "regions": [{"end": 934, "start": 667}]}, {"name": "PS50311", "regions": [{"end": 219, "start": 145}]}, {"name": "SSF57184", "regions": [{"end": 290, "start": 142}, {"end": 593, "start": 460}]}, {"name": "PR00109", "regions": [{"end": 758, "start": 745}, {"end": 800, "start": 782}, {"end": 841, "start": 831}, {"end": 872, "start": 850}, {"end": 916, "start": 894}]}, {"name": "SSF56112", "regions": [{"end": 975, "start": 651}]}, {"name": "PF01030", "regions": [{"end": 141, "start": 57}, {"end": 435, "start": 316}]}, {"name": "SM00220", "regions": [{"end": 924, "start": 667}]}, {"name": "SM00261", "regions": [{"end": 225, "start": 183}, {"end": 502, "start": 451}, {"end": 556, "start": 507}]}, {"name": "SM00219", "regions": [{"end": 923, "start": 667}]}, {"name": "PF00069", "regions": [{"end": 919, "start": 667}]}]}]}, {"end": 55236328, "exons": [{"end": 55087058, "start": 55086725}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55236328, "start": 55236216}], 
"is_best_transcript": false, "name": "ENST00000342916", "start": 55086725, "translations": [{"cdna_coding_end": 2133, "cdna_coding_start": 247, "domains": [{"name": "PS50311", "regions": [{"end": 264, "start": 187}]}, {"name": "SSF57184", "regions": [{"end": 339, "start": 182}, {"end": 624, "start": 505}]}, {"name": "SSF52058", "regions": [{"end": 211, "start": 29}, {"end": 520, "start": 328}]}, {"name": "PF00757", "regions": [{"end": 338, "start": 185}]}, {"name": "SM00261", "regions": [{"end": 270, "start": 228}, {"end": 547, "start": 496}, {"end": 601, "start": 552}]}, {"name": "PF01030", "regions": [{"end": 167, "start": 57}, {"end": 480, "start": 361}]}]}]}, {"end": 55238738, "exons": [{"end": 55087058, "start": 55086726}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55238738, "start": 55238000}], "is_best_transcript": false, "name": "ENST00000344576", "start": 55086726, "translations": [{"cdna_coding_end": 2363, "cdna_coding_start": 246, "domains": [{"name": "SSF57184", "regions": [{"end": 339, "start": 182}, {"end": 624, "start": 505}]}, {"name": "PS50311", "regions": [{"end": 264, "start": 187}]}, {"name": "PF00757", "regions": [{"end": 338, "start": 185}]}, {"name": "SSF52058", "regions": [{"end": 211, "start": 29}, {"end": 520, "start": 328}]}, {"name": "SM00261", "regions": [{"end": 270, "start": 228}, {"end": 547, "start": 496}, {"end": 601, "start": 552}]}, {"name": "PF01030", "regions": [{"end": 167, "start": 57}, {"end": 480, "start": 361}]}]}]}, {"end": 55224644, 
"exons": [{"end": 55087058, "start": 55086727}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224644, "start": 55224452}], "is_best_transcript": false, "name": "ENST00000420316", "start": 55086727, "translations": [{"cdna_coding_end": 1462, "cdna_coding_start": 245, "domains": [{"name": "SSF57184", "regions": [{"end": 339, "start": 182}]}, {"name": "PS50311", "regions": [{"end": 264, "start": 187}]}, {"name": "PF00757", "regions": [{"end": 338, "start": 185}]}, {"name": "SSF52058", "regions": [{"end": 211, "start": 29}, {"end": 403, "start": 328}]}, {"name": "SM00261", "regions": [{"end": 270, "start": 228}]}, {"name": "PF01030", "regions": [{"end": 167, "start": 57}]}]}]}, {"end": 55279321, "exons": [{"end": 55087058, "start": 55086794}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55238906, "start": 55238868}, {"end": 55240817, "start": 55240676}, {"end": 55241736, "start": 55241614}, {"end": 55242513, "start": 55242415}, {"end": 55249171, "start": 55248986}, {"end": 55259567, "start": 55259412}, {"end": 55260534, "start": 55260459}, {"end": 55266556, "start": 55266410}, {"end": 55268106, "start": 55268009}, {"end": 55269048, "start": 55268881}, {"end": 55269475, "start": 55269428}, {"end": 55270318, "start": 
55270210}, {"end": 55279321, "start": 55272949}], "is_best_transcript": true, "name": "ENST00000275493", "start": 55086794, "translations": [{"cdna_coding_end": 3810, "cdna_coding_start": 178, "domains": [{"name": "SM00220", "regions": [{"end": 969, "start": 712}]}, {"name": "PF01030", "regions": [{"end": 167, "start": 57}, {"end": 480, "start": 361}]}, {"name": "SSF56112", "regions": [{"end": 1020, "start": 696}]}, {"name": "PF00069", "regions": [{"end": 964, "start": 712}]}, {"name": "SM00219", "regions": [{"end": 968, "start": 712}]}, {"name": "SM00261", "regions": [{"end": 270, "start": 228}, {"end": 547, "start": 496}, {"end": 601, "start": 552}]}, {"name": "PF00757", "regions": [{"end": 338, "start": 185}]}, {"name": "SSF52058", "regions": [{"end": 211, "start": 29}, {"end": 520, "start": 328}]}, {"name": "PF07714", "regions": [{"end": 965, "start": 714}]}, {"name": "PIRSF000619", "regions": [{"end": 1210, "start": 1}]}, {"name": "PR00109", "regions": [{"end": 803, "start": 790}, {"end": 845, "start": 827}, {"end": 886, "start": 876}, {"end": 917, "start": 895}, {"end": 961, "start": 939}]}, {"name": "SSF57184", "regions": [{"end": 339, "start": 182}, {"end": 638, "start": 505}]}, {"name": "PS50311", "regions": [{"end": 264, "start": 187}]}, {"name": "PS50011", "regions": [{"end": 979, "start": 712}]}]}]}, {"end": 55324313, "exons": [{"end": 55087058, "start": 55086811}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55238906, "start": 55238868}, {"end": 55240621, "start": 
55240539}, {"end": 55324313, "start": 55323947}], "is_best_transcript": false, "name": "ENST00000442591", "start": 55086811, "translations": [{"cdna_coding_end": 2134, "cdna_coding_start": 161, "domains": [{"name": "PF01030", "regions": [{"end": 167, "start": 57}, {"end": 480, "start": 361}]}, {"name": "SM00261", "regions": [{"end": 270, "start": 228}, {"end": 547, "start": 496}, {"end": 601, "start": 552}, {"end": 653, "start": 614}]}, {"name": "SSF52058", "regions": [{"end": 211, "start": 29}, {"end": 520, "start": 328}]}, {"name": "PF00757", "regions": [{"end": 338, "start": 185}]}, {"name": "PS50311", "regions": [{"end": 264, "start": 187}]}, {"name": "SSF57184", "regions": [{"end": 339, "start": 182}, {"end": 638, "start": 505}]}]}]}, {"end": 55214417, "exons": [{"end": 55177651, "start": 55177416}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214417, "start": 55214299}], "is_best_transcript": false, "name": "ENST00000450046", "start": 55177416, "translations": [{"cdna_coding_end": 691, "cdna_coding_start": 308, "domains": [{"name": "SSF52058", "regions": [{"end": 127, "start": 1}]}, {"name": "PF01030", "regions": [{"end": 114, "start": 4}]}]}]}, {"end": 55273591, "exons": [{"end": 55177651, "start": 55177540}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55238906, "start": 55238868}, {"end": 55240817, "start": 55240676}, {"end": 55241736, "start": 55241614}, {"end": 55242513, "start": 55242415}, {"end": 55249171, "start": 55248986}, 
{"end": 55259567, "start": 55259412}, {"end": 55260534, "start": 55260459}, {"end": 55266556, "start": 55266410}, {"end": 55268106, "start": 55268009}, {"end": 55269048, "start": 55268881}, {"end": 55269475, "start": 55269428}, {"end": 55270318, "start": 55270210}, {"end": 55273591, "start": 55272949}], "is_best_transcript": false, "name": "ENST00000454757", "start": 55177540, "translations": [{"cdna_coding_end": 3657, "cdna_coding_start": 184, "domains": [{"name": "SM00261", "regions": [{"end": 217, "start": 175}, {"end": 494, "start": 443}, {"end": 548, "start": 499}]}, {"name": "PF00069", "regions": [{"end": 911, "start": 659}]}, {"name": "SM00219", "regions": [{"end": 915, "start": 659}]}, {"name": "SSF56112", "regions": [{"end": 967, "start": 643}]}, {"name": "SM00220", "regions": [{"end": 916, "start": 659}]}, {"name": "PF01030", "regions": [{"end": 114, "start": 4}, {"end": 427, "start": 308}]}, {"name": "PS50311", "regions": [{"end": 211, "start": 134}]}, {"name": "PS50011", "regions": [{"end": 926, "start": 659}]}, {"name": "PR00109", "regions": [{"end": 750, "start": 737}, {"end": 792, "start": 774}, {"end": 833, "start": 823}, {"end": 864, "start": 842}, {"end": 908, "start": 886}]}, {"name": "SSF57184", "regions": [{"end": 286, "start": 129}, {"end": 585, "start": 452}]}, {"name": "PIRSF000619", "regions": [{"end": 1157, "start": 1}]}, {"name": "PF07714", "regions": [{"end": 912, "start": 661}]}, {"name": "SSF52058", "regions": [{"end": 158, "start": 1}, {"end": 467, "start": 275}]}, {"name": "PF00757", "regions": [{"end": 285, "start": 132}]}]}]}]}, {"aliases": ["DSTYK"], "chr": "1", "end": 205180727, "name": "ENSG00000133059", "start": 205111632, "strand": "-", "transcripts": [{"end": 205180727, "exons": [{"end": 205116873, "start": 205111632}, {"end": 205117467, "start": 205117333}, {"end": 205119898, "start": 205119808}, {"end": 205133083, "start": 205133055}, {"end": 205138960, "start": 205138291}, {"end": 205156934, "start": 205156546}, {"end": 
205180727, "start": 205180399}], "is_best_transcript": false, "name": "ENST00000367160", "start": 205111632, "translations": [{"cdna_coding_end": 1831, "cdna_coding_start": 65, "domains": [{"name": "SM00220", "regions": [{"end": 565, "start": 337}]}, {"name": "SSF56112", "regions": [{"end": 585, "start": 452}]}, {"name": "PF00069", "regions": [{"end": 556, "start": 451}]}, {"name": "PF07714", "regions": [{"end": 558, "start": 471}]}, {"name": "PS50011", "regions": [{"end": 565, "start": 312}]}]}]}, {"end": 205180694, "exons": [{"end": 205116873, "start": 205111633}, {"end": 205119922, "start": 205119808}, {"end": 205126514, "start": 205126401}, {"end": 205128807, "start": 205128675}, {"end": 205129398, "start": 205129242}, {"end": 205130515, "start": 205130386}, {"end": 205131340, "start": 205131164}, {"end": 205132134, "start": 205132051}, {"end": 205133083, "start": 205132851}, {"end": 205138960, "start": 205138291}, {"end": 205156934, "start": 205156546}, {"end": 205180694, "start": 205180399}], "is_best_transcript": false, "name": "ENST00000367161", "start": 205111633, "translations": [{"cdna_coding_end": 2686, "cdna_coding_start": 32, "domains": [{"name": "PF07714", "regions": [{"end": 820, "start": 654}]}, {"name": "PS50011", "regions": [{"end": 884, "start": 652}]}, {"name": "SSF56112", "regions": [{"end": 853, "start": 627}]}, {"name": "SM00220", "regions": [{"end": 861, "start": 652}]}, {"name": "PF00069", "regions": [{"end": 824, "start": 654}]}, {"name": "SM00219", "regions": [{"end": 861, "start": 652}]}]}]}, {"end": 205180694, "exons": [{"end": 205116873, "start": 205111633}, {"end": 205117467, "start": 205117333}, {"end": 205119922, "start": 205119808}, {"end": 205126514, "start": 205126401}, {"end": 205128807, "start": 205128675}, {"end": 205129398, "start": 205129242}, {"end": 205130515, "start": 205130386}, {"end": 205131340, "start": 205131164}, {"end": 205132134, "start": 205132051}, {"end": 205133083, "start": 205132851}, {"end": 205138960, 
"start": 205138291}, {"end": 205156934, "start": 205156546}, {"end": 205180694, "start": 205180399}], "is_best_transcript": true, "name": "ENST00000367162", "start": 205111633, "translations": [{"cdna_coding_end": 2821, "cdna_coding_start": 32, "domains": [{"name": "PF07714", "regions": [{"end": 899, "start": 654}]}, {"name": "PS50011", "regions": [{"end": 906, "start": 652}]}, {"name": "SSF56112", "regions": [{"end": 897, "start": 638}]}, {"name": "SM00220", "regions": [{"end": 906, "start": 652}]}, {"name": "SM00219", "regions": [{"end": 906, "start": 652}]}, {"name": "PF00069", "regions": [{"end": 897, "start": 654}]}]}]}]}, {"aliases": ["NDUFA12"], "chr": "12", "end": 95397546, "name": "ENSG00000184752", "start": 95290831, "strand": "-", "transcripts": [{"end": 95397436, "exons": [{"end": 95291086, "start": 95290831}, {"end": 95318582, "start": 95318422}, {"end": 95322039, "start": 95321793}, {"end": 95396597, "start": 95396515}, {"end": 95397436, "start": 95397371}], "is_best_transcript": false, "name": "ENST00000552205", "start": 95290831}, {"end": 95397476, "exons": [{"end": 95365261, "start": 95365108}, {"end": 95396597, "start": 95396582}, {"end": 95397476, "start": 95397371}], "is_best_transcript": false, "name": "ENST00000547157", "start": 95365108, "translations": [{"cdna_coding_end": 188, "cdna_coding_start": 21}]}, {"end": 95397384, "exons": [{"end": 95365396, "start": 95365109}, {"end": 95388033, "start": 95387946}, {"end": 95390752, "start": 95390680}, {"end": 95396597, "start": 95396515}, {"end": 95397384, "start": 95397371}], "is_best_transcript": false, "name": "ENST00000551991", "start": 95365109, "translations": [{"cdna_coding_end": 144, "cdna_coding_start": 1, "domains": [{"name": "PF05071", "regions": [{"end": 33, "start": 12}]}]}]}, {"end": 95397546, "exons": [{"end": 95365396, "start": 95365109}, {"end": 95388033, "start": 95387946}, {"end": 95396597, "start": 95396515}, {"end": 95397546, "start": 95397371}], "is_best_transcript": true, 
"name": "ENST00000327772", "start": 95365109, "translations": [{"cdna_coding_end": 528, "cdna_coding_start": 91, "domains": [{"name": "PF05071", "regions": [{"end": 137, "start": 36}]}]}]}, {"end": 95397489, "exons": [{"end": 95365396, "start": 95365112}, {"end": 95396597, "start": 95396515}, {"end": 95397489, "start": 95397371}], "is_best_transcript": false, "name": "ENST00000547986", "start": 95365112, "translations": [{"cdna_coding_end": 225, "cdna_coding_start": 34, "domains": [{"name": "PF05071", "regions": [{"end": 53, "start": 36}]}]}]}, {"end": 95397524, "exons": [{"end": 95365396, "start": 95365254}, {"end": 95366265, "start": 95366171}, {"end": 95388033, "start": 95387946}, {"end": 95396597, "start": 95396515}, {"end": 95397524, "start": 95397371}], "is_best_transcript": false, "name": "ENST00000546788", "start": 95365254, "translations": [{"cdna_coding_end": 368, "cdna_coding_start": 69, "domains": [{"name": "PF05071", "regions": [{"end": 87, "start": 36}]}]}]}]}, {"aliases": ["FRMD6"], "chr": "14", "end": 52197445, "name": "ENSG00000139926", "start": 51955818, "strand": "+", "transcripts": [{"end": 52197177, "exons": [{"end": 51956138, "start": 51955855}, {"end": 52037128, "start": 52037066}, {"end": 52156653, "start": 52156409}, {"end": 52164950, "start": 52164860}, {"end": 52167853, "start": 52167774}, {"end": 52169306, "start": 52169230}, {"end": 52171653, "start": 52171467}, {"end": 52174951, "start": 52174796}, {"end": 52178314, "start": 52178249}, {"end": 52179269, "start": 52179201}, {"end": 52182217, "start": 52182043}, {"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188667}, {"end": 52192588, "start": 52192497}, {"end": 52197177, "start": 52194463}], "is_best_transcript": false, "name": "ENST00000356218", "start": 51955855, "translations": [{"cdna_coding_end": 2338, "cdna_coding_start": 494, "domains": [{"name": "PF09379", "regions": [{"end": 109, "start": 20}]}, {"name": "PF09380", "regions": [{"end": 322, "start": 237}]}, 
{"name": "SSF50729", "regions": [{"end": 375, "start": 219}]}, {"name": "SM00295", "regions": [{"end": 226, "start": 12}]}, {"name": "PS50057", "regions": [{"end": 320, "start": 16}]}, {"name": "PF00373", "regions": [{"end": 226, "start": 115}]}, {"name": "SSF47031", "regions": [{"end": 218, "start": 110}]}, {"name": "SSF54236", "regions": [{"end": 110, "start": 14}]}]}]}, {"end": 52197445, "exons": [{"end": 52118714, "start": 52118576}, {"end": 52156653, "start": 52156409}, {"end": 52164950, "start": 52164860}, {"end": 52167853, "start": 52167774}, {"end": 52169306, "start": 52169230}, {"end": 52171653, "start": 52171467}, {"end": 52174951, "start": 52174796}, {"end": 52178314, "start": 52178249}, {"end": 52179269, "start": 52179201}, {"end": 52182217, "start": 52182043}, {"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188667}, {"end": 52192588, "start": 52192497}, {"end": 52197445, "start": 52194463}], "is_best_transcript": true, "name": "ENST00000395718", "start": 52118576, "translations": [{"cdna_coding_end": 2130, "cdna_coding_start": 286, "domains": [{"name": "PF00373", "regions": [{"end": 226, "start": 115}]}, {"name": "SSF47031", "regions": [{"end": 218, "start": 110}]}, {"name": "SSF54236", "regions": [{"end": 110, "start": 14}]}, {"name": "PS50057", "regions": [{"end": 320, "start": 16}]}, {"name": "SM00295", "regions": [{"end": 226, "start": 12}]}, {"name": "SSF50729", "regions": [{"end": 375, "start": 219}]}, {"name": "PF09380", "regions": [{"end": 322, "start": 237}]}, {"name": "PF09379", "regions": [{"end": 109, "start": 20}]}]}]}, {"end": 52195654, "exons": [{"end": 52118714, "start": 52118665}, {"end": 52156653, "start": 52156409}, {"end": 52164950, "start": 52164860}, {"end": 52167877, "start": 52167774}, {"end": 52169306, "start": 52169230}, {"end": 52171653, "start": 52171467}, {"end": 52174951, "start": 52174796}, {"end": 52178314, "start": 52178249}, {"end": 52179269, "start": 52179201}, {"end": 52182217, "start": 52182043}, 
{"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188667}, {"end": 52192588, "start": 52192497}, {"end": 52195654, "start": 52194463}], "is_best_transcript": false, "name": "ENST00000344768", "start": 52118665, "translations": [{"cdna_coding_end": 2065, "cdna_coding_start": 197, "domains": [{"name": "PF09380", "regions": [{"end": 330, "start": 245}]}, {"name": "PF09379", "regions": [{"end": 117, "start": 20}]}, {"name": "SSF47031", "regions": [{"end": 226, "start": 118}]}, {"name": "PF00373", "regions": [{"end": 234, "start": 123}]}, {"name": "SSF54236", "regions": [{"end": 118, "start": 14}]}, {"name": "PS50057", "regions": [{"end": 328, "start": 16}]}, {"name": "SM00295", "regions": [{"end": 234, "start": 12}]}, {"name": "SSF50729", "regions": [{"end": 383, "start": 227}]}]}]}, {"end": 52164945, "exons": [{"end": 52118935, "start": 52118698}, {"end": 52156653, "start": 52156409}, {"end": 52164945, "start": 52164860}], "is_best_transcript": false, "name": "ENST00000554778", "start": 52118698}, {"end": 52174806, "exons": [{"end": 52164950, "start": 52164706}, {"end": 52167877, "start": 52167774}, {"end": 52169306, "start": 52169230}, {"end": 52171653, "start": 52171467}, {"end": 52174806, "start": 52174796}], "is_best_transcript": false, "name": "ENST00000555936", "start": 52164706}, {"end": 52197148, "exons": [{"end": 52164950, "start": 52164831}, {"end": 52167853, "start": 52167774}, {"end": 52169306, "start": 52169230}, {"end": 52171653, "start": 52171467}, {"end": 52174951, "start": 52174796}, {"end": 52178314, "start": 52178249}, {"end": 52179269, "start": 52179201}, {"end": 52182217, "start": 52182043}, {"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188667}, {"end": 52192588, "start": 52192497}, {"end": 52197148, "start": 52194463}], "is_best_transcript": false, "name": "ENST00000554167", "start": 52164831, "translations": [{"cdna_coding_end": 1775, "cdna_coding_start": 138, "domains": [{"name": "SSF50729", "regions": 
[{"end": 306, "start": 150}]}, {"name": "PS50057", "regions": [{"end": 251, "start": 1}]}, {"name": "SSF54236", "regions": [{"end": 41, "start": 1}]}, {"name": "SSF47031", "regions": [{"end": 149, "start": 41}]}, {"name": "PF00373", "regions": [{"end": 157, "start": 46}]}, {"name": "PF09380", "regions": [{"end": 253, "start": 168}]}]}]}, {"end": 52175062, "exons": [{"end": 52169306, "start": 52169266}, {"end": 52171653, "start": 52171467}, {"end": 52175062, "start": 52174796}], "is_best_transcript": false, "name": "ENST00000557405", "start": 52169266, "translations": [{"cdna_coding_end": 390, "cdna_coding_start": 1, "domains": [{"name": "PS50057", "regions": [{"end": 129, "start": 1}]}, {"name": "PF00373", "regions": [{"end": 124, "start": 13}]}, {"name": "SSF47031", "regions": [{"end": 116, "start": 8}]}]}]}, {"end": 52187243, "exons": [{"end": 52179269, "start": 52179231}, {"end": 52182217, "start": 52182043}, {"end": 52187243, "start": 52186773}], "is_best_transcript": false, "name": "ENST00000555197", "start": 52179231, "translations": [{"cdna_coding_end": 618, "cdna_coding_start": 1, "domains": [{"name": "PF09380", "regions": [{"end": 60, "start": 2}]}, {"name": "PS50057", "regions": [{"end": 58, "start": 1}]}, {"name": "SSF50729", "regions": [{"end": 113, "start": 2}]}]}]}, {"end": 52192513, "exons": [{"end": 52184066, "start": 52183973}, {"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188673}, {"end": 52192513, "start": 52192497}], "is_best_transcript": false, "name": "ENST00000555703", "start": 52183973, "translations": [{"cdna_coding_end": 573, "cdna_coding_start": 145}]}, {"end": 52195487, "exons": [{"end": 52184066, "start": 52183973}, {"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188667}, {"end": 52192588, "start": 52192497}, {"end": 52195487, "start": 52194463}], "is_best_transcript": false, "name": "ENST00000553556", "start": 52183973, "translations": [{"cdna_coding_end": 939, "cdna_coding_start": 145}]}]}, 
{"aliases": ["PRKCB"], "chr": "16", "end": 24231932, "name": "ENSG00000166501", "start": 23847322, "strand": "+", "transcripts": [{"end": 24231932, "exons": [{"end": 23847669, "start": 23847322}, {"end": 23848727, "start": 23848696}, {"end": 23999911, "start": 23999829}, {"end": 24043568, "start": 24043457}, {"end": 24046868, "start": 24046740}, {"end": 24104268, "start": 24104112}, {"end": 24105618, "start": 24105484}, {"end": 24124390, "start": 24124294}, {"end": 24135302, "start": 24135156}, {"end": 24166178, "start": 24166005}, {"end": 24183682, "start": 24183591}, {"end": 24185901, "start": 24185839}, {"end": 24192249, "start": 24192111}, {"end": 24196512, "start": 24196432}, {"end": 24196888, "start": 24196781}, {"end": 24202551, "start": 24202411}, {"end": 24231932, "start": 24231282}], "is_best_transcript": true, "name": "ENST00000321728", "start": 23847322, "translations": [{"cdna_coding_end": 2191, "cdna_coding_start": 176, "domains": [{"name": "SM00239", "regions": [{"end": 275, "start": 172}]}, {"name": "PF07714", "regions": [{"end": 583, "start": 344}]}, {"name": "SSF49562", "regions": [{"end": 288, "start": 157}]}, {"name": "SM00109", "regions": [{"end": 86, "start": 37}, {"end": 151, "start": 102}]}, {"name": "PS50011", "regions": [{"end": 600, "start": 342}]}, {"name": "PR00008", "regions": [{"end": 48, "start": 34}, {"end": 59, "start": 50}, {"end": 74, "start": 63}, {"end": 152, "start": 140}]}, {"name": "PF00433", "regions": [{"end": 666, "start": 623}]}, {"name": "SM00220", "regions": [{"end": 600, "start": 342}]}, {"name": "PF00168", "regions": [{"end": 259, "start": 175}]}, {"name": "SSF57889", "regions": [{"end": 92, "start": 6}, {"end": 157, "start": 101}]}, {"name": "PF00130", "regions": [{"end": 87, "start": 37}, {"end": 153, "start": 102}]}, {"name": "PS50081", "regions": [{"end": 86, "start": 36}, {"end": 151, "start": 101}]}, {"name": "SSF56112", "regions": [{"end": 627, "start": 317}]}, {"name": "PF00069", "regions": [{"end": 586, 
"start": 343}]}, {"name": "SM00219", "regions": [{"end": 576, "start": 342}]}, {"name": "PR00360", "regions": [{"end": 200, "start": 188}, {"end": 230, "start": 217}, {"end": 248, "start": 240}]}, {"name": "SM00133", "regions": [{"end": 664, "start": 601}]}, {"name": "PS50004", "regions": [{"end": 260, "start": 173}]}, {"name": "PIRSF000550", "regions": [{"end": 671, "start": 1}]}]}]}, {"end": 24231932, "exons": [{"end": 23847669, "start": 23847345}, {"end": 23848727, "start": 23848696}, {"end": 23999911, "start": 23999829}, {"end": 24043568, "start": 24043457}, {"end": 24046868, "start": 24046740}, {"end": 24104268, "start": 24104112}, {"end": 24105618, "start": 24105484}, {"end": 24124390, "start": 24124294}, {"end": 24135302, "start": 24135156}, {"end": 24166178, "start": 24166005}, {"end": 24183682, "start": 24183591}, {"end": 24185901, "start": 24185839}, {"end": 24192249, "start": 24192111}, {"end": 24196512, "start": 24196432}, {"end": 24196888, "start": 24196781}, {"end": 24202551, "start": 24202411}, {"end": 24231932, "start": 24225979}], "is_best_transcript": false, "name": "ENST00000303531", "start": 23847345, "translations": [{"cdna_coding_end": 2174, "cdna_coding_start": 153, "domains": [{"name": "SM00133", "regions": [{"end": 663, "start": 601}]}, {"name": "PS50004", "regions": [{"end": 260, "start": 173}]}, {"name": "PIRSF000550", "regions": [{"end": 672, "start": 1}]}, {"name": "PF00069", "regions": [{"end": 586, "start": 343}]}, {"name": "PR00360", "regions": [{"end": 200, "start": 188}, {"end": 230, "start": 217}, {"end": 248, "start": 240}]}, {"name": "SM00219", "regions": [{"end": 576, "start": 342}]}, {"name": "PS50081", "regions": [{"end": 86, "start": 36}, {"end": 151, "start": 101}]}, {"name": "SSF56112", "regions": [{"end": 627, "start": 317}]}, {"name": "SM00220", "regions": [{"end": 600, "start": 342}]}, {"name": "PF00433", "regions": [{"end": 664, "start": 627}]}, {"name": "PF00130", "regions": [{"end": 87, "start": 37}, {"end": 153, 
"start": 102}]}, {"name": "PF00168", "regions": [{"end": 259, "start": 175}]}, {"name": "SSF57889", "regions": [{"end": 92, "start": 6}, {"end": 157, "start": 101}]}, {"name": "PR00008", "regions": [{"end": 48, "start": 34}, {"end": 59, "start": 50}, {"end": 74, "start": 63}, {"end": 152, "start": 140}]}, {"name": "PS50011", "regions": [{"end": 600, "start": 342}]}, {"name": "SM00109", "regions": [{"end": 86, "start": 37}, {"end": 151, "start": 102}]}, {"name": "PF07714", "regions": [{"end": 583, "start": 344}]}, {"name": "SSF49562", "regions": [{"end": 288, "start": 157}]}, {"name": "SM00239", "regions": [{"end": 275, "start": 172}]}]}]}, {"end": 23880647, "exons": [{"end": 23847669, "start": 23847403}, {"end": 23880647, "start": 23880435}], "is_best_transcript": false, "name": "ENST00000498058", "start": 23847403, "translations": [{"cdna_coding_end": 268, "cdna_coding_start": 95, "domains": [{"name": "PR00008", "regions": [{"end": 48, "start": 34}, {"end": 57, "start": 50}]}, {"name": "PS50081", "regions": [{"end": 57, "start": 36}]}, {"name": "SSF57889", "regions": [{"end": 57, "start": 6}]}]}]}, {"end": 24124386, "exons": [{"end": 23848727, "start": 23848544}, {"end": 24104268, "start": 24104112}, {"end": 24105618, "start": 24105484}, {"end": 24124386, "start": 24124294}], "is_best_transcript": false, "name": "ENST00000498739", "start": 23848544}, {"end": 24192166, "exons": [{"end": 24163176, "start": 24163006}, {"end": 24166178, "start": 24166005}, {"end": 24183682, "start": 24183591}, {"end": 24185901, "start": 24185839}, {"end": 24192166, "start": 24192111}], "is_best_transcript": false, "name": "ENST00000472066", "start": 24163006}, {"end": 24202909, "exons": [{"end": 24196888, "start": 24196852}, {"end": 24202909, "start": 24202411}], "is_best_transcript": false, "name": "ENST00000466124", "start": 24196852}]}, {"aliases": ["GIMAP4"], "chr": "7", "end": 150271041, "name": "ENSG00000133574", "start": 150264365, "strand": "+", "transcripts": [{"end": 
150271041, "exons": [{"end": 150264525, "start": 150264365}, {"end": 150267047, "start": 150266976}, {"end": 150271041, "start": 150269217}], "is_best_transcript": true, "name": "ENST00000255945", "start": 150264365, "translations": [{"cdna_coding_end": 1165, "cdna_coding_start": 176, "domains": [{"name": "PF04548", "regions": [{"end": 238, "start": 31}]}, {"name": "SSF52540", "regions": [{"end": 288, "start": 24}]}]}]}, {"end": 150270602, "exons": [{"end": 150264525, "start": 150264457}, {"end": 150267089, "start": 150266976}, {"end": 150270602, "start": 150269217}], "is_best_transcript": false, "name": "ENST00000461940", "start": 150264457, "translations": [{"cdna_coding_end": 1115, "cdna_coding_start": 84, "domains": [{"name": "PF04548", "regions": [{"end": 252, "start": 45}]}, {"name": "SSF52540", "regions": [{"end": 302, "start": 38}]}]}]}, {"end": 150269569, "exons": [{"end": 150264608, "start": 150264524}, {"end": 150267089, "start": 150266976}, {"end": 150269569, "start": 150269217}], "is_best_transcript": false, "name": "ENST00000479232", "start": 150264524, "translations": [{"cdna_coding_end": 552, "cdna_coding_start": 100, "domains": [{"name": "SSF52540", "regions": [{"end": 151, "start": 38}]}, {"name": "PF04548", "regions": [{"end": 151, "start": 45}]}]}]}]}, {"aliases": ["IL7"], "chr": "8", "end": 79717758, "name": "ENSG00000104432", "start": 79587978, "strand": "-", "transcripts": [{"end": 79717758, "exons": [{"end": 79646067, "start": 79645007}, {"end": 79648762, "start": 79648709}, {"end": 79650870, "start": 79650739}, {"end": 79652317, "start": 79652237}, {"end": 79710443, "start": 79710307}, {"end": 79717758, "start": 79717148}], "is_best_transcript": true, "name": "ENST00000263851", "start": 79645007, "translations": [{"cdna_coding_end": 1135, "cdna_coding_start": 602, "domains": [{"name": "PIRSF001942", "regions": [{"end": 177, "start": 1}]}, {"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 48, "start": 26}, {"end": 77, "start": 
57}, {"end": 98, "start": 78}, {"end": 118, "start": 99}, {"end": 173, "start": 151}]}, {"name": "PF01415", "regions": [{"end": 173, "start": 28}]}, {"name": "SM00127", "regions": [{"end": 173, "start": 27}]}]}]}, {"end": 79717699, "exons": [{"end": 79646063, "start": 79645283}, {"end": 79648762, "start": 79648709}, {"end": 79650870, "start": 79650739}, {"end": 79652317, "start": 79652237}, {"end": 79659331, "start": 79659129}, {"end": 79710443, "start": 79710307}, {"end": 79717699, "start": 79717148}], "is_best_transcript": false, "name": "ENST00000518982", "start": 79645283, "translations": [{"cdna_coding_end": 758, "cdna_coding_start": 543, "domains": [{"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 48, "start": 26}]}, {"name": "PF01415", "regions": [{"end": 54, "start": 28}]}]}]}, {"end": 79717163, "exons": [{"end": 79646067, "start": 79645900}, {"end": 79648762, "start": 79648709}, {"end": 79652317, "start": 79652237}, {"end": 79710443, "start": 79710307}, {"end": 79717163, "start": 79717148}], "is_best_transcript": false, "name": "ENST00000520269", "start": 79645900, "translations": [{"cdna_coding_end": 408, "cdna_coding_start": 7, "domains": [{"name": "PF01415", "regions": [{"end": 77, "start": 28}, {"end": 129, "start": 91}]}, {"name": "SM00127", "regions": [{"end": 129, "start": 27}]}, {"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 48, "start": 26}, {"end": 77, "start": 57}]}, {"name": "PIRSF001942", "regions": [{"end": 133, "start": 1}]}]}]}, {"end": 79717163, "exons": [{"end": 79646067, "start": 79645900}, {"end": 79648762, "start": 79648709}, {"end": 79652317, "start": 79652237}, {"end": 79710443, "start": 79710363}, {"end": 79717163, "start": 79717148}], "is_best_transcript": false, "name": "ENST00000520215", "start": 79645900, "translations": [{"cdna_coding_end": 120, "cdna_coding_start": 7, "domains": [{"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 37, "start": 26}]}]}]}, {"end": 79717686, "exons": 
[{"end": 79646067, "start": 79645900}, {"end": 79648762, "start": 79648709}, {"end": 79650870, "start": 79650739}, {"end": 79652317, "start": 79652237}, {"end": 79710443, "start": 79710363}, {"end": 79717686, "start": 79717148}], "is_best_transcript": false, "name": "ENST00000520317", "start": 79645900, "translations": [{"cdna_coding_end": 643, "cdna_coding_start": 530, "domains": [{"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 37, "start": 26}]}]}]}, {"end": 79652311, "exons": [{"end": 79646067, "start": 79645948}, {"end": 79652311, "start": 79652237}], "is_best_transcript": false, "name": "ENST00000541183", "start": 79645948, "translations": [{"cdna_coding_end": 195, "cdna_coding_start": 1, "domains": [{"name": "SM00127", "regions": [{"end": 60, "start": 1}]}, {"name": "PF01415", "regions": [{"end": 60, "start": 1}]}]}]}, {"end": 79717758, "exons": [{"end": 79659331, "start": 79659263}, {"end": 79710443, "start": 79710307}, {"end": 79717758, "start": 79717148}], "is_best_transcript": false, "name": "ENST00000379113", "start": 79659263, "translations": [{"cdna_coding_end": 817, "cdna_coding_start": 602, "domains": [{"name": "PF01415", "regions": [{"end": 54, "start": 28}]}, {"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 48, "start": 26}]}]}]}]}, {"aliases": ["SVEP1"], "chr": "9", "end": 113342160, "name": "ENSG00000165124", "start": 113127531, "strand": "-", "transcripts": [{"end": 113342160, "exons": [{"end": 113128840, "start": 113127531}, {"end": 113132296, "start": 113132203}, {"end": 113137743, "start": 113137648}, {"end": 113139646, "start": 113139551}, {"end": 113141797, "start": 113141627}, {"end": 113148354, "start": 113148178}, {"end": 113149738, "start": 113149565}, {"end": 113151867, "start": 113151804}, {"end": 113163289, "start": 113163134}, {"end": 113166832, "start": 113166607}, {"end": 113171231, "start": 113168440}, {"end": 113174015, "start": 113173343}, {"end": 113190038, "start": 113189871}, {"end": 
113191614, "start": 113191423}, {"end": 113192284, "start": 113192200}, {"end": 113192730, "start": 113192554}, {"end": 113194314, "start": 113194195}, {"end": 113194915, "start": 113194742}, {"end": 113196786, "start": 113196616}, {"end": 113197644, "start": 113197521}, {"end": 113198784, "start": 113198660}, {"end": 113206000, "start": 113205825}, {"end": 113208318, "start": 113208117}, {"end": 113209337, "start": 113209180}, {"end": 113212540, "start": 113212339}, {"end": 113213682, "start": 113213569}, {"end": 113217983, "start": 113217870}, {"end": 113219632, "start": 113219536}, {"end": 113220842, "start": 113220751}, {"end": 113221393, "start": 113221232}, {"end": 113228306, "start": 113228145}, {"end": 113231381, "start": 113231220}, {"end": 113233877, "start": 113233644}, {"end": 113234603, "start": 113234439}, {"end": 113238595, "start": 113238484}, {"end": 113242036, "start": 113241915}, {"end": 113243716, "start": 113243522}, {"end": 113244772, "start": 113244641}, {"end": 113245973, "start": 113245866}, {"end": 113252059, "start": 113251930}, {"end": 113259213, "start": 113259095}, {"end": 113261518, "start": 113261321}, {"end": 113265497, "start": 113265318}, {"end": 113275385, "start": 113275206}, {"end": 113276386, "start": 113276228}, {"end": 113308571, "start": 113308395}, {"end": 113312384, "start": 113312129}, {"end": 113342160, "start": 113341293}], "is_best_transcript": true, "name": "ENST00000401783", "start": 113127531, "translations": [{"cdna_coding_end": 11053, "cdna_coding_start": 338, "domains": [{"name": "SM00032", "regions": [{"end": 433, "start": 378}, {"end": 493, "start": 438}, {"end": 559, "start": 498}, {"end": 787, "start": 727}, {"end": 1685, "start": 1631}, {"end": 1743, "start": 1690}, {"end": 1842, "start": 1789}, {"end": 1900, "start": 1847}, {"end": 1958, "start": 1905}, {"end": 2016, "start": 1963}, {"end": 2078, "start": 2021}, {"end": 2141, "start": 2083}, {"end": 2199, "start": 2146}, {"end": 2259, "start": 2204}, 
{"end": 2318, "start": 2264}, {"end": 2376, "start": 2323}, {"end": 2435, "start": 2381}, {"end": 2493, "start": 2440}, {"end": 2551, "start": 2498}, {"end": 2608, "start": 2556}, {"end": 2712, "start": 2654}, {"end": 2770, "start": 2717}, {"end": 2828, "start": 2775}, {"end": 2886, "start": 2833}, {"end": 2944, "start": 2891}, {"end": 3002, "start": 2949}, {"end": 3059, "start": 3007}, {"end": 3117, "start": 3064}, {"end": 3176, "start": 3122}, {"end": 3236, "start": 3181}, {"end": 3294, "start": 3241}, {"end": 3352, "start": 3299}, {"end": 3411, "start": 3357}, {"end": 3468, "start": 3416}]}, {"name": "PF02494", "regions": [{"end": 642, "start": 561}, {"end": 721, "start": 644}]}, {"name": "PR00895", "regions": [{"end": 1530, "start": 1512}, {"end": 1558, "start": 1539}, {"end": 1592, "start": 1559}]}, {"name": "SSF57535", "regions": [{"end": 433, "start": 374}, {"end": 493, "start": 434}, {"end": 560, "start": 494}, {"end": 790, "start": 727}, {"end": 1746, "start": 1626}, {"end": 1842, "start": 1785}, {"end": 1900, "start": 1843}, {"end": 1958, "start": 1901}, {"end": 2016, "start": 1959}, {"end": 2078, "start": 2017}, {"end": 2199, "start": 2081}, {"end": 2318, "start": 2202}, {"end": 2377, "start": 2321}, {"end": 2437, "start": 2379}, {"end": 2551, "start": 2438}, {"end": 2616, "start": 2552}, {"end": 2712, "start": 2643}, {"end": 2828, "start": 2715}, {"end": 2886, "start": 2829}, {"end": 2944, "start": 2887}, {"end": 3117, "start": 2945}, {"end": 3176, "start": 3118}, {"end": 3229, "start": 3177}, {"end": 3475, "start": 3239}]}, {"name": "SSF49899", "regions": [{"end": 1632, "start": 1421}]}, {"name": "SM00159", "regions": [{"end": 1627, "start": 1420}]}, {"name": "PF00354", "regions": [{"end": 1620, "start": 1442}]}, {"name": "PF07699", "regions": [{"end": 360, "start": 310}, {"end": 1052, "start": 1005}, {"end": 1106, "start": 1059}, {"end": 1160, "start": 1113}]}, {"name": "PS50311", "regions": [{"end": 1409, "start": 1197}, {"end": 3554, "start": 
3468}]}, {"name": "PS50825", "regions": [{"end": 642, "start": 560}, {"end": 724, "start": 643}]}, {"name": "PF00092", "regions": [{"end": 252, "start": 84}]}, {"name": "SSF57196", "regions": [{"end": 1267, "start": 1189}, {"end": 1305, "start": 1268}, {"end": 1342, "start": 1306}, {"end": 1423, "start": 1344}, {"end": 1786, "start": 1735}, {"end": 3506, "start": 3463}, {"end": 3535, "start": 3507}, {"end": 3570, "start": 3537}]}, {"name": "PS50026", "regions": [{"end": 1229, "start": 1193}, {"end": 1267, "start": 1231}, {"end": 1305, "start": 1269}, {"end": 1343, "start": 1307}, {"end": 1381, "start": 1345}, {"end": 1419, "start": 1383}, {"end": 1784, "start": 1745}, {"end": 3532, "start": 3500}, {"end": 3564, "start": 3533}]}, {"name": "SM00181", "regions": [{"end": 1229, "start": 1196}, {"end": 1267, "start": 1234}, {"end": 1305, "start": 1272}, {"end": 1343, "start": 1310}, {"end": 1381, "start": 1348}, {"end": 1419, "start": 1386}, {"end": 1784, "start": 1748}, {"end": 3500, "start": 3471}, {"end": 3532, "start": 3503}, {"end": 3564, "start": 3535}]}, {"name": "SM00179", "regions": [{"end": 1229, "start": 1196}, {"end": 1267, "start": 1231}, {"end": 1305, "start": 1269}, {"end": 1343, "start": 1307}, {"end": 1381, "start": 1345}, {"end": 1419, "start": 1383}, {"end": 1784, "start": 1745}, {"end": 3532, "start": 3504}]}, {"name": "SSF57184", "regions": [{"end": 440, "start": 269}, {"end": 1144, "start": 988}]}, {"name": "PF07645", "regions": [{"end": 1783, "start": 1745}]}, {"name": "PS50923", "regions": [{"end": 435, "start": 376}, {"end": 495, "start": 436}, {"end": 561, "start": 496}, {"end": 789, "start": 725}, {"end": 1687, "start": 1629}, {"end": 1745, "start": 1688}, {"end": 1844, "start": 1787}, {"end": 1902, "start": 1845}, {"end": 1960, "start": 1903}, {"end": 2018, "start": 1961}, {"end": 2080, "start": 2019}, {"end": 2143, "start": 2081}, {"end": 2201, "start": 2144}, {"end": 2261, "start": 2202}, {"end": 2320, "start": 2262}, {"end": 2378, "start": 
2321}, {"end": 2437, "start": 2379}, {"end": 2495, "start": 2438}, {"end": 2553, "start": 2496}, {"end": 2610, "start": 2554}, {"end": 2714, "start": 2663}, {"end": 2772, "start": 2715}, {"end": 2830, "start": 2773}, {"end": 2888, "start": 2831}, {"end": 2946, "start": 2889}, {"end": 3004, "start": 2947}, {"end": 3061, "start": 3005}, {"end": 3119, "start": 3062}, {"end": 3178, "start": 3120}, {"end": 3238, "start": 3179}, {"end": 3296, "start": 3239}, {"end": 3354, "start": 3297}, {"end": 3413, "start": 3355}, {"end": 3470, "start": 3414}]}, {"name": "SM00327", "regions": [{"end": 260, "start": 81}]}, {"name": "PF00008", "regions": [{"end": 1226, "start": 1197}, {"end": 1265, "start": 1235}, {"end": 1302, "start": 1273}, {"end": 1379, "start": 1349}, {"end": 1417, "start": 1387}]}, {"name": "PS50234", "regions": [{"end": 264, "start": 83}]}, {"name": "PF07974", "regions": [{"end": 1266, "start": 1235}, {"end": 3499, "start": 3475}, {"end": 3531, "start": 3507}, {"end": 3563, "start": 3536}]}, {"name": "SSF53300", "regions": [{"end": 262, "start": 79}]}, {"name": "PF00084", "regions": [{"end": 430, "start": 378}, {"end": 493, "start": 438}, {"end": 1685, "start": 1628}, {"end": 1743, "start": 1690}, {"end": 1842, "start": 1789}, {"end": 1900, "start": 1847}, {"end": 1958, "start": 1905}, {"end": 2016, "start": 1963}, {"end": 2078, "start": 2021}, {"end": 2136, "start": 2083}, {"end": 2199, "start": 2146}, {"end": 2259, "start": 2204}, {"end": 2318, "start": 2264}, {"end": 2376, "start": 2323}, {"end": 2435, "start": 2381}, {"end": 2493, "start": 2440}, {"end": 2551, "start": 2498}, {"end": 2608, "start": 2556}, {"end": 2712, "start": 2667}, {"end": 2770, "start": 2717}, {"end": 2828, "start": 2775}, {"end": 2886, "start": 2833}, {"end": 2944, "start": 2891}, {"end": 3002, "start": 2949}, {"end": 3059, "start": 3007}, {"end": 3117, "start": 3084}, {"end": 3172, "start": 3122}, {"end": 3236, "start": 3181}, {"end": 3290, "start": 3241}, {"end": 3352, "start": 3299}, 
{"end": 3411, "start": 3357}, {"end": 3468, "start": 3416}]}]}]}, {"end": 113190038, "exons": [{"end": 113128840, "start": 113127536}, {"end": 113132296, "start": 113132203}, {"end": 113137743, "start": 113137648}, {"end": 113139646, "start": 113139551}, {"end": 113141797, "start": 113141627}, {"end": 113148354, "start": 113148178}, {"end": 113149738, "start": 113149565}, {"end": 113151867, "start": 113151804}, {"end": 113163289, "start": 113163134}, {"end": 113166832, "start": 113166607}, {"end": 113171231, "start": 113168440}, {"end": 113174015, "start": 113173343}, {"end": 113190038, "start": 113189871}], "is_best_transcript": false, "name": "ENST00000297826", "start": 113127536, "translations": [{"cdna_coding_end": 4909, "cdna_coding_start": 416, "domains": [{"name": "PF00084", "regions": [{"end": 62, "start": 9}, {"end": 125, "start": 72}, {"end": 185, "start": 130}, {"end": 244, "start": 190}, {"end": 302, "start": 249}, {"end": 361, "start": 307}, {"end": 419, "start": 366}, {"end": 477, "start": 424}, {"end": 534, "start": 482}, {"end": 638, "start": 593}, {"end": 696, "start": 643}, {"end": 754, "start": 701}, {"end": 812, "start": 759}, {"end": 870, "start": 817}, {"end": 928, "start": 875}, {"end": 985, "start": 933}, {"end": 1043, "start": 1010}, {"end": 1098, "start": 1048}, {"end": 1162, "start": 1107}, {"end": 1216, "start": 1167}, {"end": 1278, "start": 1225}, {"end": 1337, "start": 1283}, {"end": 1394, "start": 1342}]}, {"name": "PF07974", "regions": [{"end": 1425, "start": 1401}, {"end": 1457, "start": 1433}, {"end": 1489, "start": 1462}]}, {"name": "PF00008", "regions": [{"end": 1456, "start": 1427}]}, {"name": "PS50923", "regions": [{"end": 69, "start": 7}, {"end": 127, "start": 70}, {"end": 187, "start": 128}, {"end": 246, "start": 188}, {"end": 304, "start": 247}, {"end": 363, "start": 305}, {"end": 421, "start": 364}, {"end": 479, "start": 422}, {"end": 536, "start": 480}, {"end": 640, "start": 589}, {"end": 698, "start": 641}, {"end": 756, 
"start": 699}, {"end": 814, "start": 757}, {"end": 872, "start": 815}, {"end": 930, "start": 873}, {"end": 987, "start": 931}, {"end": 1045, "start": 988}, {"end": 1104, "start": 1046}, {"end": 1164, "start": 1105}, {"end": 1222, "start": 1165}, {"end": 1280, "start": 1223}, {"end": 1339, "start": 1281}, {"end": 1396, "start": 1340}]}, {"name": "SM00181", "regions": [{"end": 1426, "start": 1397}, {"end": 1458, "start": 1429}, {"end": 1490, "start": 1461}]}, {"name": "SSF57196", "regions": [{"end": 1432, "start": 1389}, {"end": 1461, "start": 1433}, {"end": 1496, "start": 1463}]}, {"name": "PS50026", "regions": [{"end": 1458, "start": 1426}, {"end": 1490, "start": 1459}]}, {"name": "PS50311", "regions": [{"end": 1480, "start": 1394}]}, {"name": "SSF57535", "regions": [{"end": 125, "start": 7}, {"end": 244, "start": 128}, {"end": 303, "start": 247}, {"end": 363, "start": 305}, {"end": 477, "start": 364}, {"end": 542, "start": 478}, {"end": 638, "start": 569}, {"end": 754, "start": 641}, {"end": 812, "start": 755}, {"end": 870, "start": 813}, {"end": 1043, "start": 871}, {"end": 1102, "start": 1044}, {"end": 1155, "start": 1103}, {"end": 1401, "start": 1165}]}, {"name": "SM00032", "regions": [{"end": 67, "start": 9}, {"end": 125, "start": 72}, {"end": 185, "start": 130}, {"end": 244, "start": 190}, {"end": 302, "start": 249}, {"end": 361, "start": 307}, {"end": 419, "start": 366}, {"end": 477, "start": 424}, {"end": 534, "start": 482}, {"end": 638, "start": 580}, {"end": 696, "start": 643}, {"end": 754, "start": 701}, {"end": 812, "start": 759}, {"end": 870, "start": 817}, {"end": 928, "start": 875}, {"end": 985, "start": 933}, {"end": 1043, "start": 990}, {"end": 1102, "start": 1048}, {"end": 1162, "start": 1107}, {"end": 1220, "start": 1167}, {"end": 1278, "start": 1225}, {"end": 1337, "start": 1283}, {"end": 1394, "start": 1342}]}]}]}, {"end": 113342018, "exons": [{"end": 113128840, "start": 113127536}, {"end": 113132296, "start": 113132203}, {"end": 113137743, 
"start": 113137648}, {"end": 113139646, "start": 113139551}, {"end": 113141797, "start": 113141627}, {"end": 113148354, "start": 113148178}, {"end": 113149738, "start": 113149565}, {"end": 113151867, "start": 113151804}, {"end": 113163289, "start": 113163134}, {"end": 113166832, "start": 113166607}, {"end": 113171231, "start": 113168440}, {"end": 113174015, "start": 113173343}, {"end": 113190038, "start": 113189871}, {"end": 113191614, "start": 113191423}, {"end": 113192284, "start": 113192200}, {"end": 113192730, "start": 113192554}, {"end": 113194314, "start": 113194195}, {"end": 113194915, "start": 113194742}, {"end": 113196786, "start": 113196616}, {"end": 113197644, "start": 113197521}, {"end": 113198784, "start": 113198660}, {"end": 113206000, "start": 113205825}, {"end": 113208318, "start": 113208117}, {"end": 113209337, "start": 113209180}, {"end": 113212540, "start": 113212339}, {"end": 113213682, "start": 113213569}, {"end": 113217983, "start": 113217870}, {"end": 113219632, "start": 113219536}, {"end": 113220842, "start": 113220751}, {"end": 113221393, "start": 113221232}, {"end": 113228306, "start": 113228145}, {"end": 113231381, "start": 113231220}, {"end": 113233877, "start": 113233644}, {"end": 113234603, "start": 113234439}, {"end": 113238595, "start": 113238484}, {"end": 113242036, "start": 113241915}, {"end": 113243716, "start": 113243522}, {"end": 113244772, "start": 113244641}, {"end": 113245973, "start": 113245866}, {"end": 113252059, "start": 113251930}, {"end": 113259213, "start": 113259095}, {"end": 113261518, "start": 113261321}, {"end": 113265497, "start": 113265318}, {"end": 113275385, "start": 113275206}, {"end": 113276386, "start": 113276228}, {"end": 113308571, "start": 113308395}, {"end": 113312384, "start": 113312129}, {"end": 113342018, "start": 113341293}], "is_best_transcript": false, "name": "ENST00000374469", "start": 113127536, "translations": [{"cdna_coding_end": 10911, "cdna_coding_start": 265, "domains": [{"name": 
"SSF57535", "regions": [{"end": 410, "start": 351}, {"end": 470, "start": 411}, {"end": 537, "start": 471}, {"end": 767, "start": 704}, {"end": 1723, "start": 1603}, {"end": 1819, "start": 1762}, {"end": 1877, "start": 1820}, {"end": 1935, "start": 1878}, {"end": 1993, "start": 1936}, {"end": 2055, "start": 1994}, {"end": 2176, "start": 2058}, {"end": 2295, "start": 2179}, {"end": 2354, "start": 2298}, {"end": 2414, "start": 2356}, {"end": 2528, "start": 2415}, {"end": 2593, "start": 2529}, {"end": 2689, "start": 2620}, {"end": 2805, "start": 2692}, {"end": 2863, "start": 2806}, {"end": 2921, "start": 2864}, {"end": 3094, "start": 2922}, {"end": 3153, "start": 3095}, {"end": 3206, "start": 3154}, {"end": 3452, "start": 3216}]}, {"name": "SSF49899", "regions": [{"end": 1609, "start": 1398}]}, {"name": "SM00159", "regions": [{"end": 1604, "start": 1397}]}, {"name": "PF00354", "regions": [{"end": 1597, "start": 1419}]}, {"name": "PR00895", "regions": [{"end": 1507, "start": 1489}, {"end": 1535, "start": 1516}, {"end": 1569, "start": 1536}]}, {"name": "PF02494", "regions": [{"end": 619, "start": 538}, {"end": 698, "start": 621}]}, {"name": "SM00032", "regions": [{"end": 410, "start": 355}, {"end": 470, "start": 415}, {"end": 536, "start": 475}, {"end": 764, "start": 704}, {"end": 1662, "start": 1608}, {"end": 1720, "start": 1667}, {"end": 1819, "start": 1766}, {"end": 1877, "start": 1824}, {"end": 1935, "start": 1882}, {"end": 1993, "start": 1940}, {"end": 2055, "start": 1998}, {"end": 2118, "start": 2060}, {"end": 2176, "start": 2123}, {"end": 2236, "start": 2181}, {"end": 2295, "start": 2241}, {"end": 2353, "start": 2300}, {"end": 2412, "start": 2358}, {"end": 2470, "start": 2417}, {"end": 2528, "start": 2475}, {"end": 2585, "start": 2533}, {"end": 2689, "start": 2631}, {"end": 2747, "start": 2694}, {"end": 2805, "start": 2752}, {"end": 2863, "start": 2810}, {"end": 2921, "start": 2868}, {"end": 2979, "start": 2926}, {"end": 3036, "start": 2984}, {"end": 3094, 
"start": 3041}, {"end": 3153, "start": 3099}, {"end": 3213, "start": 3158}, {"end": 3271, "start": 3218}, {"end": 3329, "start": 3276}, {"end": 3388, "start": 3334}, {"end": 3445, "start": 3393}]}, {"name": "SM00179", "regions": [{"end": 1206, "start": 1173}, {"end": 1244, "start": 1208}, {"end": 1282, "start": 1246}, {"end": 1320, "start": 1284}, {"end": 1358, "start": 1322}, {"end": 1396, "start": 1360}, {"end": 1761, "start": 1722}, {"end": 3509, "start": 3481}]}, {"name": "SSF57184", "regions": [{"end": 417, "start": 246}, {"end": 1121, "start": 965}]}, {"name": "SSF57196", "regions": [{"end": 1244, "start": 1166}, {"end": 1282, "start": 1245}, {"end": 1319, "start": 1283}, {"end": 1400, "start": 1321}, {"end": 1763, "start": 1712}, {"end": 3483, "start": 3440}, {"end": 3512, "start": 3484}, {"end": 3547, "start": 3514}]}, {"name": "PS50026", "regions": [{"end": 1206, "start": 1170}, {"end": 1244, "start": 1208}, {"end": 1282, "start": 1246}, {"end": 1320, "start": 1284}, {"end": 1358, "start": 1322}, {"end": 1396, "start": 1360}, {"end": 1761, "start": 1722}, {"end": 3509, "start": 3477}, {"end": 3541, "start": 3510}]}, {"name": "SM00181", "regions": [{"end": 1206, "start": 1173}, {"end": 1244, "start": 1211}, {"end": 1282, "start": 1249}, {"end": 1320, "start": 1287}, {"end": 1358, "start": 1325}, {"end": 1396, "start": 1363}, {"end": 1761, "start": 1725}, {"end": 3477, "start": 3448}, {"end": 3509, "start": 3480}, {"end": 3541, "start": 3512}]}, {"name": "PF00092", "regions": [{"end": 229, "start": 61}]}, {"name": "PS50825", "regions": [{"end": 619, "start": 537}, {"end": 701, "start": 620}]}, {"name": "PS50311", "regions": [{"end": 1386, "start": 1174}, {"end": 3531, "start": 3445}]}, {"name": "PF07699", "regions": [{"end": 337, "start": 287}, {"end": 1029, "start": 982}, {"end": 1083, "start": 1036}, {"end": 1137, "start": 1090}]}, {"name": "PF00008", "regions": [{"end": 1203, "start": 1174}, {"end": 1242, "start": 1212}, {"end": 1279, "start": 1250}, 
{"end": 1356, "start": 1326}, {"end": 1394, "start": 1364}]}, {"name": "SM00327", "regions": [{"end": 237, "start": 58}]}, {"name": "PS50923", "regions": [{"end": 412, "start": 353}, {"end": 472, "start": 413}, {"end": 538, "start": 473}, {"end": 766, "start": 702}, {"end": 1664, "start": 1606}, {"end": 1722, "start": 1665}, {"end": 1821, "start": 1764}, {"end": 1879, "start": 1822}, {"end": 1937, "start": 1880}, {"end": 1995, "start": 1938}, {"end": 2057, "start": 1996}, {"end": 2120, "start": 2058}, {"end": 2178, "start": 2121}, {"end": 2238, "start": 2179}, {"end": 2297, "start": 2239}, {"end": 2355, "start": 2298}, {"end": 2414, "start": 2356}, {"end": 2472, "start": 2415}, {"end": 2530, "start": 2473}, {"end": 2587, "start": 2531}, {"end": 2691, "start": 2640}, {"end": 2749, "start": 2692}, {"end": 2807, "start": 2750}, {"end": 2865, "start": 2808}, {"end": 2923, "start": 2866}, {"end": 2981, "start": 2924}, {"end": 3038, "start": 2982}, {"end": 3096, "start": 3039}, {"end": 3155, "start": 3097}, {"end": 3215, "start": 3156}, {"end": 3273, "start": 3216}, {"end": 3331, "start": 3274}, {"end": 3390, "start": 3332}, {"end": 3447, "start": 3391}]}, {"name": "PF07645", "regions": [{"end": 1760, "start": 1722}]}, {"name": "SSF53300", "regions": [{"end": 239, "start": 56}]}, {"name": "PF00084", "regions": [{"end": 407, "start": 355}, {"end": 470, "start": 415}, {"end": 1662, "start": 1605}, {"end": 1720, "start": 1667}, {"end": 1819, "start": 1766}, {"end": 1877, "start": 1824}, {"end": 1935, "start": 1882}, {"end": 1993, "start": 1940}, {"end": 2055, "start": 1998}, {"end": 2113, "start": 2060}, {"end": 2176, "start": 2123}, {"end": 2236, "start": 2181}, {"end": 2295, "start": 2241}, {"end": 2353, "start": 2300}, {"end": 2412, "start": 2358}, {"end": 2470, "start": 2417}, {"end": 2528, "start": 2475}, {"end": 2585, "start": 2533}, {"end": 2689, "start": 2644}, {"end": 2747, "start": 2694}, {"end": 2805, "start": 2752}, {"end": 2863, "start": 2810}, {"end": 2921, 
"start": 2868}, {"end": 2979, "start": 2926}, {"end": 3036, "start": 2984}, {"end": 3094, "start": 3061}, {"end": 3149, "start": 3099}, {"end": 3213, "start": 3158}, {"end": 3267, "start": 3218}, {"end": 3329, "start": 3276}, {"end": 3388, "start": 3334}, {"end": 3445, "start": 3393}]}, {"name": "PF07974", "regions": [{"end": 1243, "start": 1212}, {"end": 3476, "start": 3452}, {"end": 3508, "start": 3484}, {"end": 3540, "start": 3513}]}, {"name": "PS50234", "regions": [{"end": 241, "start": 60}]}]}]}, {"end": 113341823, "exons": [{"end": 113206000, "start": 113204759}, {"end": 113208318, "start": 113208117}, {"end": 113209337, "start": 113209180}, {"end": 113212540, "start": 113212339}, {"end": 113213682, "start": 113213569}, {"end": 113217983, "start": 113217870}, {"end": 113219632, "start": 113219536}, {"end": 113220399, "start": 113220395}, {"end": 113220842, "start": 113220756}, {"end": 113221393, "start": 113221232}, {"end": 113228306, "start": 113228145}, {"end": 113231381, "start": 113231220}, {"end": 113233877, "start": 113233644}, {"end": 113234603, "start": 113234439}, {"end": 113238595, "start": 113238484}, {"end": 113242036, "start": 113241915}, {"end": 113243716, "start": 113243522}, {"end": 113244772, "start": 113244641}, {"end": 113245973, "start": 113245866}, {"end": 113252059, "start": 113251930}, {"end": 113259213, "start": 113259095}, {"end": 113261518, "start": 113261321}, {"end": 113265497, "start": 113265318}, {"end": 113275385, "start": 113275206}, {"end": 113276386, "start": 113276228}, {"end": 113308571, "start": 113308395}, {"end": 113312384, "start": 113312129}, {"end": 113341823, "start": 113341293}], "is_best_transcript": false, "name": "ENST00000302728", "start": 113204759, "translations": [{"cdna_coding_end": 4650, "cdna_coding_start": 1, "domains": [{"name": "PS50825", "regions": [{"end": 642, "start": 560}, {"end": 724, "start": 643}]}, {"name": "PF07699", "regions": [{"end": 360, "start": 310}, {"end": 1052, "start": 1005}, {"end": 
1106, "start": 1059}, {"end": 1160, "start": 1113}]}, {"name": "PS50311", "regions": [{"end": 1409, "start": 1197}]}, {"name": "SM00181", "regions": [{"end": 1229, "start": 1196}, {"end": 1267, "start": 1234}, {"end": 1305, "start": 1272}, {"end": 1343, "start": 1310}, {"end": 1381, "start": 1348}, {"end": 1419, "start": 1386}]}, {"name": "SSF57196", "regions": [{"end": 1267, "start": 1189}, {"end": 1305, "start": 1268}, {"end": 1342, "start": 1306}, {"end": 1423, "start": 1344}]}, {"name": "PS50026", "regions": [{"end": 1229, "start": 1193}, {"end": 1267, "start": 1231}, {"end": 1305, "start": 1269}, {"end": 1343, "start": 1307}, {"end": 1381, "start": 1345}, {"end": 1419, "start": 1383}]}, {"name": "SSF57184", "regions": [{"end": 440, "start": 269}, {"end": 1144, "start": 988}]}, {"name": "SM00179", "regions": [{"end": 1229, "start": 1196}, {"end": 1267, "start": 1231}, {"end": 1305, "start": 1269}, {"end": 1343, "start": 1307}, {"end": 1381, "start": 1345}, {"end": 1419, "start": 1383}]}, {"name": "PF00092", "regions": [{"end": 252, "start": 84}]}, {"name": "SM00032", "regions": [{"end": 433, "start": 378}, {"end": 493, "start": 438}, {"end": 559, "start": 498}, {"end": 787, "start": 727}]}, {"name": "PF02494", "regions": [{"end": 642, "start": 561}, {"end": 721, "start": 644}]}, {"name": "PR00010", "regions": [{"end": 1318, "start": 1307}, {"end": 1364, "start": 1357}, {"end": 1413, "start": 1403}, {"end": 1420, "start": 1414}]}, {"name": "PF00354", "regions": [{"end": 1532, "start": 1442}]}, {"name": "SSF57535", "regions": [{"end": 433, "start": 374}, {"end": 493, "start": 434}, {"end": 560, "start": 494}, {"end": 790, "start": 727}]}, {"name": "SSF49899", "regions": [{"end": 1547, "start": 1421}]}, {"name": "PS50234", "regions": [{"end": 264, "start": 83}]}, {"name": "SSF53300", "regions": [{"end": 262, "start": 79}]}, {"name": "PF00084", "regions": [{"end": 430, "start": 378}, {"end": 493, "start": 438}]}, {"name": "PS50923", "regions": [{"end": 435, 
"start": 376}, {"end": 495, "start": 436}, {"end": 561, "start": 496}, {"end": 789, "start": 725}]}, {"name": "PF07645", "regions": [{"end": 1262, "start": 1231}, {"end": 1338, "start": 1308}]}, {"name": "PF00008", "regions": [{"end": 1226, "start": 1197}, {"end": 1265, "start": 1235}, {"end": 1302, "start": 1273}, {"end": 1337, "start": 1311}, {"end": 1379, "start": 1349}, {"end": 1417, "start": 1387}]}, {"name": "SM00327", "regions": [{"end": 260, "start": 81}]}]}]}, {"end": 113342160, "exons": [{"end": 113238595, "start": 113238163}, {"end": 113242036, "start": 113241915}, {"end": 113243716, "start": 113243522}, {"end": 113244772, "start": 113244641}, {"end": 113245973, "start": 113245866}, {"end": 113252059, "start": 113251930}, {"end": 113259213, "start": 113259095}, {"end": 113261518, "start": 113261321}, {"end": 113265497, "start": 113265318}, {"end": 113275385, "start": 113275206}, {"end": 113276386, "start": 113276228}, {"end": 113308571, "start": 113308395}, {"end": 113312384, "start": 113312129}, {"end": 113342160, "start": 113341293}], "is_best_transcript": false, "name": "ENST00000374461", "start": 113238163, "translations": [{"cdna_coding_end": 2944, "cdna_coding_start": 407, "domains": [{"name": "PF02494", "regions": [{"end": 619, "start": 538}, {"end": 698, "start": 621}]}, {"name": "SM00032", "regions": [{"end": 410, "start": 355}, {"end": 470, "start": 415}, {"end": 536, "start": 475}, {"end": 764, "start": 704}]}, {"name": "SSF57535", "regions": [{"end": 410, "start": 351}, {"end": 470, "start": 411}, {"end": 537, "start": 471}, {"end": 767, "start": 704}]}, {"name": "PF07699", "regions": [{"end": 337, "start": 287}]}, {"name": "PS50825", "regions": [{"end": 619, "start": 537}, {"end": 701, "start": 620}]}, {"name": "PF00092", "regions": [{"end": 229, "start": 61}]}, {"name": "SSF57184", "regions": [{"end": 417, "start": 246}]}, {"name": "PS50923", "regions": [{"end": 412, "start": 353}, {"end": 472, "start": 413}, {"end": 538, "start": 473}, 
{"end": 766, "start": 702}]}, {"name": "SM00327", "regions": [{"end": 237, "start": 58}]}, {"name": "PS50234", "regions": [{"end": 241, "start": 60}]}, {"name": "SSF53300", "regions": [{"end": 239, "start": 56}]}, {"name": "PF00084", "regions": [{"end": 407, "start": 355}, {"end": 470, "start": 415}]}]}]}]}, {"aliases": ["ARID1B"], "chr": "6", "end": 157530401, "name": "ENSG00000049618", "start": 157099063, "strand": "+", "transcripts": [{"end": 157529495, "exons": [{"end": 157100605, "start": 157099063}, {"end": 157150555, "start": 157150361}, {"end": 157192786, "start": 157192748}, {"end": 157222659, "start": 157222510}, {"end": 157256710, "start": 157256600}, {"end": 157406039, "start": 157405796}, {"end": 157431695, "start": 157431606}, {"end": 157454341, "start": 157454162}, {"end": 157470085, "start": 157469758}, {"end": 157488319, "start": 157488174}, {"end": 157495251, "start": 157495142}, {"end": 157502312, "start": 157502103}, {"end": 157505569, "start": 157505365}, {"end": 157510914, "start": 157510776}, {"end": 157511344, "start": 157511172}, {"end": 157517449, "start": 157517299}, {"end": 157520041, "start": 157519945}, {"end": 157522622, "start": 157521839}, {"end": 157525130, "start": 157525000}, {"end": 157529495, "start": 157527301}], "is_best_transcript": true, "name": "ENST00000346085", "start": 157099063, "translations": [{"cdna_coding_end": 6751, "cdna_coding_start": 2, "domains": [{"name": "PF12031", "regions": [{"end": 2195, "start": 1939}]}, {"name": "PS50324", "regions": [{"end": 57, "start": 35}, {"end": 784, "start": 697}]}, {"name": "PF01388", "regions": [{"end": 1153, "start": 1065}]}, {"name": "PS50099", "regions": [{"end": 820, "start": 715}, {"end": 1610, "start": 1472}]}, {"name": "SSF48371", "regions": [{"end": 2220, "start": 2075}]}, {"name": "PS50316", "regions": [{"end": 104, "start": 81}]}, {"name": "PS50322", "regions": [{"end": 131, "start": 107}, {"end": 646, "start": 574}]}, {"name": "PS51011", "regions": [{"end": 1157, 
"start": 1066}]}, {"name": "PS50310", "regions": [{"end": 47, "start": 2}, {"end": 493, "start": 329}]}, {"name": "PS50315", "regions": [{"end": 401, "start": 141}]}, {"name": "SSF46774", "regions": [{"end": 1168, "start": 1049}]}, {"name": "SM00501", "regions": [{"end": 1158, "start": 1067}]}]}]}]}]} \ No newline at end of file diff --git a/tests/tools/data/Homo_sapiens.GRCh38.105.kras.gff3 b/tests/tools/data/Homo_sapiens.GRCh38.105.kras.gff3 deleted file mode 100644 index be16e852..00000000 --- a/tests/tools/data/Homo_sapiens.GRCh38.105.kras.gff3 +++ /dev/null @@ -1,19 +0,0 @@ -12 ensembl_havana gene 25205246 25250936 . - . ID=gene:ENSG00000133703;Name=KRAS;biotype=protein_coding;description=KRAS proto-oncogene%2C GTPase [Source:HGNC Symbol%3BAcc:HGNC:6407];gene_id=ENSG00000133703;logic_name=ensembl_havana_gene_homo_sapiens;version=14 -12 havana mRNA 25205246 25225773 . - . ID=transcript:ENST00000690406;Parent=gene:ENSG00000133703;Name=KRAS-211;biotype=nonsense_mediated_decay;transcript_id=ENST00000690406;version=1 -12 ensembl_havana mRNA 25205246 25250929 . - . ID=transcript:ENST00000256078;Parent=gene:ENSG00000133703;Name=KRAS-201;biotype=protein_coding;ccdsid=CCDS8703.1;tag=basic;transcript_id=ENST00000256078;transcript_support_level=1 (assigned to previous version 8);version=10 -12 ensembl_havana mRNA 25205246 25250929 . - . ID=transcript:ENST00000311936;Parent=gene:ENSG00000133703;Name=KRAS-202;biotype=protein_coding;ccdsid=CCDS8702.1;tag=basic;transcript_id=ENST00000311936;transcript_support_level=1 (assigned to previous version 7);version=8 -12 havana mRNA 25205250 25250908 . - . ID=transcript:ENST00000686877;Parent=gene:ENSG00000133703;Name=KRAS-206;biotype=nonsense_mediated_decay;transcript_id=ENST00000686877;version=1 -12 havana mRNA 25205258 25250935 . - . ID=transcript:ENST00000685328;Parent=gene:ENSG00000133703;Name=KRAS-205;biotype=protein_coding;ccdsid=CCDS8702.1;tag=basic;transcript_id=ENST00000685328;version=1 -12 havana mRNA 25205260 25250899 . 
- . ID=transcript:ENST00000693229;Parent=gene:ENSG00000133703;Name=KRAS-214;biotype=protein_coding;tag=basic;transcript_id=ENST00000693229;version=1 -12 havana mRNA 25205270 25250927 . - . ID=transcript:ENST00000687356;Parent=gene:ENSG00000133703;Name=KRAS-208;biotype=nonsense_mediated_decay;transcript_id=ENST00000687356;version=1 -12 havana mRNA 25205343 25250917 . - . ID=transcript:ENST00000692768;Parent=gene:ENSG00000133703;Name=KRAS-213;biotype=protein_coding;tag=basic;transcript_id=ENST00000692768;version=1 -12 havana mRNA 25206933 25250444 . - . ID=transcript:ENST00000688940;Parent=gene:ENSG00000133703;Name=KRAS-210;biotype=protein_coding;ccdsid=CCDS8702.1;tag=basic;transcript_id=ENST00000688940;version=1 -12 havana mRNA 25207948 25250929 . - . ID=transcript:ENST00000690804;Parent=gene:ENSG00000133703;Name=KRAS-212;biotype=nonsense_mediated_decay;transcript_id=ENST00000690804;version=1 -12 havana mRNA 25209178 25250936 . - . ID=transcript:ENST00000557334;Parent=gene:ENSG00000133703;Name=KRAS-204;biotype=protein_coding;tag=basic;transcript_id=ENST00000557334;transcript_support_level=5 (assigned to previous version 5);version=6 -12 havana lnc_RNA 25209673 25227997 . - . ID=transcript:ENST00000688228;Parent=gene:ENSG00000133703;Name=KRAS-209;biotype=retained_intron;transcript_id=ENST00000688228;version=1 -12 havana mRNA 25232558 25250929 . - . ID=transcript:ENST00000686969;Parent=gene:ENSG00000133703;Name=KRAS-207;biotype=protein_coding;tag=basic;transcript_id=ENST00000686969;version=1 -12 havana mRNA 25232591 25250929 . - . ID=transcript:ENST00000556131;Parent=gene:ENSG00000133703;Name=KRAS-203;biotype=protein_coding;tag=basic;transcript_id=ENST00000556131;transcript_support_level=1 (assigned to previous version 1);version=2 -12 havana ncRNA_gene 25210652 25211233 . + . 
ID=gene:ENSG00000274987;biotype=lncRNA;description=novel transcript%2C antisense to KRAS;gene_id=ENSG00000274987;logic_name=havana_homo_sapiens;version=1 -12 havana ncRNA_gene 25225103 25225665 . + . ID=gene:ENSG00000275197;biotype=lncRNA;description=novel transcript%2C antisense to KRAS;gene_id=ENSG00000275197;logic_name=havana_homo_sapiens;version=1 -6 havana pseudogene 54770583 54771134 . + . ID=gene:ENSG00000220635;Name=KRASP1;biotype=processed_pseudogene;description=KRAS proto-oncogene%2C GTPase pseudogene 1 [Source:HGNC Symbol%3BAcc:HGNC:6406];gene_id=ENSG00000220635;logic_name=havana_homo_sapiens;version=2 -6 havana pseudogenic_transcript 54770583 54771134 . + . ID=transcript:ENST00000407852;Parent=gene:ENSG00000220635;Name=KRASP1-201;biotype=processed_pseudogene;tag=basic;transcript_id=ENST00000407852;transcript_support_level=NA;version=2 diff --git a/tests/tools/data/Homo_sapiens.GRCh38.kras.gff3 b/tests/tools/data/Homo_sapiens.GRCh38.kras.gff3 new file mode 100644 index 00000000..8ed7eb87 --- /dev/null +++ b/tests/tools/data/Homo_sapiens.GRCh38.kras.gff3 @@ -0,0 +1,163 @@ +12 ensembl_havana CDS 25209795 25209911 . - 0 ID=CDS:ENSP00000308495;Parent=transcript:ENST00000311936;protein_id=ENSP00000308495 +12 ensembl_havana CDS 25215441 25215560 . - 0 ID=CDS:ENSP00000256078;Parent=transcript:ENST00000256078;protein_id=ENSP00000256078 +12 ensembl_havana CDS 25225614 25225773 . - 1 ID=CDS:ENSP00000256078;Parent=transcript:ENST00000256078;protein_id=ENSP00000256078 +12 ensembl_havana CDS 25225614 25225773 . - 1 ID=CDS:ENSP00000308495;Parent=transcript:ENST00000311936;protein_id=ENSP00000308495 +12 ensembl_havana CDS 25227234 25227412 . - 0 ID=CDS:ENSP00000256078;Parent=transcript:ENST00000256078;protein_id=ENSP00000256078 +12 ensembl_havana CDS 25227234 25227412 . - 0 ID=CDS:ENSP00000308495;Parent=transcript:ENST00000311936;protein_id=ENSP00000308495 +12 ensembl_havana CDS 25245274 25245384 . 
- 0 ID=CDS:ENSP00000256078;Parent=transcript:ENST00000256078;protein_id=ENSP00000256078 +12 ensembl_havana CDS 25245274 25245384 . - 0 ID=CDS:ENSP00000308495;Parent=transcript:ENST00000311936;protein_id=ENSP00000308495 +12 ensembl_havana exon 25205246 25209911 . - . Parent=transcript:ENST00000256078;Name=ENSE00002477035;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00002477035;rank=6;version=3 +12 ensembl_havana exon 25205246 25209911 . - . Parent=transcript:ENST00000311936;Name=ENSE00002456976;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00002456976;rank=5;version=2 +12 ensembl_havana exon 25215437 25215560 . - . Parent=transcript:ENST00000256078;Name=ENSE00001189807;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00001189807;rank=5;version=5 +12 ensembl_havana exon 25225614 25225773 . - . Parent=transcript:ENST00000256078;Name=ENSE00001644818;constitutive=0;ensembl_end_phase=0;ensembl_phase=2;exon_id=ENSE00001644818;rank=4;version=1 +12 ensembl_havana exon 25225614 25225773 . - . Parent=transcript:ENST00000311936;Name=ENSE00001644818;constitutive=0;ensembl_end_phase=0;ensembl_phase=2;exon_id=ENSE00001644818;rank=4;version=1 +12 ensembl_havana exon 25227234 25227412 . - . Parent=transcript:ENST00000256078;Name=ENSE00001719809;constitutive=0;ensembl_end_phase=2;ensembl_phase=0;exon_id=ENSE00001719809;rank=3;version=1 +12 ensembl_havana exon 25227234 25227412 . - . Parent=transcript:ENST00000311936;Name=ENSE00001719809;constitutive=0;ensembl_end_phase=2;ensembl_phase=0;exon_id=ENSE00001719809;rank=3;version=1 +12 ensembl_havana exon 25245274 25245395 . - . Parent=transcript:ENST00000256078;Name=ENSE00000936617;constitutive=0;ensembl_end_phase=0;ensembl_phase=-1;exon_id=ENSE00000936617;rank=2;version=1 +12 ensembl_havana exon 25245274 25245395 . - . 
Parent=transcript:ENST00000311936;Name=ENSE00000936617;constitutive=0;ensembl_end_phase=0;ensembl_phase=-1;exon_id=ENSE00000936617;rank=2;version=1 +12 ensembl_havana exon 25250751 25250929 . - . Parent=transcript:ENST00000256078;Name=ENSE00003903543;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003903543;rank=1;version=1 +12 ensembl_havana exon 25250751 25250929 . - . Parent=transcript:ENST00000311936;Name=ENSE00003903543;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003903543;rank=1;version=1 +12 ensembl_havana five_prime_UTR 25245385 25245395 . - . Parent=transcript:ENST00000256078 +12 ensembl_havana five_prime_UTR 25245385 25245395 . - . Parent=transcript:ENST00000311936 +12 ensembl_havana five_prime_UTR 25250751 25250929 . - . Parent=transcript:ENST00000256078 +12 ensembl_havana five_prime_UTR 25250751 25250929 . - . Parent=transcript:ENST00000311936 +12 ensembl_havana gene 25205246 25250936 . - . ID=gene:ENSG00000133703;Name=KRAS;biotype=protein_coding;description=KRAS proto-oncogene%2C GTPase [Source:HGNC Symbol%3BAcc:HGNC:6407];gene_id=ENSG00000133703;logic_name=ensembl_havana_gene_homo_sapiens;version=14 +12 ensembl_havana mRNA 25205246 25250929 . - . ID=transcript:ENST00000256078;Parent=gene:ENSG00000133703;Name=KRAS-201;biotype=protein_coding;ccdsid=CCDS8703.1;tag=basic;transcript_id=ENST00000256078;transcript_support_level=1 (assigned to previous version 8);version=10 +12 ensembl_havana mRNA 25205246 25250929 . - . ID=transcript:ENST00000311936;Parent=gene:ENSG00000133703;Name=KRAS-202;biotype=protein_coding;ccdsid=CCDS8702.1;tag=basic;transcript_id=ENST00000311936;transcript_support_level=1 (assigned to previous version 7);version=8 +12 ensembl_havana three_prime_UTR 25205246 25209794 . - . Parent=transcript:ENST00000311936 +12 ensembl_havana three_prime_UTR 25205246 25209911 . - . Parent=transcript:ENST00000256078 +12 ensembl_havana three_prime_UTR 25215437 25215440 . - . 
Parent=transcript:ENST00000256078 +12 havana CDS 25209795 25209911 . - 0 ID=CDS:ENSP00000452512;Parent=transcript:ENST00000557334;protein_id=ENSP00000452512 +12 havana CDS 25209795 25209911 . - 0 ID=CDS:ENSP00000508921;Parent=transcript:ENST00000685328;protein_id=ENSP00000508921 +12 havana CDS 25209795 25209911 . - 0 ID=CDS:ENSP00000509223;Parent=transcript:ENST00000693229;protein_id=ENSP00000509223 +12 havana CDS 25209795 25209911 . - 0 ID=CDS:ENSP00000509238;Parent=transcript:ENST00000688940;protein_id=ENSP00000509238 +12 havana CDS 25209795 25209911 . - 0 ID=CDS:ENSP00000510254;Parent=transcript:ENST00000692768;protein_id=ENSP00000510254 +12 havana CDS 25213204 25213206 . - 0 ID=CDS:ENSP00000509798;Parent=transcript:ENST00000690406;protein_id=ENSP00000509798 +12 havana CDS 25225614 25225773 . - 1 ID=CDS:ENSP00000508921;Parent=transcript:ENST00000685328;protein_id=ENSP00000508921 +12 havana CDS 25225614 25225773 . - 1 ID=CDS:ENSP00000509223;Parent=transcript:ENST00000693229;protein_id=ENSP00000509223 +12 havana CDS 25225614 25225773 . - 1 ID=CDS:ENSP00000509238;Parent=transcript:ENST00000688940;protein_id=ENSP00000509238 +12 havana CDS 25225614 25225773 . - 1 ID=CDS:ENSP00000509798;Parent=transcript:ENST00000690406;protein_id=ENSP00000509798 +12 havana CDS 25225614 25225773 . - 1 ID=CDS:ENSP00000510254;Parent=transcript:ENST00000692768;protein_id=ENSP00000510254 +12 havana CDS 25225762 25225773 . - 0 ID=CDS:ENSP00000510511;Parent=transcript:ENST00000687356;protein_id=ENSP00000510511 +12 havana CDS 25227234 25227325 . - 0 ID=CDS:ENSP00000510254;Parent=transcript:ENST00000692768;protein_id=ENSP00000510254 +12 havana CDS 25227234 25227337 . - 0 ID=CDS:ENSP00000509223;Parent=transcript:ENST00000693229;protein_id=ENSP00000509223 +12 havana CDS 25227234 25227412 . - 0 ID=CDS:ENSP00000508921;Parent=transcript:ENST00000685328;protein_id=ENSP00000508921 +12 havana CDS 25227234 25227412 . 
- 0 ID=CDS:ENSP00000509238;Parent=transcript:ENST00000688940;protein_id=ENSP00000509238 +12 havana CDS 25228847 25228891 . - 0 ID=CDS:ENSP00000508568;Parent=transcript:ENST00000690804;protein_id=ENSP00000508568 +12 havana CDS 25230565 25230621 . - 0 ID=CDS:ENSP00000510431;Parent=transcript:ENST00000686877;protein_id=ENSP00000510431 +12 havana CDS 25235206 25235226 . - 0 ID=CDS:ENSP00000451856;Parent=transcript:ENST00000556131;protein_id=ENSP00000451856 +12 havana CDS 25235206 25235226 . - 0 ID=CDS:ENSP00000510479;Parent=transcript:ENST00000686969;protein_id=ENSP00000510479 +12 havana CDS 25245274 25245384 . - 0 ID=CDS:ENSP00000451856;Parent=transcript:ENST00000556131;protein_id=ENSP00000451856 +12 havana CDS 25245274 25245384 . - 0 ID=CDS:ENSP00000452512;Parent=transcript:ENST00000557334;protein_id=ENSP00000452512 +12 havana CDS 25245274 25245384 . - 0 ID=CDS:ENSP00000508568;Parent=transcript:ENST00000690804;protein_id=ENSP00000508568 +12 havana CDS 25245274 25245384 . - 0 ID=CDS:ENSP00000508921;Parent=transcript:ENST00000685328;protein_id=ENSP00000508921 +12 havana CDS 25245274 25245384 . - 0 ID=CDS:ENSP00000509223;Parent=transcript:ENST00000693229;protein_id=ENSP00000509223 +12 havana CDS 25245274 25245384 . - 0 ID=CDS:ENSP00000509238;Parent=transcript:ENST00000688940;protein_id=ENSP00000509238 +12 havana CDS 25245274 25245384 . - 0 ID=CDS:ENSP00000510431;Parent=transcript:ENST00000686877;protein_id=ENSP00000510431 +12 havana CDS 25245274 25245384 . - 0 ID=CDS:ENSP00000510479;Parent=transcript:ENST00000686969;protein_id=ENSP00000510479 +12 havana CDS 25245274 25245384 . - 0 ID=CDS:ENSP00000510511;Parent=transcript:ENST00000687356;protein_id=ENSP00000510511 +12 havana exon 25205246 25209911 . - . Parent=transcript:ENST00000690406;Name=ENSE00002477035;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00002477035;rank=3;version=3 +12 havana exon 25205250 25209911 . - . 
Parent=transcript:ENST00000686877;Name=ENSE00003934058;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003934058;rank=6;version=1 +12 havana exon 25205258 25209911 . - . Parent=transcript:ENST00000685328;Name=ENSE00003924510;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00003924510;rank=5;version=1 +12 havana exon 25205260 25209911 . - . Parent=transcript:ENST00000693229;Name=ENSE00003927775;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00003927775;rank=5;version=1 +12 havana exon 25205270 25209911 . - . Parent=transcript:ENST00000687356;Name=ENSE00003933328;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003933328;rank=4;version=1 +12 havana exon 25205343 25209911 . - . Parent=transcript:ENST00000692768;Name=ENSE00003925822;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00003925822;rank=4;version=1 +12 havana exon 25206933 25209911 . - . Parent=transcript:ENST00000688940;Name=ENSE00003930148;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00003930148;rank=5;version=1 +12 havana exon 25207948 25209911 . - . Parent=transcript:ENST00000690804;Name=ENSE00003935620;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003935620;rank=6;version=1 +12 havana exon 25209178 25209911 . - . Parent=transcript:ENST00000557334;Name=ENSE00002464674;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00002464674;rank=3;version=2 +12 havana exon 25209673 25209911 . - . Parent=transcript:ENST00000688228;Name=ENSE00003925173;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003925173;rank=3;version=1 +12 havana exon 25213114 25213206 . - . Parent=transcript:ENST00000690406;Name=ENSE00003927570;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00003927570;rank=2;version=1 +12 havana exon 25225614 25225773 . - . 
Parent=transcript:ENST00000685328;Name=ENSE00001644818;constitutive=0;ensembl_end_phase=0;ensembl_phase=2;exon_id=ENSE00001644818;rank=4;version=1 +12 havana exon 25225614 25225773 . - . Parent=transcript:ENST00000686877;Name=ENSE00003937476;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003937476;rank=5;version=1 +12 havana exon 25225614 25225773 . - . Parent=transcript:ENST00000687356;Name=ENSE00003930939;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00003930939;rank=3;version=1 +12 havana exon 25225614 25225773 . - . Parent=transcript:ENST00000688228;Name=ENSE00003937476;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003937476;rank=2;version=1 +12 havana exon 25225614 25225773 . - . Parent=transcript:ENST00000688940;Name=ENSE00001644818;constitutive=0;ensembl_end_phase=0;ensembl_phase=2;exon_id=ENSE00001644818;rank=4;version=1 +12 havana exon 25225614 25225773 . - . Parent=transcript:ENST00000690406;Name=ENSE00001644818;constitutive=0;ensembl_end_phase=0;ensembl_phase=2;exon_id=ENSE00001644818;rank=1;version=1 +12 havana exon 25225614 25225773 . - . Parent=transcript:ENST00000690804;Name=ENSE00003937476;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003937476;rank=5;version=1 +12 havana exon 25225614 25225773 . - . Parent=transcript:ENST00000692768;Name=ENSE00001644818;constitutive=0;ensembl_end_phase=0;ensembl_phase=2;exon_id=ENSE00001644818;rank=3;version=1 +12 havana exon 25225614 25225773 . - . Parent=transcript:ENST00000693229;Name=ENSE00001644818;constitutive=0;ensembl_end_phase=0;ensembl_phase=2;exon_id=ENSE00001644818;rank=4;version=1 +12 havana exon 25227234 25227337 . - . Parent=transcript:ENST00000693229;Name=ENSE00003923411;constitutive=0;ensembl_end_phase=2;ensembl_phase=0;exon_id=ENSE00003923411;rank=3;version=1 +12 havana exon 25227234 25227412 . - . 
Parent=transcript:ENST00000685328;Name=ENSE00001719809;constitutive=0;ensembl_end_phase=2;ensembl_phase=0;exon_id=ENSE00001719809;rank=3;version=1 +12 havana exon 25227234 25227412 . - . Parent=transcript:ENST00000686877;Name=ENSE00003930847;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003930847;rank=4;version=1 +12 havana exon 25227234 25227412 . - . Parent=transcript:ENST00000688940;Name=ENSE00001719809;constitutive=0;ensembl_end_phase=2;ensembl_phase=0;exon_id=ENSE00001719809;rank=3;version=1 +12 havana exon 25227234 25227412 . - . Parent=transcript:ENST00000690804;Name=ENSE00003930847;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003930847;rank=4;version=1 +12 havana exon 25227234 25227412 . - . Parent=transcript:ENST00000692768;Name=ENSE00003923061;constitutive=0;ensembl_end_phase=2;ensembl_phase=-1;exon_id=ENSE00003923061;rank=2;version=1 +12 havana exon 25227234 25227997 . - . Parent=transcript:ENST00000688228;Name=ENSE00003935871;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003935871;rank=1;version=1 +12 havana exon 25228775 25228891 . - . Parent=transcript:ENST00000690804;Name=ENSE00003925179;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00003925179;rank=3;version=1 +12 havana exon 25230483 25230621 . - . Parent=transcript:ENST00000686877;Name=ENSE00003930732;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00003930732;rank=3;version=1 +12 havana exon 25232558 25235226 . - . Parent=transcript:ENST00000686969;Name=ENSE00003927408;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00003927408;rank=3;version=1 +12 havana exon 25232591 25235226 . - . Parent=transcript:ENST00000556131;Name=ENSE00002478081;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00002478081;rank=3;version=2 +12 havana exon 25245274 25245395 . - . 
Parent=transcript:ENST00000556131;Name=ENSE00000936617;constitutive=0;ensembl_end_phase=0;ensembl_phase=-1;exon_id=ENSE00000936617;rank=2;version=1 +12 havana exon 25245274 25245395 . - . Parent=transcript:ENST00000557334;Name=ENSE00000936617;constitutive=0;ensembl_end_phase=0;ensembl_phase=-1;exon_id=ENSE00000936617;rank=2;version=1 +12 havana exon 25245274 25245395 . - . Parent=transcript:ENST00000685328;Name=ENSE00000936617;constitutive=0;ensembl_end_phase=0;ensembl_phase=-1;exon_id=ENSE00000936617;rank=2;version=1 +12 havana exon 25245274 25245395 . - . Parent=transcript:ENST00000686877;Name=ENSE00000936617;constitutive=0;ensembl_end_phase=0;ensembl_phase=-1;exon_id=ENSE00000936617;rank=2;version=1 +12 havana exon 25245274 25245395 . - . Parent=transcript:ENST00000686969;Name=ENSE00000936617;constitutive=0;ensembl_end_phase=0;ensembl_phase=-1;exon_id=ENSE00000936617;rank=2;version=1 +12 havana exon 25245274 25245395 . - . Parent=transcript:ENST00000687356;Name=ENSE00000936617;constitutive=0;ensembl_end_phase=0;ensembl_phase=-1;exon_id=ENSE00000936617;rank=2;version=1 +12 havana exon 25245274 25245395 . - . Parent=transcript:ENST00000688940;Name=ENSE00000936617;constitutive=0;ensembl_end_phase=0;ensembl_phase=-1;exon_id=ENSE00000936617;rank=2;version=1 +12 havana exon 25245274 25245395 . - . Parent=transcript:ENST00000690804;Name=ENSE00000936617;constitutive=0;ensembl_end_phase=0;ensembl_phase=-1;exon_id=ENSE00000936617;rank=2;version=1 +12 havana exon 25245274 25245395 . - . Parent=transcript:ENST00000693229;Name=ENSE00000936617;constitutive=0;ensembl_end_phase=0;ensembl_phase=-1;exon_id=ENSE00000936617;rank=2;version=1 +12 havana exon 25250255 25250444 . - . Parent=transcript:ENST00000688940;Name=ENSE00003932539;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003932539;rank=1;version=1 +12 havana exon 25250751 25250899 . - . 
Parent=transcript:ENST00000693229;Name=ENSE00003938559;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003938559;rank=1;version=1 +12 havana exon 25250751 25250908 . - . Parent=transcript:ENST00000686877;Name=ENSE00003928105;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003928105;rank=1;version=1 +12 havana exon 25250751 25250917 . - . Parent=transcript:ENST00000692768;Name=ENSE00003923448;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003923448;rank=1;version=1 +12 havana exon 25250751 25250927 . - . Parent=transcript:ENST00000687356;Name=ENSE00003930705;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003930705;rank=1;version=1 +12 havana exon 25250751 25250929 . - . Parent=transcript:ENST00000556131;Name=ENSE00003903543;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003903543;rank=1;version=1 +12 havana exon 25250751 25250929 . - . Parent=transcript:ENST00000690804;Name=ENSE00003903543;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003903543;rank=1;version=1 +12 havana exon 25250751 25250936 . - . Parent=transcript:ENST00000557334;Name=ENSE00002446502;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00002446502;rank=1;version=1 +12 havana exon 25250764 25250929 . - . Parent=transcript:ENST00000686969;Name=ENSE00002530521;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00002530521;rank=1;version=1 +12 havana exon 25250764 25250935 . - . Parent=transcript:ENST00000685328;Name=ENSE00003934964;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003934964;rank=1;version=1 +12 havana five_prime_UTR 25227326 25227412 . - . Parent=transcript:ENST00000692768 +12 havana five_prime_UTR 25245385 25245395 . - . Parent=transcript:ENST00000556131 +12 havana five_prime_UTR 25245385 25245395 . - . Parent=transcript:ENST00000557334 +12 havana five_prime_UTR 25245385 25245395 . - . 
Parent=transcript:ENST00000685328 +12 havana five_prime_UTR 25245385 25245395 . - . Parent=transcript:ENST00000686877 +12 havana five_prime_UTR 25245385 25245395 . - . Parent=transcript:ENST00000686969 +12 havana five_prime_UTR 25245385 25245395 . - . Parent=transcript:ENST00000687356 +12 havana five_prime_UTR 25245385 25245395 . - . Parent=transcript:ENST00000688940 +12 havana five_prime_UTR 25245385 25245395 . - . Parent=transcript:ENST00000690804 +12 havana five_prime_UTR 25245385 25245395 . - . Parent=transcript:ENST00000693229 +12 havana five_prime_UTR 25250255 25250444 . - . Parent=transcript:ENST00000688940 +12 havana five_prime_UTR 25250751 25250899 . - . Parent=transcript:ENST00000693229 +12 havana five_prime_UTR 25250751 25250908 . - . Parent=transcript:ENST00000686877 +12 havana five_prime_UTR 25250751 25250917 . - . Parent=transcript:ENST00000692768 +12 havana five_prime_UTR 25250751 25250927 . - . Parent=transcript:ENST00000687356 +12 havana five_prime_UTR 25250751 25250929 . - . Parent=transcript:ENST00000556131 +12 havana five_prime_UTR 25250751 25250929 . - . Parent=transcript:ENST00000690804 +12 havana five_prime_UTR 25250751 25250936 . - . Parent=transcript:ENST00000557334 +12 havana five_prime_UTR 25250764 25250929 . - . Parent=transcript:ENST00000686969 +12 havana five_prime_UTR 25250764 25250935 . - . Parent=transcript:ENST00000685328 +12 havana lnc_RNA 25209673 25227997 . - . ID=transcript:ENST00000688228;Parent=gene:ENSG00000133703;Name=KRAS-209;biotype=retained_intron;transcript_id=ENST00000688228;version=1 +12 havana lnc_RNA 25210652 25211233 . + . ID=transcript:ENST00000612734;Parent=gene:ENSG00000274987;biotype=lncRNA;tag=basic;transcript_id=ENST00000612734;transcript_support_level=NA;version=1 +12 havana lnc_RNA 25225103 25225665 . + . ID=transcript:ENST00000620933;Parent=gene:ENSG00000275197;biotype=lncRNA;tag=basic;transcript_id=ENST00000620933;transcript_support_level=NA;version=1 +12 havana mRNA 25205246 25225773 . - . 
ID=transcript:ENST00000690406;Parent=gene:ENSG00000133703;Name=KRAS-211;biotype=nonsense_mediated_decay;transcript_id=ENST00000690406;version=1 +12 havana mRNA 25205250 25250908 . - . ID=transcript:ENST00000686877;Parent=gene:ENSG00000133703;Name=KRAS-206;biotype=nonsense_mediated_decay;transcript_id=ENST00000686877;version=1 +12 havana mRNA 25205258 25250935 . - . ID=transcript:ENST00000685328;Parent=gene:ENSG00000133703;Name=KRAS-205;biotype=protein_coding;ccdsid=CCDS8702.1;tag=basic;transcript_id=ENST00000685328;version=1 +12 havana mRNA 25205260 25250899 . - . ID=transcript:ENST00000693229;Parent=gene:ENSG00000133703;Name=KRAS-214;biotype=protein_coding;tag=basic;transcript_id=ENST00000693229;version=1 +12 havana mRNA 25205270 25250927 . - . ID=transcript:ENST00000687356;Parent=gene:ENSG00000133703;Name=KRAS-208;biotype=nonsense_mediated_decay;transcript_id=ENST00000687356;version=1 +12 havana mRNA 25205343 25250917 . - . ID=transcript:ENST00000692768;Parent=gene:ENSG00000133703;Name=KRAS-213;biotype=protein_coding;tag=basic;transcript_id=ENST00000692768;version=1 +12 havana mRNA 25206933 25250444 . - . ID=transcript:ENST00000688940;Parent=gene:ENSG00000133703;Name=KRAS-210;biotype=protein_coding;ccdsid=CCDS8702.1;tag=basic;transcript_id=ENST00000688940;version=1 +12 havana mRNA 25207948 25250929 . - . ID=transcript:ENST00000690804;Parent=gene:ENSG00000133703;Name=KRAS-212;biotype=nonsense_mediated_decay;transcript_id=ENST00000690804;version=1 +12 havana mRNA 25209178 25250936 . - . ID=transcript:ENST00000557334;Parent=gene:ENSG00000133703;Name=KRAS-204;biotype=protein_coding;tag=basic;transcript_id=ENST00000557334;transcript_support_level=5 (assigned to previous version 5);version=6 +12 havana mRNA 25232558 25250929 . - . ID=transcript:ENST00000686969;Parent=gene:ENSG00000133703;Name=KRAS-207;biotype=protein_coding;tag=basic;transcript_id=ENST00000686969;version=1 +12 havana mRNA 25232591 25250929 . - . 
ID=transcript:ENST00000556131;Parent=gene:ENSG00000133703;Name=KRAS-203;biotype=protein_coding;tag=basic;transcript_id=ENST00000556131;transcript_support_level=1 (assigned to previous version 1);version=2 +12 havana ncRNA_gene 25210652 25211233 . + . ID=gene:ENSG00000274987;biotype=lncRNA;description=novel transcript%2C antisense to KRAS;gene_id=ENSG00000274987;logic_name=havana_homo_sapiens;version=1 +12 havana ncRNA_gene 25225103 25225665 . + . ID=gene:ENSG00000275197;biotype=lncRNA;description=novel transcript%2C antisense to KRAS;gene_id=ENSG00000275197;logic_name=havana_homo_sapiens;version=1 +12 havana three_prime_UTR 25205246 25209911 . - . Parent=transcript:ENST00000690406 +12 havana three_prime_UTR 25205250 25209911 . - . Parent=transcript:ENST00000686877 +12 havana three_prime_UTR 25205258 25209794 . - . Parent=transcript:ENST00000685328 +12 havana three_prime_UTR 25205260 25209794 . - . Parent=transcript:ENST00000693229 +12 havana three_prime_UTR 25205270 25209911 . - . Parent=transcript:ENST00000687356 +12 havana three_prime_UTR 25205343 25209794 . - . Parent=transcript:ENST00000692768 +12 havana three_prime_UTR 25206933 25209794 . - . Parent=transcript:ENST00000688940 +12 havana three_prime_UTR 25207948 25209911 . - . Parent=transcript:ENST00000690804 +12 havana three_prime_UTR 25209178 25209794 . - . Parent=transcript:ENST00000557334 +12 havana three_prime_UTR 25213114 25213203 . - . Parent=transcript:ENST00000690406 +12 havana three_prime_UTR 25225614 25225761 . - . Parent=transcript:ENST00000687356 +12 havana three_prime_UTR 25225614 25225773 . - . Parent=transcript:ENST00000686877 +12 havana three_prime_UTR 25225614 25225773 . - . Parent=transcript:ENST00000690804 +12 havana three_prime_UTR 25227234 25227412 . - . Parent=transcript:ENST00000686877 +12 havana three_prime_UTR 25227234 25227412 . - . Parent=transcript:ENST00000690804 +12 havana three_prime_UTR 25228775 25228846 . - . 
Parent=transcript:ENST00000690804 +12 havana three_prime_UTR 25230483 25230564 . - . Parent=transcript:ENST00000686877 +12 havana three_prime_UTR 25232558 25235205 . - . Parent=transcript:ENST00000686969 +12 havana three_prime_UTR 25232591 25235205 . - . Parent=transcript:ENST00000556131 diff --git a/tests/tools/data/Homo_sapiens.GRCh38.kras.gff3.json b/tests/tools/data/Homo_sapiens.GRCh38.kras.gff3.json new file mode 100644 index 00000000..eb35287b --- /dev/null +++ b/tests/tools/data/Homo_sapiens.GRCh38.kras.gff3.json @@ -0,0 +1 @@ +{"genes": [{"aliases": ["KRAS"], "biotype": "gene", "chr": "12", "end": 25250936, "name": "ENSG00000133703", "start": 25205246, "strand": "-", "transcripts": [{"aliases": ["KRAS-201"], "biotype": "mRNA", "end": 25250929, "exons": [{"end": 25245395, "name": "ENSE00000936617", "start": 25245274, "version": "1"}, {"end": 25215560, "name": "ENSE00001189807", "start": 25215437, "version": "5"}, {"end": 25225773, "name": "ENSE00001644818", "start": 25225614, "version": "1"}, {"end": 25227412, "name": "ENSE00001719809", "start": 25227234, "version": "1"}, {"end": 25209911, "name": "ENSE00002477035", "start": 25205246, "version": "3"}, {"end": 25250929, "name": "ENSE00003903543", "start": 25250751, "version": "1"}], "name": "ENST00000256078", "start": 25205246, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000256078", "start": 25215441}], "version": "10"}, {"aliases": ["KRAS-202"], "biotype": "mRNA", "end": 25250929, "exons": [{"end": 25245395, "name": "ENSE00000936617", "start": 25245274, "version": "1"}, {"end": 25225773, "name": "ENSE00001644818", "start": 25225614, "version": "1"}, {"end": 25227412, "name": "ENSE00001719809", "start": 25227234, "version": "1"}, {"end": 25209911, "name": "ENSE00002456976", "start": 25205246, "version": "2"}, {"end": 25250929, "name": "ENSE00003903543", "start": 25250751, "version": "1"}], "name": "ENST00000311936", "start": 25205246, "translations": [{"biotype": "CDS", "end": 
25245384, "name": "ENSP00000308495", "start": 25209795}], "version": "8"}, {"aliases": ["KRAS-203"], "biotype": "mRNA", "end": 25250929, "exons": [{"end": 25245395, "name": "ENSE00000936617", "start": 25245274, "version": "1"}, {"end": 25235226, "name": "ENSE00002478081", "start": 25232591, "version": "2"}, {"end": 25250929, "name": "ENSE00003903543", "start": 25250751, "version": "1"}], "name": "ENST00000556131", "start": 25232591, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000451856", "start": 25235206}], "version": "2"}, {"aliases": ["KRAS-204"], "biotype": "mRNA", "end": 25250936, "exons": [{"end": 25245395, "name": "ENSE00000936617", "start": 25245274, "version": "1"}, {"end": 25250936, "name": "ENSE00002446502", "start": 25250751, "version": "1"}, {"end": 25209911, "name": "ENSE00002464674", "start": 25209178, "version": "2"}], "name": "ENST00000557334", "start": 25209178, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000452512", "start": 25209795}], "version": "6"}, {"aliases": ["KRAS-205"], "biotype": "mRNA", "end": 25250935, "exons": [{"end": 25245395, "name": "ENSE00000936617", "start": 25245274, "version": "1"}, {"end": 25225773, "name": "ENSE00001644818", "start": 25225614, "version": "1"}, {"end": 25227412, "name": "ENSE00001719809", "start": 25227234, "version": "1"}, {"end": 25209911, "name": "ENSE00003924510", "start": 25205258, "version": "1"}, {"end": 25250935, "name": "ENSE00003934964", "start": 25250764, "version": "1"}], "name": "ENST00000685328", "start": 25205258, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000508921", "start": 25209795}], "version": "1"}, {"aliases": ["KRAS-206"], "biotype": "mRNA", "end": 25250908, "exons": [{"end": 25245395, "name": "ENSE00000936617", "start": 25245274, "version": "1"}, {"end": 25250908, "name": "ENSE00003928105", "start": 25250751, "version": "1"}, {"end": 25230621, "name": "ENSE00003930732", "start": 25230483, "version": "1"}, {"end": 
25227412, "name": "ENSE00003930847", "start": 25227234, "version": "1"}, {"end": 25209911, "name": "ENSE00003934058", "start": 25205250, "version": "1"}, {"end": 25225773, "name": "ENSE00003937476", "start": 25225614, "version": "1"}], "name": "ENST00000686877", "start": 25205250, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000510431", "start": 25230565}], "version": "1"}, {"aliases": ["KRAS-207"], "biotype": "mRNA", "end": 25250929, "exons": [{"end": 25245395, "name": "ENSE00000936617", "start": 25245274, "version": "1"}, {"end": 25250929, "name": "ENSE00002530521", "start": 25250764, "version": "1"}, {"end": 25235226, "name": "ENSE00003927408", "start": 25232558, "version": "1"}], "name": "ENST00000686969", "start": 25232558, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000510479", "start": 25235206}], "version": "1"}, {"aliases": ["KRAS-208"], "biotype": "mRNA", "end": 25250927, "exons": [{"end": 25245395, "name": "ENSE00000936617", "start": 25245274, "version": "1"}, {"end": 25250927, "name": "ENSE00003930705", "start": 25250751, "version": "1"}, {"end": 25225773, "name": "ENSE00003930939", "start": 25225614, "version": "1"}, {"end": 25209911, "name": "ENSE00003933328", "start": 25205270, "version": "1"}], "name": "ENST00000687356", "start": 25205270, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000510511", "start": 25225762}], "version": "1"}, {"aliases": ["KRAS-209"], "biotype": "lnc_RNA", "end": 25227997, "exons": [{"end": 25209911, "name": "ENSE00003925173", "start": 25209673, "version": "1"}, {"end": 25227997, "name": "ENSE00003935871", "start": 25227234, "version": "1"}, {"end": 25225773, "name": "ENSE00003937476", "start": 25225614, "version": "1"}], "name": "ENST00000688228", "start": 25209673, "version": "1"}, {"aliases": ["KRAS-210"], "biotype": "mRNA", "end": 25250444, "exons": [{"end": 25245395, "name": "ENSE00000936617", "start": 25245274, "version": "1"}, {"end": 25225773, 
"name": "ENSE00001644818", "start": 25225614, "version": "1"}, {"end": 25227412, "name": "ENSE00001719809", "start": 25227234, "version": "1"}, {"end": 25209911, "name": "ENSE00003930148", "start": 25206933, "version": "1"}, {"end": 25250444, "name": "ENSE00003932539", "start": 25250255, "version": "1"}], "name": "ENST00000688940", "start": 25206933, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000509238", "start": 25209795}], "version": "1"}, {"aliases": ["KRAS-211"], "biotype": "mRNA", "end": 25225773, "exons": [{"end": 25225773, "name": "ENSE00001644818", "start": 25225614, "version": "1"}, {"end": 25209911, "name": "ENSE00002477035", "start": 25205246, "version": "3"}, {"end": 25213206, "name": "ENSE00003927570", "start": 25213114, "version": "1"}], "name": "ENST00000690406", "start": 25205246, "translations": [{"biotype": "CDS", "end": 25225773, "name": "ENSP00000509798", "start": 25213204}], "version": "1"}, {"aliases": ["KRAS-212"], "biotype": "mRNA", "end": 25250929, "exons": [{"end": 25245395, "name": "ENSE00000936617", "start": 25245274, "version": "1"}, {"end": 25250929, "name": "ENSE00003903543", "start": 25250751, "version": "1"}, {"end": 25228891, "name": "ENSE00003925179", "start": 25228775, "version": "1"}, {"end": 25227412, "name": "ENSE00003930847", "start": 25227234, "version": "1"}, {"end": 25209911, "name": "ENSE00003935620", "start": 25207948, "version": "1"}, {"end": 25225773, "name": "ENSE00003937476", "start": 25225614, "version": "1"}], "name": "ENST00000690804", "start": 25207948, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000508568", "start": 25228847}], "version": "1"}, {"aliases": ["KRAS-213"], "biotype": "mRNA", "end": 25250917, "exons": [{"end": 25225773, "name": "ENSE00001644818", "start": 25225614, "version": "1"}, {"end": 25227412, "name": "ENSE00003923061", "start": 25227234, "version": "1"}, {"end": 25250917, "name": "ENSE00003923448", "start": 25250751, "version": "1"}, {"end": 
25209911, "name": "ENSE00003925822", "start": 25205343, "version": "1"}], "name": "ENST00000692768", "start": 25205343, "translations": [{"biotype": "CDS", "end": 25227325, "name": "ENSP00000510254", "start": 25209795}], "version": "1"}, {"aliases": ["KRAS-214"], "biotype": "mRNA", "end": 25250899, "exons": [{"end": 25245395, "name": "ENSE00000936617", "start": 25245274, "version": "1"}, {"end": 25225773, "name": "ENSE00001644818", "start": 25225614, "version": "1"}, {"end": 25227337, "name": "ENSE00003923411", "start": 25227234, "version": "1"}, {"end": 25209911, "name": "ENSE00003927775", "start": 25205260, "version": "1"}, {"end": 25250899, "name": "ENSE00003938559", "start": 25250751, "version": "1"}], "name": "ENST00000693229", "start": 25205260, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000509223", "start": 25209795}], "version": "1"}], "version": "14"}, {"biotype": "ncRNA_gene", "chr": "12", "end": 25211233, "name": "ENSG00000274987", "start": 25210652, "strand": "+", "transcripts": [{"biotype": "lnc_RNA", "end": 25211233, "name": "ENST00000612734", "start": 25210652, "version": "1"}], "version": "1"}, {"biotype": "ncRNA_gene", "chr": "12", "end": 25225665, "name": "ENSG00000275197", "start": 25225103, "strand": "+", "transcripts": [{"biotype": "lnc_RNA", "end": 25225665, "name": "ENST00000620933", "start": 25225103, "version": "1"}], "version": "1"}]} \ No newline at end of file diff --git a/tests/tools/data/Homo_sapiens.GRCh38.105.chr.kras.gtf b/tests/tools/data/Homo_sapiens.GRCh38.kras.gtf similarity index 100% rename from tests/tools/data/Homo_sapiens.GRCh38.105.chr.kras.gtf rename to tests/tools/data/Homo_sapiens.GRCh38.kras.gtf diff --git a/tests/tools/data/Homo_sapiens.GRCh38.kras.gtf.json b/tests/tools/data/Homo_sapiens.GRCh38.kras.gtf.json new file mode 100644 index 00000000..37fa8ad0 --- /dev/null +++ b/tests/tools/data/Homo_sapiens.GRCh38.kras.gtf.json @@ -0,0 +1 @@ +{"genes": [{"aliases": ["KRAS"], "biotype": "gene", 
"chr": "12", "end": 25250936, "name": "ENSG00000133703", "start": 25205246, "strand": "-", "transcripts": [{"aliases": ["KRAS-201"], "biotype": "transcript", "end": 25250929, "exons": [{"end": 25245395, "name": "ENSE00000936617", "number": "2", "start": 25245274, "version": "1"}, {"end": 25215560, "name": "ENSE00001189807", "number": "5", "start": 25215437, "version": "5"}, {"end": 25225773, "name": "ENSE00001644818", "number": "1;4;3", "start": 25225614, "version": "1"}, {"end": 25227412, "name": "ENSE00001719809", "number": "3", "start": 25227234, "version": "1"}, {"end": 25209911, "name": "ENSE00002477035", "number": "3;6", "start": 25205246, "version": "3"}, {"end": 25250929, "name": "ENSE00003903543", "number": "1", "start": 25250751, "version": "1"}], "name": "ENST00000256078", "start": 25205246, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000256078", "start": 25215444, "version": "5"}], "version": "10"}, {"aliases": ["KRAS-202"], "biotype": "transcript", "end": 25250929, "exons": [{"end": 25245395, "name": "ENSE00000936617", "number": "2", "start": 25245274, "version": "1"}, {"end": 25225773, "name": "ENSE00001644818", "number": "1;4;3", "start": 25225614, "version": "1"}, {"end": 25227412, "name": "ENSE00001719809", "number": "3", "start": 25227234, "version": "1"}, {"end": 25209911, "name": "ENSE00002456976", "number": "5", "start": 25205246, "version": "2"}, {"end": 25250929, "name": "ENSE00003903543", "number": "1", "start": 25250751, "version": "1"}], "name": "ENST00000311936", "start": 25205246, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000308495", "start": 25209798, "version": "3"}], "version": "8"}, {"aliases": ["KRAS-203"], "biotype": "transcript", "end": 25250929, "exons": [{"end": 25245395, "name": "ENSE00000936617", "number": "2", "start": 25245274, "version": "1"}, {"end": 25235226, "name": "ENSE00002478081", "number": "3", "start": 25232591, "version": "2"}, {"end": 25250929, "name": 
"ENSE00003903543", "number": "1", "start": 25250751, "version": "1"}], "name": "ENST00000556131", "start": 25232591, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000451856", "start": 25235209, "version": "1"}], "version": "2"}, {"aliases": ["KRAS-204"], "biotype": "transcript", "end": 25250936, "exons": [{"end": 25245395, "name": "ENSE00000936617", "number": "2", "start": 25245274, "version": "1"}, {"end": 25250936, "name": "ENSE00002446502", "number": "1", "start": 25250751, "version": "1"}, {"end": 25209911, "name": "ENSE00002464674", "number": "3", "start": 25209178, "version": "2"}], "name": "ENST00000557334", "start": 25209178, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000452512", "start": 25209798, "version": "1"}], "version": "6"}, {"aliases": ["KRAS-205"], "biotype": "transcript", "end": 25250935, "exons": [{"end": 25245395, "name": "ENSE00000936617", "number": "2", "start": 25245274, "version": "1"}, {"end": 25225773, "name": "ENSE00001644818", "number": "1;4;3", "start": 25225614, "version": "1"}, {"end": 25227412, "name": "ENSE00001719809", "number": "3", "start": 25227234, "version": "1"}, {"end": 25209911, "name": "ENSE00003924510", "number": "5", "start": 25205258, "version": "1"}, {"end": 25250935, "name": "ENSE00003934964", "number": "1", "start": 25250764, "version": "1"}], "name": "ENST00000685328", "start": 25205258, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000508921", "start": 25209798, "version": "1"}], "version": "1"}, {"aliases": ["KRAS-206"], "biotype": "transcript", "end": 25250908, "exons": [{"end": 25245395, "name": "ENSE00000936617", "number": "2", "start": 25245274, "version": "1"}, {"end": 25250908, "name": "ENSE00003928105", "number": "1", "start": 25250751, "version": "1"}, {"end": 25230621, "name": "ENSE00003930732", "number": "3", "start": 25230483, "version": "1"}, {"end": 25227412, "name": "ENSE00003930847", "number": "4", "start": 25227234, "version": 
"1"}, {"end": 25209911, "name": "ENSE00003934058", "number": "6", "start": 25205250, "version": "1"}, {"end": 25225773, "name": "ENSE00003937476", "number": "5;2", "start": 25225614, "version": "1"}], "name": "ENST00000686877", "start": 25205250, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000510431", "start": 25230568, "version": "1"}], "version": "1"}, {"aliases": ["KRAS-207"], "biotype": "transcript", "end": 25250929, "exons": [{"end": 25245395, "name": "ENSE00000936617", "number": "2", "start": 25245274, "version": "1"}, {"end": 25250929, "name": "ENSE00002530521", "number": "1", "start": 25250764, "version": "1"}, {"end": 25235226, "name": "ENSE00003927408", "number": "3", "start": 25232558, "version": "1"}], "name": "ENST00000686969", "start": 25232558, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000510479", "start": 25235209, "version": "1"}], "version": "1"}, {"aliases": ["KRAS-208"], "biotype": "transcript", "end": 25250927, "exons": [{"end": 25245395, "name": "ENSE00000936617", "number": "2", "start": 25245274, "version": "1"}, {"end": 25250927, "name": "ENSE00003930705", "number": "1", "start": 25250751, "version": "1"}, {"end": 25225773, "name": "ENSE00003930939", "number": "3", "start": 25225614, "version": "1"}, {"end": 25209911, "name": "ENSE00003933328", "number": "4", "start": 25205270, "version": "1"}], "name": "ENST00000687356", "start": 25205270, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000510511", "start": 25225765, "version": "1"}], "version": "1"}, {"aliases": ["KRAS-209"], "biotype": "transcript", "end": 25227997, "exons": [{"end": 25209911, "name": "ENSE00003925173", "number": "3", "start": 25209673, "version": "1"}, {"end": 25227997, "name": "ENSE00003935871", "number": "1", "start": 25227234, "version": "1"}, {"end": 25225773, "name": "ENSE00003937476", "number": "5;2", "start": 25225614, "version": "1"}], "name": "ENST00000688228", "start": 25209673, "version": 
"1"}, {"aliases": ["KRAS-210"], "biotype": "transcript", "end": 25250444, "exons": [{"end": 25245395, "name": "ENSE00000936617", "number": "2", "start": 25245274, "version": "1"}, {"end": 25225773, "name": "ENSE00001644818", "number": "1;4;3", "start": 25225614, "version": "1"}, {"end": 25227412, "name": "ENSE00001719809", "number": "3", "start": 25227234, "version": "1"}, {"end": 25209911, "name": "ENSE00003930148", "number": "5", "start": 25206933, "version": "1"}, {"end": 25250444, "name": "ENSE00003932539", "number": "1", "start": 25250255, "version": "1"}], "name": "ENST00000688940", "start": 25206933, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000509238", "start": 25209798, "version": "1"}], "version": "1"}, {"aliases": ["KRAS-211"], "biotype": "transcript", "end": 25225773, "exons": [{"end": 25225773, "name": "ENSE00001644818", "number": "1;4;3", "start": 25225614, "version": "1"}, {"end": 25209911, "name": "ENSE00002477035", "number": "3;6", "start": 25205246, "version": "3"}, {"end": 25213206, "name": "ENSE00003927570", "number": "2", "start": 25213114, "version": "1"}], "name": "ENST00000690406", "start": 25205246, "translations": [{"biotype": "CDS", "end": 25225773, "name": "ENSP00000509798", "start": 25225614, "version": "1"}], "version": "1"}, {"aliases": ["KRAS-212"], "biotype": "transcript", "end": 25250929, "exons": [{"end": 25245395, "name": "ENSE00000936617", "number": "2", "start": 25245274, "version": "1"}, {"end": 25250929, "name": "ENSE00003903543", "number": "1", "start": 25250751, "version": "1"}, {"end": 25228891, "name": "ENSE00003925179", "number": "3", "start": 25228775, "version": "1"}, {"end": 25227412, "name": "ENSE00003930847", "number": "4", "start": 25227234, "version": "1"}, {"end": 25209911, "name": "ENSE00003935620", "number": "6", "start": 25207948, "version": "1"}, {"end": 25225773, "name": "ENSE00003937476", "number": "5;2", "start": 25225614, "version": "1"}], "name": "ENST00000690804", "start": 
25207948, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000508568", "start": 25228850, "version": "1"}], "version": "1"}, {"aliases": ["KRAS-213"], "biotype": "transcript", "end": 25250917, "exons": [{"end": 25225773, "name": "ENSE00001644818", "number": "1;4;3", "start": 25225614, "version": "1"}, {"end": 25227412, "name": "ENSE00003923061", "number": "2", "start": 25227234, "version": "1"}, {"end": 25250917, "name": "ENSE00003923448", "number": "1", "start": 25250751, "version": "1"}, {"end": 25209911, "name": "ENSE00003925822", "number": "4", "start": 25205343, "version": "1"}], "name": "ENST00000692768", "start": 25205343, "translations": [{"biotype": "CDS", "end": 25227325, "name": "ENSP00000510254", "start": 25209798, "version": "1"}], "version": "1"}, {"aliases": ["KRAS-214"], "biotype": "transcript", "end": 25250899, "exons": [{"end": 25245395, "name": "ENSE00000936617", "number": "2", "start": 25245274, "version": "1"}, {"end": 25225773, "name": "ENSE00001644818", "number": "1;4;3", "start": 25225614, "version": "1"}, {"end": 25227337, "name": "ENSE00003923411", "number": "3", "start": 25227234, "version": "1"}, {"end": 25209911, "name": "ENSE00003927775", "number": "5", "start": 25205260, "version": "1"}, {"end": 25250899, "name": "ENSE00003938559", "number": "1", "start": 25250751, "version": "1"}], "name": "ENST00000693229", "start": 25205260, "translations": [{"biotype": "CDS", "end": 25245384, "name": "ENSP00000509223", "start": 25209798, "version": "1"}], "version": "1"}], "version": "14"}, {"aliases": ["KRASP1"], "biotype": "gene", "chr": "6", "end": 54771134, "name": "ENSG00000220635", "start": 54770583, "strand": "+", "transcripts": [{"aliases": ["KRASP1-201"], "biotype": "transcript", "end": 54771134, "exons": [{"end": 54771134, "name": "ENSE00001550689", "number": "1", "start": 54770583, "version": "2"}], "name": "ENST00000407852", "start": 54770583, "version": "2"}], "version": "2"}]} \ No newline at end of file diff --git 
a/tests/tools/data/K02718.1.gff3 b/tests/tools/data/K02718.1.gff3 new file mode 100644 index 00000000..22645792 --- /dev/null +++ b/tests/tools/data/K02718.1.gff3 @@ -0,0 +1,24 @@ +K02718.1 Genbank CDS 1140 2813 . + 0 ID=cds-AAA46936.1;Parent=gene-E1;Dbxref=NCBI_GP:AAA46936.1;Name=AAA46936.1;Note=E1 interrupted ORF from 859 to 2813%3B putative;gbkey=CDS;gene=E1;product=replication protein;protein_id=AAA46936.1 +K02718.1 Genbank CDS 2755 3852 . + 0 ID=cds-AAA46941.1;Parent=gene-E2;Dbxref=NCBI_GP:AAA46941.1;Name=AAA46941.1;Note=E2 ORF from 2725 to 3852%3B putative;gbkey=CDS;gene=E2;product=regulatory protein;protein_id=AAA46941.1 +K02718.1 Genbank CDS 3332 3619 . + 0 ID=cds-AAA46937.1;Parent=gene-E4;Dbxref=NCBI_GP:AAA46937.1;Name=AAA46937.1;gbkey=CDS;gene=E4;partial=true;product=AAA46937.1;protein_id=AAA46937.1;start_range=.,3332 +K02718.1 Genbank CDS 3863 4099 . + 0 ID=cds-AAA46938.1;Parent=gene-E5;Dbxref=NCBI_GP:AAA46938.1;Name=AAA46938.1;gbkey=CDS;gene=E5;partial=true;product=AAA46938.1;protein_id=AAA46938.1;start_range=.,3863 +K02718.1 Genbank CDS 4235 5656 . + 0 ID=cds-AAA46942.1;Parent=gene-L2;Dbxref=NCBI_GP:AAA46942.1;Name=AAA46942.1;Note=L2 ORF from 4133 to 5656%3B putative;gbkey=CDS;gene=L2;product=minor capsid protein;protein_id=AAA46942.1 +K02718.1 Genbank CDS 5559 7154 . + 0 ID=cds-AAA46943.1;Parent=gene-L1;Dbxref=NCBI_GP:AAA46943.1;Name=AAA46943.1;Note=L1 ORF from 5526 to 7154%3B putative;gbkey=CDS;gene=L1;product=major capsid protein;protein_id=AAA46943.1 +K02718.1 Genbank CDS 562 858 . + 0 ID=cds-AAA46940.1;Parent=gene-E7;Dbxref=NCBI_GP:AAA46940.1;Name=AAA46940.1;Note=E7 ORF from 544 to 858%3B putative;gbkey=CDS;gene=E7;product=transforming protein;protein_id=AAA46940.1 +K02718.1 Genbank CDS 83 559 . + 0 ID=cds-AAA46939.1;Parent=gene-E6;Dbxref=NCBI_GP:AAA46939.1;Name=AAA46939.1;Note=E6 ORF from 65 to 559%3B putative;gbkey=CDS;gene=E6;product=transforming protein;protein_id=AAA46939.1 +K02718.1 Genbank CDS 865 1140 . 
+ 0 ID=cds-AAA46936.1;Parent=gene-E1;Dbxref=NCBI_GP:AAA46936.1;Name=AAA46936.1;Note=E1 interrupted ORF from 859 to 2813%3B putative;gbkey=CDS;gene=E1;product=replication protein;protein_id=AAA46936.1 +K02718.1 Genbank gene 1140 2813 . + . ID=gene-E1;Name=E1;gbkey=Gene;gene=E1;gene_biotype=protein_coding +K02718.1 Genbank gene 2755 3852 . + . ID=gene-E2;Name=E2;gbkey=Gene;gene=E2;gene_biotype=protein_coding +K02718.1 Genbank gene 3332 3619 . + . ID=gene-E4;Name=E4;gbkey=Gene;gene=E4;gene_biotype=protein_coding +K02718.1 Genbank gene 3863 4099 . + . ID=gene-E5;Name=E5;gbkey=Gene;gene=E5;gene_biotype=protein_coding +K02718.1 Genbank gene 4235 5656 . + . ID=gene-L2;Name=L2;gbkey=Gene;gene=L2;gene_biotype=protein_coding +K02718.1 Genbank gene 5559 7154 . + . ID=gene-L1;Name=L1;gbkey=Gene;gene=L1;gene_biotype=protein_coding +K02718.1 Genbank gene 562 858 . + . ID=gene-E7;Name=E7;gbkey=Gene;gene=E7;gene_biotype=protein_coding +K02718.1 Genbank gene 83 559 . + . ID=gene-E6;Name=E6;gbkey=Gene;gene=E6;gene_biotype=protein_coding +K02718.1 Genbank gene 865 1140 . + . ID=gene-E1;Name=E1;gbkey=Gene;gene=E1;gene_biotype=protein_coding +K02718.1 Genbank region 17 23 . + . ID=id-K02718.1:17..23;gbkey=TATA_signal +K02718.1 Genbank region 1 7904 . + . ID=K02718.1:1..7904;Dbxref=taxon:333760;Is_circular=true;gbkey=Src;mol_type=genomic DNA +K02718.1 Genbank region 4213 4218 . + . ID=id-K02718.1:4213..4218;Note=putative;gbkey=polyA_signal +K02718.1 Genbank region 4289 4295 . + . ID=id-L2;gbkey=TATA_signal;gene=L2 +K02718.1 Genbank region 65 71 . + . ID=id-K02718.1:65..71;gbkey=TATA_signal +K02718.1 Genbank region 7260 7265 . + . 
ID=id-K02718.1:7260..7265;gbkey=polyA_signal diff --git a/tests/tools/data/K02718.1.gff3.json b/tests/tools/data/K02718.1.gff3.json new file mode 100644 index 00000000..f34a4ec3 --- /dev/null +++ b/tests/tools/data/K02718.1.gff3.json @@ -0,0 +1,243 @@ +{ + "genes": [ + { + "biotype": "region", + "chr": "K02718.1", + "end": 7904, + "name": "K02718.1:1..7904", + "start": 1, + "strand": "+" + }, + { + "biotype": "region", + "chr": "K02718.1", + "end": 23, + "name": "id-K02718.1:17..23", + "start": 17, + "strand": "+" + }, + { + "biotype": "region", + "chr": "K02718.1", + "end": 71, + "name": "id-K02718.1:65..71", + "start": 65, + "strand": "+" + }, + { + "biotype": "region", + "chr": "K02718.1", + "end": 7265, + "name": "id-K02718.1:7260..7265", + "start": 7260, + "strand": "+" + }, + { + "biotype": "region", + "chr": "K02718.1", + "end": 4295, + "name": "id-L2", + "start": 4289, + "strand": "+" + }, + { + "biotype": "region", + "chr": "K02718.1", + "end": 4218, + "name": "id-K02718.1:4213..4218", + "start": 4213, + "strand": "+", + "note": "putative" + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 2813, + "name": "E1", + "start": 865, + "strand": "+", + "transcripts": [ + { + "end": 2813, + "name": "E1_T", + "start": 865, + "translations": [ + { + "biotype": "CDS", + "end": 2813, + "name": "AAA46936.1", + "note": "E1 interrupted ORF from 859 to 2813%3B putative", + "start": 865 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 3852, + "name": "E2", + "start": 2755, + "strand": "+", + "transcripts": [ + { + "end": 3852, + "name": "E2_T", + "start": 2755, + "translations": [ + { + "biotype": "CDS", + "end": 3852, + "name": "AAA46941.1", + "note": "E2 ORF from 2725 to 3852%3B putative", + "start": 2755 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 3619, + "name": "E4", + "start": 3332, + "strand": "+", + "transcripts": [ + { + "end": 3619, + "name": "E4_T", + "start": 3332, + "translations": [ + { + 
"biotype": "CDS", + "end": 3619, + "name": "AAA46937.1", + "start": 3332 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 4099, + "name": "E5", + "start": 3863, + "strand": "+", + "transcripts": [ + { + "end": 4099, + "name": "E5_T", + "start": 3863, + "translations": [ + { + "biotype": "CDS", + "end": 4099, + "name": "AAA46938.1", + "start": 3863 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 5656, + "name": "L2", + "start": 4235, + "strand": "+", + "transcripts": [ + { + "end": 5656, + "name": "L2_T", + "start": 4235, + "translations": [ + { + "biotype": "CDS", + "end": 5656, + "name": "AAA46942.1", + "note": "L2 ORF from 4133 to 5656%3B putative", + "start": 4235 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 7154, + "name": "L1", + "start": 5559, + "strand": "+", + "transcripts": [ + { + "end": 7154, + "name": "L1_T", + "start": 5559, + "translations": [ + { + "biotype": "CDS", + "end": 7154, + "name": "AAA46943.1", + "note": "L1 ORF from 5526 to 7154%3B putative", + "start": 5559 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 559, + "name": "E6", + "start": 83, + "strand": "+", + "transcripts": [ + { + "end": 559, + "name": "E6_T", + "start": 83, + "translations": [ + { + "biotype": "CDS", + "end": 559, + "name": "AAA46939.1", + "note": "E6 ORF from 65 to 559%3B putative", + "start": 83 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 858, + "name": "E7", + "start": 562, + "strand": "+", + "transcripts": [ + { + "end": 858, + "name": "E7_T", + "start": 562, + "translations": [ + { + "biotype": "CDS", + "end": 858, + "name": "AAA46940.1", + "note": "E7 ORF from 544 to 858%3B putative", + "start": 562 + } + ] + } + ] + } + ] +} diff --git a/tests/tools/data/K02718.1.gtf b/tests/tools/data/K02718.1.gtf new file mode 100644 index 00000000..94cb884a --- /dev/null +++ b/tests/tools/data/K02718.1.gtf @@ -0,0 +1,32 @@ +K02718.1 
Genbank gene 83 559 . + . gene_id "E6"; transcript_id ""; gbkey "Gene"; gene "E6"; gene_biotype "protein_coding"; +K02718.1 Genbank CDS 83 556 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; note "E6 ORF from 65 to 559; putative"; product "transforming protein"; protein_id "AAA46939.1"; exon_number "1"; +K02718.1 Genbank start_codon 83 85 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; note "E6 ORF from 65 to 559; putative"; product "transforming protein"; protein_id "AAA46939.1"; exon_number "1"; +K02718.1 Genbank stop_codon 557 559 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; note "E6 ORF from 65 to 559; putative"; product "transforming protein"; protein_id "AAA46939.1"; exon_number "1"; +K02718.1 Genbank gene 562 858 . + . gene_id "E7"; transcript_id ""; gbkey "Gene"; gene "E7"; gene_biotype "protein_coding"; +K02718.1 Genbank CDS 562 855 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; note "E7 ORF from 544 to 858; putative"; product "transforming protein"; protein_id "AAA46940.1"; exon_number "1"; +K02718.1 Genbank start_codon 562 564 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; note "E7 ORF from 544 to 858; putative"; product "transforming protein"; protein_id "AAA46940.1"; exon_number "1"; +K02718.1 Genbank stop_codon 856 858 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; note "E7 ORF from 544 to 858; putative"; product "transforming protein"; protein_id "AAA46940.1"; exon_number "1"; +K02718.1 Genbank gene 865 1140 . + . gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "protein_coding"; part "1"; +K02718.1 Genbank gene 1140 2813 . + . gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "protein_coding"; part "2"; +K02718.1 Genbank CDS 865 1140 . 
+ 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "E1 interrupted ORF from 859 to 2813; putative"; product "replication protein"; protein_id "AAA46936.1"; exon_number "1"; +K02718.1 Genbank CDS 1140 2810 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "E1 interrupted ORF from 859 to 2813; putative"; product "replication protein"; protein_id "AAA46936.1"; exon_number "2"; +K02718.1 Genbank start_codon 865 867 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "E1 interrupted ORF from 859 to 2813; putative"; product "replication protein"; protein_id "AAA46936.1"; exon_number "1"; +K02718.1 Genbank stop_codon 2811 2813 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "E1 interrupted ORF from 859 to 2813; putative"; product "replication protein"; protein_id "AAA46936.1"; exon_number "2"; +K02718.1 Genbank gene 2755 3852 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "protein_coding"; +K02718.1 Genbank CDS 2755 3849 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; note "E2 ORF from 2725 to 3852; putative"; product "regulatory protein"; protein_id "AAA46941.1"; exon_number "1"; +K02718.1 Genbank start_codon 2755 2757 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; note "E2 ORF from 2725 to 3852; putative"; product "regulatory protein"; protein_id "AAA46941.1"; exon_number "1"; +K02718.1 Genbank stop_codon 3850 3852 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; note "E2 ORF from 2725 to 3852; putative"; product "regulatory protein"; protein_id "AAA46941.1"; exon_number "1"; +K02718.1 Genbank gene 3332 3619 . + . gene_id "E4"; transcript_id ""; gbkey "Gene"; gene "E4"; gene_biotype "protein_coding"; +K02718.1 Genbank CDS 3332 3616 . 
+ 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; gene "E4"; partial "true"; product "AAA46937.1"; protein_id "AAA46937.1"; exon_number "1"; +K02718.1 Genbank stop_codon 3617 3619 . + 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; gene "E4"; partial "true"; product "AAA46937.1"; protein_id "AAA46937.1"; exon_number "1"; +K02718.1 Genbank gene 3863 4099 . + . gene_id "E5"; transcript_id ""; gbkey "Gene"; gene "E5"; gene_biotype "protein_coding"; +K02718.1 Genbank CDS 3863 4096 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; partial "true"; product "AAA46938.1"; protein_id "AAA46938.1"; exon_number "1"; +K02718.1 Genbank stop_codon 4097 4099 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; partial "true"; product "AAA46938.1"; protein_id "AAA46938.1"; exon_number "1"; +K02718.1 Genbank gene 4235 5656 . + . gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "protein_coding"; +K02718.1 Genbank CDS 4235 5653 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; note "L2 ORF from 4133 to 5656; putative"; product "minor capsid protein"; protein_id "AAA46942.1"; exon_number "1"; +K02718.1 Genbank start_codon 4235 4237 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; note "L2 ORF from 4133 to 5656; putative"; product "minor capsid protein"; protein_id "AAA46942.1"; exon_number "1"; +K02718.1 Genbank stop_codon 5654 5656 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; note "L2 ORF from 4133 to 5656; putative"; product "minor capsid protein"; protein_id "AAA46942.1"; exon_number "1"; +K02718.1 Genbank gene 5559 7154 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; gene "L1"; gene_biotype "protein_coding"; +K02718.1 Genbank CDS 5559 7151 . 
+ 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; note "L1 ORF from 5526 to 7154; putative"; product "major capsid protein"; protein_id "AAA46943.1"; exon_number "1"; +K02718.1 Genbank start_codon 5559 5561 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; note "L1 ORF from 5526 to 7154; putative"; product "major capsid protein"; protein_id "AAA46943.1"; exon_number "1"; +K02718.1 Genbank stop_codon 7152 7154 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; note "L1 ORF from 5526 to 7154; putative"; product "major capsid protein"; protein_id "AAA46943.1"; exon_number "1"; diff --git a/tests/tools/data/K02718.1.gtf.json b/tests/tools/data/K02718.1.gtf.json new file mode 100644 index 00000000..71866865 --- /dev/null +++ b/tests/tools/data/K02718.1.gtf.json @@ -0,0 +1,188 @@ +{ + "genes": [ + { + "biotype": "gene", + "chr": "K02718.1", + "end": 2813, + "name": "E1", + "start": 865, + "strand": "+", + "transcripts": [ + { + "end": 2810, + "name": "E1_T", + "start": 865, + "translations": [ + { + "biotype": "CDS", + "end": 2810, + "name": "AAA46936.1", + "start": 865 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 3852, + "name": "E2", + "start": 2755, + "strand": "+", + "transcripts": [ + { + "end": 3849, + "name": "E2_T", + "start": 2755, + "translations": [ + { + "biotype": "CDS", + "end": 3849, + "name": "AAA46941.1", + "start": 2755 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 3619, + "name": "E4", + "start": 3332, + "strand": "+", + "transcripts": [ + { + "end": 3616, + "name": "E4_T", + "start": 3332, + "translations": [ + { + "biotype": "CDS", + "end": 3616, + "name": "AAA46937.1", + "start": 3332 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 4099, + "name": "E5", + "start": 3863, + "strand": "+", + "transcripts": [ + { + "end": 4096, + "name": "E5_T", + "start": 3863, + 
"translations": [ + { + "biotype": "CDS", + "end": 4096, + "name": "AAA46938.1", + "start": 3863 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 559, + "name": "E6", + "start": 83, + "strand": "+", + "transcripts": [ + { + "end": 556, + "name": "E6_T", + "start": 83, + "translations": [ + { + "biotype": "CDS", + "end": 556, + "name": "AAA46939.1", + "start": 83 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 858, + "name": "E7", + "start": 562, + "strand": "+", + "transcripts": [ + { + "end": 855, + "name": "E7_T", + "start": 562, + "translations": [ + { + "biotype": "CDS", + "end": 855, + "name": "AAA46940.1", + "start": 562 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 7154, + "name": "L1", + "start": 5559, + "strand": "+", + "transcripts": [ + { + "end": 7151, + "name": "L1_T", + "start": 5559, + "translations": [ + { + "biotype": "CDS", + "end": 7151, + "name": "AAA46943.1", + "start": 5559 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 5656, + "name": "L2", + "start": 4235, + "strand": "+", + "transcripts": [ + { + "end": 5653, + "name": "L2_T", + "start": 4235, + "translations": [ + { + "biotype": "CDS", + "end": 5653, + "name": "AAA46942.1", + "start": 4235 + } + ] + } + ] + } + ] +} diff --git a/tests/tools/data/example_genes.v2.json b/tests/tools/data/example_genes.v2.json new file mode 100644 index 00000000..f508d6d0 --- /dev/null +++ b/tests/tools/data/example_genes.v2.json @@ -0,0 +1,7700 @@ +{ + "genes": [ + { + "aliases": [ + "EGFR" + ], + "chr": "7", + "end": 55324313, + "name": "ENSG00000146648", + "start": 55086714, + "strand": "+", + "transcripts": [ + { + "cdna_coding_end": 3533, + "cdna_coding_start": 258, + "domains": [ + { + "name": "PIRSF000619", + "regions": [ + { + "end": 1090, + "start": 1 + } + ] + }, + { + "name": "PF07714", + "regions": [ + { + "end": 920, + "start": 669 + } + ] + }, + { + "name": "SSF52058", + "regions": [ 
+ { + "end": 191, + "start": 28 + }, + { + "end": 475, + "start": 283 + } + ] + }, + { + "name": "PF00757", + "regions": [ + { + "end": 293, + "start": 141 + } + ] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 934, + "start": 667 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 219, + "start": 145 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 290, + "start": 142 + }, + { + "end": 593, + "start": 460 + } + ] + }, + { + "name": "PR00109", + "regions": [ + { + "end": 758, + "start": 745 + }, + { + "end": 800, + "start": 782 + }, + { + "end": 841, + "start": 831 + }, + { + "end": 872, + "start": 850 + }, + { + "end": 916, + "start": 894 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 975, + "start": 651 + } + ] + }, + { + "name": "PF01030", + "regions": [ + { + "end": 141, + "start": 57 + }, + { + "end": 435, + "start": 316 + } + ] + }, + { + "name": "SM00220", + "regions": [ + { + "end": 924, + "start": 667 + } + ] + }, + { + "name": "SM00261", + "regions": [ + { + "end": 225, + "start": 183 + }, + { + "end": 502, + "start": 451 + }, + { + "end": 556, + "start": 507 + } + ] + }, + { + "name": "SM00219", + "regions": [ + { + "end": 923, + "start": 667 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 919, + "start": 667 + } + ] + } + ], + "end": 55270769, + "exons": [ + { + "end": 55087058, + "start": 55086714 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55219055, + "start": 55218987 + }, + { + "end": 55220357, + "start": 55220239 + }, + { + "end": 55221845, + "start": 55221704 + }, + { + "end": 55223639, + "start": 55223523 + }, + { + "end": 55224352, + "start": 55224226 + }, + { + "end": 55224525, + "start": 55224452 + }, + { + "end": 55225446, + "start": 55225356 + }, + { + "end": 55228031, + "start": 55227832 + }, + { + "end": 55229324, + "start": 55229192 + }, + { + "end": 55231516, + "start": 55231426 + }, + { + 
"end": 55233130, + "start": 55232973 + }, + { + "end": 55238906, + "start": 55238868 + }, + { + "end": 55240817, + "start": 55240676 + }, + { + "end": 55241736, + "start": 55241614 + }, + { + "end": 55242513, + "start": 55242415 + }, + { + "end": 55249171, + "start": 55248986 + }, + { + "end": 55259567, + "start": 55259412 + }, + { + "end": 55260534, + "start": 55260459 + }, + { + "end": 55266556, + "start": 55266410 + }, + { + "end": 55268106, + "start": 55268009 + }, + { + "end": 55269048, + "start": 55268881 + }, + { + "end": 55269475, + "start": 55269428 + }, + { + "end": 55270769, + "start": 55270210 + } + ], + "is_best_transcript": false, + "name": "ENST00000455089", + "start": 55086714 + }, + { + "cdna_coding_end": 2133, + "cdna_coding_start": 247, + "domains": [ + { + "name": "PS50311", + "regions": [ + { + "end": 264, + "start": 187 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 339, + "start": 182 + }, + { + "end": 624, + "start": 505 + } + ] + }, + { + "name": "SSF52058", + "regions": [ + { + "end": 211, + "start": 29 + }, + { + "end": 520, + "start": 328 + } + ] + }, + { + "name": "PF00757", + "regions": [ + { + "end": 338, + "start": 185 + } + ] + }, + { + "name": "SM00261", + "regions": [ + { + "end": 270, + "start": 228 + }, + { + "end": 547, + "start": 496 + }, + { + "end": 601, + "start": 552 + } + ] + }, + { + "name": "PF01030", + "regions": [ + { + "end": 167, + "start": 57 + }, + { + "end": 480, + "start": 361 + } + ] + } + ], + "end": 55236328, + "exons": [ + { + "end": 55087058, + "start": 55086725 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55214433, + "start": 55214299 + }, + { + "end": 55219055, + "start": 55218987 + }, + { + "end": 55220357, + "start": 55220239 + }, + { + "end": 55221845, + "start": 55221704 + }, + { + "end": 55223639, + "start": 55223523 + }, + { + "end": 55224352, + "start": 55224226 + }, + { + "end": 55224525, + "start": 55224452 + 
}, + { + "end": 55225446, + "start": 55225356 + }, + { + "end": 55228031, + "start": 55227832 + }, + { + "end": 55229324, + "start": 55229192 + }, + { + "end": 55231516, + "start": 55231426 + }, + { + "end": 55233130, + "start": 55232973 + }, + { + "end": 55236328, + "start": 55236216 + } + ], + "is_best_transcript": false, + "name": "ENST00000342916", + "start": 55086725 + }, + { + "cdna_coding_end": 2363, + "cdna_coding_start": 246, + "domains": [ + { + "name": "SSF57184", + "regions": [ + { + "end": 339, + "start": 182 + }, + { + "end": 624, + "start": 505 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 264, + "start": 187 + } + ] + }, + { + "name": "PF00757", + "regions": [ + { + "end": 338, + "start": 185 + } + ] + }, + { + "name": "SSF52058", + "regions": [ + { + "end": 211, + "start": 29 + }, + { + "end": 520, + "start": 328 + } + ] + }, + { + "name": "SM00261", + "regions": [ + { + "end": 270, + "start": 228 + }, + { + "end": 547, + "start": 496 + }, + { + "end": 601, + "start": 552 + } + ] + }, + { + "name": "PF01030", + "regions": [ + { + "end": 167, + "start": 57 + }, + { + "end": 480, + "start": 361 + } + ] + } + ], + "end": 55238738, + "exons": [ + { + "end": 55087058, + "start": 55086726 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55214433, + "start": 55214299 + }, + { + "end": 55219055, + "start": 55218987 + }, + { + "end": 55220357, + "start": 55220239 + }, + { + "end": 55221845, + "start": 55221704 + }, + { + "end": 55223639, + "start": 55223523 + }, + { + "end": 55224352, + "start": 55224226 + }, + { + "end": 55224525, + "start": 55224452 + }, + { + "end": 55225446, + "start": 55225356 + }, + { + "end": 55228031, + "start": 55227832 + }, + { + "end": 55229324, + "start": 55229192 + }, + { + "end": 55231516, + "start": 55231426 + }, + { + "end": 55233130, + "start": 55232973 + }, + { + "end": 55238738, + "start": 55238000 + } + ], + "is_best_transcript": false, 
+ "name": "ENST00000344576", + "start": 55086726 + }, + { + "cdna_coding_end": 1462, + "cdna_coding_start": 245, + "domains": [ + { + "name": "SSF57184", + "regions": [ + { + "end": 339, + "start": 182 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 264, + "start": 187 + } + ] + }, + { + "name": "PF00757", + "regions": [ + { + "end": 338, + "start": 185 + } + ] + }, + { + "name": "SSF52058", + "regions": [ + { + "end": 211, + "start": 29 + }, + { + "end": 403, + "start": 328 + } + ] + }, + { + "name": "SM00261", + "regions": [ + { + "end": 270, + "start": 228 + } + ] + }, + { + "name": "PF01030", + "regions": [ + { + "end": 167, + "start": 57 + } + ] + } + ], + "end": 55224644, + "exons": [ + { + "end": 55087058, + "start": 55086727 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55214433, + "start": 55214299 + }, + { + "end": 55219055, + "start": 55218987 + }, + { + "end": 55220357, + "start": 55220239 + }, + { + "end": 55221845, + "start": 55221704 + }, + { + "end": 55223639, + "start": 55223523 + }, + { + "end": 55224352, + "start": 55224226 + }, + { + "end": 55224644, + "start": 55224452 + } + ], + "is_best_transcript": false, + "name": "ENST00000420316", + "start": 55086727 + }, + { + "cdna_coding_end": 3810, + "cdna_coding_start": 178, + "domains": [ + { + "name": "SM00220", + "regions": [ + { + "end": 969, + "start": 712 + } + ] + }, + { + "name": "PF01030", + "regions": [ + { + "end": 167, + "start": 57 + }, + { + "end": 480, + "start": 361 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 1020, + "start": 696 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 964, + "start": 712 + } + ] + }, + { + "name": "SM00219", + "regions": [ + { + "end": 968, + "start": 712 + } + ] + }, + { + "name": "SM00261", + "regions": [ + { + "end": 270, + "start": 228 + }, + { + "end": 547, + "start": 496 + }, + { + "end": 601, + "start": 552 + } + ] + }, + { + 
"name": "PF00757", + "regions": [ + { + "end": 338, + "start": 185 + } + ] + }, + { + "name": "SSF52058", + "regions": [ + { + "end": 211, + "start": 29 + }, + { + "end": 520, + "start": 328 + } + ] + }, + { + "name": "PF07714", + "regions": [ + { + "end": 965, + "start": 714 + } + ] + }, + { + "name": "PIRSF000619", + "regions": [ + { + "end": 1210, + "start": 1 + } + ] + }, + { + "name": "PR00109", + "regions": [ + { + "end": 803, + "start": 790 + }, + { + "end": 845, + "start": 827 + }, + { + "end": 886, + "start": 876 + }, + { + "end": 917, + "start": 895 + }, + { + "end": 961, + "start": 939 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 339, + "start": 182 + }, + { + "end": 638, + "start": 505 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 264, + "start": 187 + } + ] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 979, + "start": 712 + } + ] + } + ], + "end": 55279321, + "exons": [ + { + "end": 55087058, + "start": 55086794 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55214433, + "start": 55214299 + }, + { + "end": 55219055, + "start": 55218987 + }, + { + "end": 55220357, + "start": 55220239 + }, + { + "end": 55221845, + "start": 55221704 + }, + { + "end": 55223639, + "start": 55223523 + }, + { + "end": 55224352, + "start": 55224226 + }, + { + "end": 55224525, + "start": 55224452 + }, + { + "end": 55225446, + "start": 55225356 + }, + { + "end": 55228031, + "start": 55227832 + }, + { + "end": 55229324, + "start": 55229192 + }, + { + "end": 55231516, + "start": 55231426 + }, + { + "end": 55233130, + "start": 55232973 + }, + { + "end": 55238906, + "start": 55238868 + }, + { + "end": 55240817, + "start": 55240676 + }, + { + "end": 55241736, + "start": 55241614 + }, + { + "end": 55242513, + "start": 55242415 + }, + { + "end": 55249171, + "start": 55248986 + }, + { + "end": 55259567, + "start": 55259412 + }, + { + "end": 55260534, + "start": 
55260459 + }, + { + "end": 55266556, + "start": 55266410 + }, + { + "end": 55268106, + "start": 55268009 + }, + { + "end": 55269048, + "start": 55268881 + }, + { + "end": 55269475, + "start": 55269428 + }, + { + "end": 55270318, + "start": 55270210 + }, + { + "end": 55279321, + "start": 55272949 + } + ], + "is_best_transcript": true, + "name": "ENST00000275493", + "start": 55086794 + }, + { + "cdna_coding_end": 2134, + "cdna_coding_start": 161, + "domains": [ + { + "name": "PF01030", + "regions": [ + { + "end": 167, + "start": 57 + }, + { + "end": 480, + "start": 361 + } + ] + }, + { + "name": "SM00261", + "regions": [ + { + "end": 270, + "start": 228 + }, + { + "end": 547, + "start": 496 + }, + { + "end": 601, + "start": 552 + }, + { + "end": 653, + "start": 614 + } + ] + }, + { + "name": "SSF52058", + "regions": [ + { + "end": 211, + "start": 29 + }, + { + "end": 520, + "start": 328 + } + ] + }, + { + "name": "PF00757", + "regions": [ + { + "end": 338, + "start": 185 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 264, + "start": 187 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 339, + "start": 182 + }, + { + "end": 638, + "start": 505 + } + ] + } + ], + "end": 55324313, + "exons": [ + { + "end": 55087058, + "start": 55086811 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55214433, + "start": 55214299 + }, + { + "end": 55219055, + "start": 55218987 + }, + { + "end": 55220357, + "start": 55220239 + }, + { + "end": 55221845, + "start": 55221704 + }, + { + "end": 55223639, + "start": 55223523 + }, + { + "end": 55224352, + "start": 55224226 + }, + { + "end": 55224525, + "start": 55224452 + }, + { + "end": 55225446, + "start": 55225356 + }, + { + "end": 55228031, + "start": 55227832 + }, + { + "end": 55229324, + "start": 55229192 + }, + { + "end": 55231516, + "start": 55231426 + }, + { + "end": 55233130, + "start": 55232973 + }, + { + "end": 55238906, + "start": 
55238868 + }, + { + "end": 55240621, + "start": 55240539 + }, + { + "end": 55324313, + "start": 55323947 + } + ], + "is_best_transcript": false, + "name": "ENST00000442591", + "start": 55086811 + }, + { + "cdna_coding_end": 691, + "cdna_coding_start": 308, + "domains": [ + { + "name": "SSF52058", + "regions": [ + { + "end": 127, + "start": 1 + } + ] + }, + { + "name": "PF01030", + "regions": [ + { + "end": 114, + "start": 4 + } + ] + } + ], + "end": 55214417, + "exons": [ + { + "end": 55177651, + "start": 55177416 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55214417, + "start": 55214299 + } + ], + "is_best_transcript": false, + "name": "ENST00000450046", + "start": 55177416 + }, + { + "cdna_coding_end": 3657, + "cdna_coding_start": 184, + "domains": [ + { + "name": "SM00261", + "regions": [ + { + "end": 217, + "start": 175 + }, + { + "end": 494, + "start": 443 + }, + { + "end": 548, + "start": 499 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 911, + "start": 659 + } + ] + }, + { + "name": "SM00219", + "regions": [ + { + "end": 915, + "start": 659 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 967, + "start": 643 + } + ] + }, + { + "name": "SM00220", + "regions": [ + { + "end": 916, + "start": 659 + } + ] + }, + { + "name": "PF01030", + "regions": [ + { + "end": 114, + "start": 4 + }, + { + "end": 427, + "start": 308 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 211, + "start": 134 + } + ] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 926, + "start": 659 + } + ] + }, + { + "name": "PR00109", + "regions": [ + { + "end": 750, + "start": 737 + }, + { + "end": 792, + "start": 774 + }, + { + "end": 833, + "start": 823 + }, + { + "end": 864, + "start": 842 + }, + { + "end": 908, + "start": 886 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 286, + "start": 129 + }, + { + "end": 585, + "start": 452 + } + ] + }, + { 
+ "name": "PIRSF000619", + "regions": [ + { + "end": 1157, + "start": 1 + } + ] + }, + { + "name": "PF07714", + "regions": [ + { + "end": 912, + "start": 661 + } + ] + }, + { + "name": "SSF52058", + "regions": [ + { + "end": 158, + "start": 1 + }, + { + "end": 467, + "start": 275 + } + ] + }, + { + "name": "PF00757", + "regions": [ + { + "end": 285, + "start": 132 + } + ] + } + ], + "end": 55273591, + "exons": [ + { + "end": 55177651, + "start": 55177540 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55214433, + "start": 55214299 + }, + { + "end": 55219055, + "start": 55218987 + }, + { + "end": 55220357, + "start": 55220239 + }, + { + "end": 55221845, + "start": 55221704 + }, + { + "end": 55223639, + "start": 55223523 + }, + { + "end": 55224352, + "start": 55224226 + }, + { + "end": 55224525, + "start": 55224452 + }, + { + "end": 55225446, + "start": 55225356 + }, + { + "end": 55228031, + "start": 55227832 + }, + { + "end": 55229324, + "start": 55229192 + }, + { + "end": 55231516, + "start": 55231426 + }, + { + "end": 55233130, + "start": 55232973 + }, + { + "end": 55238906, + "start": 55238868 + }, + { + "end": 55240817, + "start": 55240676 + }, + { + "end": 55241736, + "start": 55241614 + }, + { + "end": 55242513, + "start": 55242415 + }, + { + "end": 55249171, + "start": 55248986 + }, + { + "end": 55259567, + "start": 55259412 + }, + { + "end": 55260534, + "start": 55260459 + }, + { + "end": 55266556, + "start": 55266410 + }, + { + "end": 55268106, + "start": 55268009 + }, + { + "end": 55269048, + "start": 55268881 + }, + { + "end": 55269475, + "start": 55269428 + }, + { + "end": 55270318, + "start": 55270210 + }, + { + "end": 55273591, + "start": 55272949 + } + ], + "is_best_transcript": false, + "name": "ENST00000454757", + "start": 55177540 + } + ] + }, + { + "aliases": [ + "DSTYK" + ], + "chr": "1", + "end": 205180727, + "name": "ENSG00000133059", + "start": 205111632, + "strand": "-", + 
"transcripts": [ + { + "cdna_coding_end": 1831, + "cdna_coding_start": 65, + "domains": [ + { + "name": "SM00220", + "regions": [ + { + "end": 565, + "start": 337 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 585, + "start": 452 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 556, + "start": 451 + } + ] + }, + { + "name": "PF07714", + "regions": [ + { + "end": 558, + "start": 471 + } + ] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 565, + "start": 312 + } + ] + } + ], + "end": 205180727, + "exons": [ + { + "end": 205116873, + "start": 205111632 + }, + { + "end": 205117467, + "start": 205117333 + }, + { + "end": 205119898, + "start": 205119808 + }, + { + "end": 205133083, + "start": 205133055 + }, + { + "end": 205138960, + "start": 205138291 + }, + { + "end": 205156934, + "start": 205156546 + }, + { + "end": 205180727, + "start": 205180399 + } + ], + "is_best_transcript": false, + "name": "ENST00000367160", + "start": 205111632 + }, + { + "cdna_coding_end": 2686, + "cdna_coding_start": 32, + "domains": [ + { + "name": "PF07714", + "regions": [ + { + "end": 820, + "start": 654 + } + ] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 884, + "start": 652 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 853, + "start": 627 + } + ] + }, + { + "name": "SM00220", + "regions": [ + { + "end": 861, + "start": 652 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 824, + "start": 654 + } + ] + }, + { + "name": "SM00219", + "regions": [ + { + "end": 861, + "start": 652 + } + ] + } + ], + "end": 205180694, + "exons": [ + { + "end": 205116873, + "start": 205111633 + }, + { + "end": 205119922, + "start": 205119808 + }, + { + "end": 205126514, + "start": 205126401 + }, + { + "end": 205128807, + "start": 205128675 + }, + { + "end": 205129398, + "start": 205129242 + }, + { + "end": 205130515, + "start": 205130386 + }, + { + "end": 205131340, + "start": 205131164 + }, + { + "end": 205132134, 
+ "start": 205132051 + }, + { + "end": 205133083, + "start": 205132851 + }, + { + "end": 205138960, + "start": 205138291 + }, + { + "end": 205156934, + "start": 205156546 + }, + { + "end": 205180694, + "start": 205180399 + } + ], + "is_best_transcript": false, + "name": "ENST00000367161", + "start": 205111633 + }, + { + "cdna_coding_end": 2821, + "cdna_coding_start": 32, + "domains": [ + { + "name": "PF07714", + "regions": [ + { + "end": 899, + "start": 654 + } + ] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 906, + "start": 652 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 897, + "start": 638 + } + ] + }, + { + "name": "SM00220", + "regions": [ + { + "end": 906, + "start": 652 + } + ] + }, + { + "name": "SM00219", + "regions": [ + { + "end": 906, + "start": 652 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 897, + "start": 654 + } + ] + } + ], + "end": 205180694, + "exons": [ + { + "end": 205116873, + "start": 205111633 + }, + { + "end": 205117467, + "start": 205117333 + }, + { + "end": 205119922, + "start": 205119808 + }, + { + "end": 205126514, + "start": 205126401 + }, + { + "end": 205128807, + "start": 205128675 + }, + { + "end": 205129398, + "start": 205129242 + }, + { + "end": 205130515, + "start": 205130386 + }, + { + "end": 205131340, + "start": 205131164 + }, + { + "end": 205132134, + "start": 205132051 + }, + { + "end": 205133083, + "start": 205132851 + }, + { + "end": 205138960, + "start": 205138291 + }, + { + "end": 205156934, + "start": 205156546 + }, + { + "end": 205180694, + "start": 205180399 + } + ], + "is_best_transcript": true, + "name": "ENST00000367162", + "start": 205111633 + } + ] + }, + { + "aliases": [ + "NDUFA12" + ], + "chr": "12", + "end": 95397546, + "name": "ENSG00000184752", + "start": 95290831, + "strand": "-", + "transcripts": [ + { + "domains": [ + ], + "end": 95397436, + "exons": [ + { + "end": 95291086, + "start": 95290831 + }, + { + "end": 95318582, + "start": 95318422 + }, + 
{ + "end": 95322039, + "start": 95321793 + }, + { + "end": 95396597, + "start": 95396515 + }, + { + "end": 95397436, + "start": 95397371 + } + ], + "is_best_transcript": false, + "name": "ENST00000552205", + "start": 95290831 + }, + { + "cdna_coding_end": 188, + "cdna_coding_start": 21, + "domains": [ + ], + "end": 95397476, + "exons": [ + { + "end": 95365261, + "start": 95365108 + }, + { + "end": 95396597, + "start": 95396582 + }, + { + "end": 95397476, + "start": 95397371 + } + ], + "is_best_transcript": false, + "name": "ENST00000547157", + "start": 95365108 + }, + { + "cdna_coding_end": 144, + "cdna_coding_start": 1, + "domains": [ + { + "name": "PF05071", + "regions": [ + { + "end": 33, + "start": 12 + } + ] + } + ], + "end": 95397384, + "exons": [ + { + "end": 95365396, + "start": 95365109 + }, + { + "end": 95388033, + "start": 95387946 + }, + { + "end": 95390752, + "start": 95390680 + }, + { + "end": 95396597, + "start": 95396515 + }, + { + "end": 95397384, + "start": 95397371 + } + ], + "is_best_transcript": false, + "name": "ENST00000551991", + "start": 95365109 + }, + { + "cdna_coding_end": 528, + "cdna_coding_start": 91, + "domains": [ + { + "name": "PF05071", + "regions": [ + { + "end": 137, + "start": 36 + } + ] + } + ], + "end": 95397546, + "exons": [ + { + "end": 95365396, + "start": 95365109 + }, + { + "end": 95388033, + "start": 95387946 + }, + { + "end": 95396597, + "start": 95396515 + }, + { + "end": 95397546, + "start": 95397371 + } + ], + "is_best_transcript": true, + "name": "ENST00000327772", + "start": 95365109 + }, + { + "cdna_coding_end": 225, + "cdna_coding_start": 34, + "domains": [ + { + "name": "PF05071", + "regions": [ + { + "end": 53, + "start": 36 + } + ] + } + ], + "end": 95397489, + "exons": [ + { + "end": 95365396, + "start": 95365112 + }, + { + "end": 95396597, + "start": 95396515 + }, + { + "end": 95397489, + "start": 95397371 + } + ], + "is_best_transcript": false, + "name": "ENST00000547986", + "start": 95365112 + }, + { + 
"cdna_coding_end": 368, + "cdna_coding_start": 69, + "domains": [ + { + "name": "PF05071", + "regions": [ + { + "end": 87, + "start": 36 + } + ] + } + ], + "end": 95397524, + "exons": [ + { + "end": 95365396, + "start": 95365254 + }, + { + "end": 95366265, + "start": 95366171 + }, + { + "end": 95388033, + "start": 95387946 + }, + { + "end": 95396597, + "start": 95396515 + }, + { + "end": 95397524, + "start": 95397371 + } + ], + "is_best_transcript": false, + "name": "ENST00000546788", + "start": 95365254 + } + ] + }, + { + "aliases": [ + "FRMD6" + ], + "chr": "14", + "end": 52197445, + "name": "ENSG00000139926", + "start": 51955818, + "strand": "+", + "transcripts": [ + { + "cdna_coding_end": 2338, + "cdna_coding_start": 494, + "domains": [ + { + "name": "PF09379", + "regions": [ + { + "end": 109, + "start": 20 + } + ] + }, + { + "name": "PF09380", + "regions": [ + { + "end": 322, + "start": 237 + } + ] + }, + { + "name": "SSF50729", + "regions": [ + { + "end": 375, + "start": 219 + } + ] + }, + { + "name": "SM00295", + "regions": [ + { + "end": 226, + "start": 12 + } + ] + }, + { + "name": "PS50057", + "regions": [ + { + "end": 320, + "start": 16 + } + ] + }, + { + "name": "PF00373", + "regions": [ + { + "end": 226, + "start": 115 + } + ] + }, + { + "name": "SSF47031", + "regions": [ + { + "end": 218, + "start": 110 + } + ] + }, + { + "name": "SSF54236", + "regions": [ + { + "end": 110, + "start": 14 + } + ] + } + ], + "end": 52197177, + "exons": [ + { + "end": 51956138, + "start": 51955855 + }, + { + "end": 52037128, + "start": 52037066 + }, + { + "end": 52156653, + "start": 52156409 + }, + { + "end": 52164950, + "start": 52164860 + }, + { + "end": 52167853, + "start": 52167774 + }, + { + "end": 52169306, + "start": 52169230 + }, + { + "end": 52171653, + "start": 52171467 + }, + { + "end": 52174951, + "start": 52174796 + }, + { + "end": 52178314, + "start": 52178249 + }, + { + "end": 52179269, + "start": 52179201 + }, + { + "end": 52182217, + "start": 52182043 + 
}, + { + "end": 52187108, + "start": 52186773 + }, + { + "end": 52188798, + "start": 52188667 + }, + { + "end": 52192588, + "start": 52192497 + }, + { + "end": 52197177, + "start": 52194463 + } + ], + "is_best_transcript": false, + "name": "ENST00000356218", + "start": 51955855 + }, + { + "cdna_coding_end": 2130, + "cdna_coding_start": 286, + "domains": [ + { + "name": "PF00373", + "regions": [ + { + "end": 226, + "start": 115 + } + ] + }, + { + "name": "SSF47031", + "regions": [ + { + "end": 218, + "start": 110 + } + ] + }, + { + "name": "SSF54236", + "regions": [ + { + "end": 110, + "start": 14 + } + ] + }, + { + "name": "PS50057", + "regions": [ + { + "end": 320, + "start": 16 + } + ] + }, + { + "name": "SM00295", + "regions": [ + { + "end": 226, + "start": 12 + } + ] + }, + { + "name": "SSF50729", + "regions": [ + { + "end": 375, + "start": 219 + } + ] + }, + { + "name": "PF09380", + "regions": [ + { + "end": 322, + "start": 237 + } + ] + }, + { + "name": "PF09379", + "regions": [ + { + "end": 109, + "start": 20 + } + ] + } + ], + "end": 52197445, + "exons": [ + { + "end": 52118714, + "start": 52118576 + }, + { + "end": 52156653, + "start": 52156409 + }, + { + "end": 52164950, + "start": 52164860 + }, + { + "end": 52167853, + "start": 52167774 + }, + { + "end": 52169306, + "start": 52169230 + }, + { + "end": 52171653, + "start": 52171467 + }, + { + "end": 52174951, + "start": 52174796 + }, + { + "end": 52178314, + "start": 52178249 + }, + { + "end": 52179269, + "start": 52179201 + }, + { + "end": 52182217, + "start": 52182043 + }, + { + "end": 52187108, + "start": 52186773 + }, + { + "end": 52188798, + "start": 52188667 + }, + { + "end": 52192588, + "start": 52192497 + }, + { + "end": 52197445, + "start": 52194463 + } + ], + "is_best_transcript": true, + "name": "ENST00000395718", + "start": 52118576 + }, + { + "cdna_coding_end": 2065, + "cdna_coding_start": 197, + "domains": [ + { + "name": "PF09380", + "regions": [ + { + "end": 330, + "start": 245 + } + ] + 
}, + { + "name": "PF09379", + "regions": [ + { + "end": 117, + "start": 20 + } + ] + }, + { + "name": "SSF47031", + "regions": [ + { + "end": 226, + "start": 118 + } + ] + }, + { + "name": "PF00373", + "regions": [ + { + "end": 234, + "start": 123 + } + ] + }, + { + "name": "SSF54236", + "regions": [ + { + "end": 118, + "start": 14 + } + ] + }, + { + "name": "PS50057", + "regions": [ + { + "end": 328, + "start": 16 + } + ] + }, + { + "name": "SM00295", + "regions": [ + { + "end": 234, + "start": 12 + } + ] + }, + { + "name": "SSF50729", + "regions": [ + { + "end": 383, + "start": 227 + } + ] + } + ], + "end": 52195654, + "exons": [ + { + "end": 52118714, + "start": 52118665 + }, + { + "end": 52156653, + "start": 52156409 + }, + { + "end": 52164950, + "start": 52164860 + }, + { + "end": 52167877, + "start": 52167774 + }, + { + "end": 52169306, + "start": 52169230 + }, + { + "end": 52171653, + "start": 52171467 + }, + { + "end": 52174951, + "start": 52174796 + }, + { + "end": 52178314, + "start": 52178249 + }, + { + "end": 52179269, + "start": 52179201 + }, + { + "end": 52182217, + "start": 52182043 + }, + { + "end": 52187108, + "start": 52186773 + }, + { + "end": 52188798, + "start": 52188667 + }, + { + "end": 52192588, + "start": 52192497 + }, + { + "end": 52195654, + "start": 52194463 + } + ], + "is_best_transcript": false, + "name": "ENST00000344768", + "start": 52118665 + }, + { + "domains": [ + ], + "end": 52164945, + "exons": [ + { + "end": 52118935, + "start": 52118698 + }, + { + "end": 52156653, + "start": 52156409 + }, + { + "end": 52164945, + "start": 52164860 + } + ], + "is_best_transcript": false, + "name": "ENST00000554778", + "start": 52118698 + }, + { + "domains": [ + ], + "end": 52174806, + "exons": [ + { + "end": 52164950, + "start": 52164706 + }, + { + "end": 52167877, + "start": 52167774 + }, + { + "end": 52169306, + "start": 52169230 + }, + { + "end": 52171653, + "start": 52171467 + }, + { + "end": 52174806, + "start": 52174796 + } + ], + 
"is_best_transcript": false, + "name": "ENST00000555936", + "start": 52164706 + }, + { + "cdna_coding_end": 1775, + "cdna_coding_start": 138, + "domains": [ + { + "name": "SSF50729", + "regions": [ + { + "end": 306, + "start": 150 + } + ] + }, + { + "name": "PS50057", + "regions": [ + { + "end": 251, + "start": 1 + } + ] + }, + { + "name": "SSF54236", + "regions": [ + { + "end": 41, + "start": 1 + } + ] + }, + { + "name": "SSF47031", + "regions": [ + { + "end": 149, + "start": 41 + } + ] + }, + { + "name": "PF00373", + "regions": [ + { + "end": 157, + "start": 46 + } + ] + }, + { + "name": "PF09380", + "regions": [ + { + "end": 253, + "start": 168 + } + ] + } + ], + "end": 52197148, + "exons": [ + { + "end": 52164950, + "start": 52164831 + }, + { + "end": 52167853, + "start": 52167774 + }, + { + "end": 52169306, + "start": 52169230 + }, + { + "end": 52171653, + "start": 52171467 + }, + { + "end": 52174951, + "start": 52174796 + }, + { + "end": 52178314, + "start": 52178249 + }, + { + "end": 52179269, + "start": 52179201 + }, + { + "end": 52182217, + "start": 52182043 + }, + { + "end": 52187108, + "start": 52186773 + }, + { + "end": 52188798, + "start": 52188667 + }, + { + "end": 52192588, + "start": 52192497 + }, + { + "end": 52197148, + "start": 52194463 + } + ], + "is_best_transcript": false, + "name": "ENST00000554167", + "start": 52164831 + }, + { + "cdna_coding_end": 390, + "cdna_coding_start": 1, + "domains": [ + { + "name": "PS50057", + "regions": [ + { + "end": 129, + "start": 1 + } + ] + }, + { + "name": "PF00373", + "regions": [ + { + "end": 124, + "start": 13 + } + ] + }, + { + "name": "SSF47031", + "regions": [ + { + "end": 116, + "start": 8 + } + ] + } + ], + "end": 52175062, + "exons": [ + { + "end": 52169306, + "start": 52169266 + }, + { + "end": 52171653, + "start": 52171467 + }, + { + "end": 52175062, + "start": 52174796 + } + ], + "is_best_transcript": false, + "name": "ENST00000557405", + "start": 52169266 + }, + { + "cdna_coding_end": 618, + 
"cdna_coding_start": 1, + "domains": [ + { + "name": "PF09380", + "regions": [ + { + "end": 60, + "start": 2 + } + ] + }, + { + "name": "PS50057", + "regions": [ + { + "end": 58, + "start": 1 + } + ] + }, + { + "name": "SSF50729", + "regions": [ + { + "end": 113, + "start": 2 + } + ] + } + ], + "end": 52187243, + "exons": [ + { + "end": 52179269, + "start": 52179231 + }, + { + "end": 52182217, + "start": 52182043 + }, + { + "end": 52187243, + "start": 52186773 + } + ], + "is_best_transcript": false, + "name": "ENST00000555197", + "start": 52179231 + }, + { + "cdna_coding_end": 573, + "cdna_coding_start": 145, + "domains": [ + ], + "end": 52192513, + "exons": [ + { + "end": 52184066, + "start": 52183973 + }, + { + "end": 52187108, + "start": 52186773 + }, + { + "end": 52188798, + "start": 52188673 + }, + { + "end": 52192513, + "start": 52192497 + } + ], + "is_best_transcript": false, + "name": "ENST00000555703", + "start": 52183973 + }, + { + "cdna_coding_end": 939, + "cdna_coding_start": 145, + "domains": [ + ], + "end": 52195487, + "exons": [ + { + "end": 52184066, + "start": 52183973 + }, + { + "end": 52187108, + "start": 52186773 + }, + { + "end": 52188798, + "start": 52188667 + }, + { + "end": 52192588, + "start": 52192497 + }, + { + "end": 52195487, + "start": 52194463 + } + ], + "is_best_transcript": false, + "name": "ENST00000553556", + "start": 52183973 + } + ] + }, + { + "aliases": [ + "PRKCB" + ], + "chr": "16", + "end": 24231932, + "name": "ENSG00000166501", + "start": 23847322, + "strand": "+", + "transcripts": [ + { + "cdna_coding_end": 2191, + "cdna_coding_start": 176, + "domains": [ + { + "name": "SM00239", + "regions": [ + { + "end": 275, + "start": 172 + } + ] + }, + { + "name": "PF07714", + "regions": [ + { + "end": 583, + "start": 344 + } + ] + }, + { + "name": "SSF49562", + "regions": [ + { + "end": 288, + "start": 157 + } + ] + }, + { + "name": "SM00109", + "regions": [ + { + "end": 86, + "start": 37 + }, + { + "end": 151, + "start": 102 + } + 
] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 600, + "start": 342 + } + ] + }, + { + "name": "PR00008", + "regions": [ + { + "end": 48, + "start": 34 + }, + { + "end": 59, + "start": 50 + }, + { + "end": 74, + "start": 63 + }, + { + "end": 152, + "start": 140 + } + ] + }, + { + "name": "PF00433", + "regions": [ + { + "end": 666, + "start": 623 + } + ] + }, + { + "name": "SM00220", + "regions": [ + { + "end": 600, + "start": 342 + } + ] + }, + { + "name": "PF00168", + "regions": [ + { + "end": 259, + "start": 175 + } + ] + }, + { + "name": "SSF57889", + "regions": [ + { + "end": 92, + "start": 6 + }, + { + "end": 157, + "start": 101 + } + ] + }, + { + "name": "PF00130", + "regions": [ + { + "end": 87, + "start": 37 + }, + { + "end": 153, + "start": 102 + } + ] + }, + { + "name": "PS50081", + "regions": [ + { + "end": 86, + "start": 36 + }, + { + "end": 151, + "start": 101 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 627, + "start": 317 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 586, + "start": 343 + } + ] + }, + { + "name": "SM00219", + "regions": [ + { + "end": 576, + "start": 342 + } + ] + }, + { + "name": "PR00360", + "regions": [ + { + "end": 200, + "start": 188 + }, + { + "end": 230, + "start": 217 + }, + { + "end": 248, + "start": 240 + } + ] + }, + { + "name": "SM00133", + "regions": [ + { + "end": 664, + "start": 601 + } + ] + }, + { + "name": "PS50004", + "regions": [ + { + "end": 260, + "start": 173 + } + ] + }, + { + "name": "PIRSF000550", + "regions": [ + { + "end": 671, + "start": 1 + } + ] + } + ], + "end": 24231932, + "exons": [ + { + "end": 23847669, + "start": 23847322 + }, + { + "end": 23848727, + "start": 23848696 + }, + { + "end": 23999911, + "start": 23999829 + }, + { + "end": 24043568, + "start": 24043457 + }, + { + "end": 24046868, + "start": 24046740 + }, + { + "end": 24104268, + "start": 24104112 + }, + { + "end": 24105618, + "start": 24105484 + }, + { + "end": 24124390, + "start": 
24124294 + }, + { + "end": 24135302, + "start": 24135156 + }, + { + "end": 24166178, + "start": 24166005 + }, + { + "end": 24183682, + "start": 24183591 + }, + { + "end": 24185901, + "start": 24185839 + }, + { + "end": 24192249, + "start": 24192111 + }, + { + "end": 24196512, + "start": 24196432 + }, + { + "end": 24196888, + "start": 24196781 + }, + { + "end": 24202551, + "start": 24202411 + }, + { + "end": 24231932, + "start": 24231282 + } + ], + "is_best_transcript": true, + "name": "ENST00000321728", + "start": 23847322 + }, + { + "cdna_coding_end": 2174, + "cdna_coding_start": 153, + "domains": [ + { + "name": "SM00133", + "regions": [ + { + "end": 663, + "start": 601 + } + ] + }, + { + "name": "PS50004", + "regions": [ + { + "end": 260, + "start": 173 + } + ] + }, + { + "name": "PIRSF000550", + "regions": [ + { + "end": 672, + "start": 1 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 586, + "start": 343 + } + ] + }, + { + "name": "PR00360", + "regions": [ + { + "end": 200, + "start": 188 + }, + { + "end": 230, + "start": 217 + }, + { + "end": 248, + "start": 240 + } + ] + }, + { + "name": "SM00219", + "regions": [ + { + "end": 576, + "start": 342 + } + ] + }, + { + "name": "PS50081", + "regions": [ + { + "end": 86, + "start": 36 + }, + { + "end": 151, + "start": 101 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 627, + "start": 317 + } + ] + }, + { + "name": "SM00220", + "regions": [ + { + "end": 600, + "start": 342 + } + ] + }, + { + "name": "PF00433", + "regions": [ + { + "end": 664, + "start": 627 + } + ] + }, + { + "name": "PF00130", + "regions": [ + { + "end": 87, + "start": 37 + }, + { + "end": 153, + "start": 102 + } + ] + }, + { + "name": "PF00168", + "regions": [ + { + "end": 259, + "start": 175 + } + ] + }, + { + "name": "SSF57889", + "regions": [ + { + "end": 92, + "start": 6 + }, + { + "end": 157, + "start": 101 + } + ] + }, + { + "name": "PR00008", + "regions": [ + { + "end": 48, + "start": 34 + }, + { + 
"end": 59, + "start": 50 + }, + { + "end": 74, + "start": 63 + }, + { + "end": 152, + "start": 140 + } + ] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 600, + "start": 342 + } + ] + }, + { + "name": "SM00109", + "regions": [ + { + "end": 86, + "start": 37 + }, + { + "end": 151, + "start": 102 + } + ] + }, + { + "name": "PF07714", + "regions": [ + { + "end": 583, + "start": 344 + } + ] + }, + { + "name": "SSF49562", + "regions": [ + { + "end": 288, + "start": 157 + } + ] + }, + { + "name": "SM00239", + "regions": [ + { + "end": 275, + "start": 172 + } + ] + } + ], + "end": 24231932, + "exons": [ + { + "end": 23847669, + "start": 23847345 + }, + { + "end": 23848727, + "start": 23848696 + }, + { + "end": 23999911, + "start": 23999829 + }, + { + "end": 24043568, + "start": 24043457 + }, + { + "end": 24046868, + "start": 24046740 + }, + { + "end": 24104268, + "start": 24104112 + }, + { + "end": 24105618, + "start": 24105484 + }, + { + "end": 24124390, + "start": 24124294 + }, + { + "end": 24135302, + "start": 24135156 + }, + { + "end": 24166178, + "start": 24166005 + }, + { + "end": 24183682, + "start": 24183591 + }, + { + "end": 24185901, + "start": 24185839 + }, + { + "end": 24192249, + "start": 24192111 + }, + { + "end": 24196512, + "start": 24196432 + }, + { + "end": 24196888, + "start": 24196781 + }, + { + "end": 24202551, + "start": 24202411 + }, + { + "end": 24231932, + "start": 24225979 + } + ], + "is_best_transcript": false, + "name": "ENST00000303531", + "start": 23847345 + }, + { + "cdna_coding_end": 268, + "cdna_coding_start": 95, + "domains": [ + { + "name": "PR00008", + "regions": [ + { + "end": 48, + "start": 34 + }, + { + "end": 57, + "start": 50 + } + ] + }, + { + "name": "PS50081", + "regions": [ + { + "end": 57, + "start": 36 + } + ] + }, + { + "name": "SSF57889", + "regions": [ + { + "end": 57, + "start": 6 + } + ] + } + ], + "end": 23880647, + "exons": [ + { + "end": 23847669, + "start": 23847403 + }, + { + "end": 23880647, + "start": 
23880435 + } + ], + "is_best_transcript": false, + "name": "ENST00000498058", + "start": 23847403 + }, + { + "domains": [ + ], + "end": 24124386, + "exons": [ + { + "end": 23848727, + "start": 23848544 + }, + { + "end": 24104268, + "start": 24104112 + }, + { + "end": 24105618, + "start": 24105484 + }, + { + "end": 24124386, + "start": 24124294 + } + ], + "is_best_transcript": false, + "name": "ENST00000498739", + "start": 23848544 + }, + { + "domains": [ + ], + "end": 24192166, + "exons": [ + { + "end": 24163176, + "start": 24163006 + }, + { + "end": 24166178, + "start": 24166005 + }, + { + "end": 24183682, + "start": 24183591 + }, + { + "end": 24185901, + "start": 24185839 + }, + { + "end": 24192166, + "start": 24192111 + } + ], + "is_best_transcript": false, + "name": "ENST00000472066", + "start": 24163006 + }, + { + "domains": [ + ], + "end": 24202909, + "exons": [ + { + "end": 24196888, + "start": 24196852 + }, + { + "end": 24202909, + "start": 24202411 + } + ], + "is_best_transcript": false, + "name": "ENST00000466124", + "start": 24196852 + } + ] + }, + { + "aliases": [ + "GIMAP4" + ], + "chr": "7", + "end": 150271041, + "name": "ENSG00000133574", + "start": 150264365, + "strand": "+", + "transcripts": [ + { + "cdna_coding_end": 1165, + "cdna_coding_start": 176, + "domains": [ + { + "name": "PF04548", + "regions": [ + { + "end": 238, + "start": 31 + } + ] + }, + { + "name": "SSF52540", + "regions": [ + { + "end": 288, + "start": 24 + } + ] + } + ], + "end": 150271041, + "exons": [ + { + "end": 150264525, + "start": 150264365 + }, + { + "end": 150267047, + "start": 150266976 + }, + { + "end": 150271041, + "start": 150269217 + } + ], + "is_best_transcript": true, + "name": "ENST00000255945", + "start": 150264365 + }, + { + "cdna_coding_end": 1115, + "cdna_coding_start": 84, + "domains": [ + { + "name": "PF04548", + "regions": [ + { + "end": 252, + "start": 45 + } + ] + }, + { + "name": "SSF52540", + "regions": [ + { + "end": 302, + "start": 38 + } + ] + } + ], 
+ "end": 150270602, + "exons": [ + { + "end": 150264525, + "start": 150264457 + }, + { + "end": 150267089, + "start": 150266976 + }, + { + "end": 150270602, + "start": 150269217 + } + ], + "is_best_transcript": false, + "name": "ENST00000461940", + "start": 150264457 + }, + { + "cdna_coding_end": 552, + "cdna_coding_start": 100, + "domains": [ + { + "name": "SSF52540", + "regions": [ + { + "end": 151, + "start": 38 + } + ] + }, + { + "name": "PF04548", + "regions": [ + { + "end": 151, + "start": 45 + } + ] + } + ], + "end": 150269569, + "exons": [ + { + "end": 150264608, + "start": 150264524 + }, + { + "end": 150267089, + "start": 150266976 + }, + { + "end": 150269569, + "start": 150269217 + } + ], + "is_best_transcript": false, + "name": "ENST00000479232", + "start": 150264524 + } + ] + }, + { + "aliases": [ + "IL7" + ], + "chr": "8", + "end": 79717758, + "name": "ENSG00000104432", + "start": 79587978, + "strand": "-", + "transcripts": [ + { + "cdna_coding_end": 1135, + "cdna_coding_start": 602, + "domains": [ + { + "name": "PIRSF001942", + "regions": [ + { + "end": 177, + "start": 1 + } + ] + }, + { + "name": "PR00435", + "regions": [ + { + "end": 25, + "start": 2 + }, + { + "end": 48, + "start": 26 + }, + { + "end": 77, + "start": 57 + }, + { + "end": 98, + "start": 78 + }, + { + "end": 118, + "start": 99 + }, + { + "end": 173, + "start": 151 + } + ] + }, + { + "name": "PF01415", + "regions": [ + { + "end": 173, + "start": 28 + } + ] + }, + { + "name": "SM00127", + "regions": [ + { + "end": 173, + "start": 27 + } + ] + } + ], + "end": 79717758, + "exons": [ + { + "end": 79646067, + "start": 79645007 + }, + { + "end": 79648762, + "start": 79648709 + }, + { + "end": 79650870, + "start": 79650739 + }, + { + "end": 79652317, + "start": 79652237 + }, + { + "end": 79710443, + "start": 79710307 + }, + { + "end": 79717758, + "start": 79717148 + } + ], + "is_best_transcript": true, + "name": "ENST00000263851", + "start": 79645007 + }, + { + "cdna_coding_end": 758, + 
"cdna_coding_start": 543, + "domains": [ + { + "name": "PR00435", + "regions": [ + { + "end": 25, + "start": 2 + }, + { + "end": 48, + "start": 26 + } + ] + }, + { + "name": "PF01415", + "regions": [ + { + "end": 54, + "start": 28 + } + ] + } + ], + "end": 79717699, + "exons": [ + { + "end": 79646063, + "start": 79645283 + }, + { + "end": 79648762, + "start": 79648709 + }, + { + "end": 79650870, + "start": 79650739 + }, + { + "end": 79652317, + "start": 79652237 + }, + { + "end": 79659331, + "start": 79659129 + }, + { + "end": 79710443, + "start": 79710307 + }, + { + "end": 79717699, + "start": 79717148 + } + ], + "is_best_transcript": false, + "name": "ENST00000518982", + "start": 79645283 + }, + { + "cdna_coding_end": 408, + "cdna_coding_start": 7, + "domains": [ + { + "name": "PF01415", + "regions": [ + { + "end": 77, + "start": 28 + }, + { + "end": 129, + "start": 91 + } + ] + }, + { + "name": "SM00127", + "regions": [ + { + "end": 129, + "start": 27 + } + ] + }, + { + "name": "PR00435", + "regions": [ + { + "end": 25, + "start": 2 + }, + { + "end": 48, + "start": 26 + }, + { + "end": 77, + "start": 57 + } + ] + }, + { + "name": "PIRSF001942", + "regions": [ + { + "end": 133, + "start": 1 + } + ] + } + ], + "end": 79717163, + "exons": [ + { + "end": 79646067, + "start": 79645900 + }, + { + "end": 79648762, + "start": 79648709 + }, + { + "end": 79652317, + "start": 79652237 + }, + { + "end": 79710443, + "start": 79710307 + }, + { + "end": 79717163, + "start": 79717148 + } + ], + "is_best_transcript": false, + "name": "ENST00000520269", + "start": 79645900 + }, + { + "cdna_coding_end": 120, + "cdna_coding_start": 7, + "domains": [ + { + "name": "PR00435", + "regions": [ + { + "end": 25, + "start": 2 + }, + { + "end": 37, + "start": 26 + } + ] + } + ], + "end": 79717163, + "exons": [ + { + "end": 79646067, + "start": 79645900 + }, + { + "end": 79648762, + "start": 79648709 + }, + { + "end": 79652317, + "start": 79652237 + }, + { + "end": 79710443, + "start": 
79710363 + }, + { + "end": 79717163, + "start": 79717148 + } + ], + "is_best_transcript": false, + "name": "ENST00000520215", + "start": 79645900 + }, + { + "cdna_coding_end": 643, + "cdna_coding_start": 530, + "domains": [ + { + "name": "PR00435", + "regions": [ + { + "end": 25, + "start": 2 + }, + { + "end": 37, + "start": 26 + } + ] + } + ], + "end": 79717686, + "exons": [ + { + "end": 79646067, + "start": 79645900 + }, + { + "end": 79648762, + "start": 79648709 + }, + { + "end": 79650870, + "start": 79650739 + }, + { + "end": 79652317, + "start": 79652237 + }, + { + "end": 79710443, + "start": 79710363 + }, + { + "end": 79717686, + "start": 79717148 + } + ], + "is_best_transcript": false, + "name": "ENST00000520317", + "start": 79645900 + }, + { + "cdna_coding_end": 195, + "cdna_coding_start": 1, + "domains": [ + { + "name": "SM00127", + "regions": [ + { + "end": 60, + "start": 1 + } + ] + }, + { + "name": "PF01415", + "regions": [ + { + "end": 60, + "start": 1 + } + ] + } + ], + "end": 79652311, + "exons": [ + { + "end": 79646067, + "start": 79645948 + }, + { + "end": 79652311, + "start": 79652237 + } + ], + "is_best_transcript": false, + "name": "ENST00000541183", + "start": 79645948 + }, + { + "cdna_coding_end": 817, + "cdna_coding_start": 602, + "domains": [ + { + "name": "PF01415", + "regions": [ + { + "end": 54, + "start": 28 + } + ] + }, + { + "name": "PR00435", + "regions": [ + { + "end": 25, + "start": 2 + }, + { + "end": 48, + "start": 26 + } + ] + } + ], + "end": 79717758, + "exons": [ + { + "end": 79659331, + "start": 79659263 + }, + { + "end": 79710443, + "start": 79710307 + }, + { + "end": 79717758, + "start": 79717148 + } + ], + "is_best_transcript": false, + "name": "ENST00000379113", + "start": 79659263 + } + ] + }, + { + "aliases": [ + "SVEP1" + ], + "chr": "9", + "end": 113342160, + "name": "ENSG00000165124", + "start": 113127531, + "strand": "-", + "transcripts": [ + { + "cdna_coding_end": 11053, + "cdna_coding_start": 338, + "domains": [ + 
{ + "name": "SM00032", + "regions": [ + { + "end": 433, + "start": 378 + }, + { + "end": 493, + "start": 438 + }, + { + "end": 559, + "start": 498 + }, + { + "end": 787, + "start": 727 + }, + { + "end": 1685, + "start": 1631 + }, + { + "end": 1743, + "start": 1690 + }, + { + "end": 1842, + "start": 1789 + }, + { + "end": 1900, + "start": 1847 + }, + { + "end": 1958, + "start": 1905 + }, + { + "end": 2016, + "start": 1963 + }, + { + "end": 2078, + "start": 2021 + }, + { + "end": 2141, + "start": 2083 + }, + { + "end": 2199, + "start": 2146 + }, + { + "end": 2259, + "start": 2204 + }, + { + "end": 2318, + "start": 2264 + }, + { + "end": 2376, + "start": 2323 + }, + { + "end": 2435, + "start": 2381 + }, + { + "end": 2493, + "start": 2440 + }, + { + "end": 2551, + "start": 2498 + }, + { + "end": 2608, + "start": 2556 + }, + { + "end": 2712, + "start": 2654 + }, + { + "end": 2770, + "start": 2717 + }, + { + "end": 2828, + "start": 2775 + }, + { + "end": 2886, + "start": 2833 + }, + { + "end": 2944, + "start": 2891 + }, + { + "end": 3002, + "start": 2949 + }, + { + "end": 3059, + "start": 3007 + }, + { + "end": 3117, + "start": 3064 + }, + { + "end": 3176, + "start": 3122 + }, + { + "end": 3236, + "start": 3181 + }, + { + "end": 3294, + "start": 3241 + }, + { + "end": 3352, + "start": 3299 + }, + { + "end": 3411, + "start": 3357 + }, + { + "end": 3468, + "start": 3416 + } + ] + }, + { + "name": "PF02494", + "regions": [ + { + "end": 642, + "start": 561 + }, + { + "end": 721, + "start": 644 + } + ] + }, + { + "name": "PR00895", + "regions": [ + { + "end": 1530, + "start": 1512 + }, + { + "end": 1558, + "start": 1539 + }, + { + "end": 1592, + "start": 1559 + } + ] + }, + { + "name": "SSF57535", + "regions": [ + { + "end": 433, + "start": 374 + }, + { + "end": 493, + "start": 434 + }, + { + "end": 560, + "start": 494 + }, + { + "end": 790, + "start": 727 + }, + { + "end": 1746, + "start": 1626 + }, + { + "end": 1842, + "start": 1785 + }, + { + "end": 1900, + "start": 1843 + 
}, + { + "end": 1958, + "start": 1901 + }, + { + "end": 2016, + "start": 1959 + }, + { + "end": 2078, + "start": 2017 + }, + { + "end": 2199, + "start": 2081 + }, + { + "end": 2318, + "start": 2202 + }, + { + "end": 2377, + "start": 2321 + }, + { + "end": 2437, + "start": 2379 + }, + { + "end": 2551, + "start": 2438 + }, + { + "end": 2616, + "start": 2552 + }, + { + "end": 2712, + "start": 2643 + }, + { + "end": 2828, + "start": 2715 + }, + { + "end": 2886, + "start": 2829 + }, + { + "end": 2944, + "start": 2887 + }, + { + "end": 3117, + "start": 2945 + }, + { + "end": 3176, + "start": 3118 + }, + { + "end": 3229, + "start": 3177 + }, + { + "end": 3475, + "start": 3239 + } + ] + }, + { + "name": "SSF49899", + "regions": [ + { + "end": 1632, + "start": 1421 + } + ] + }, + { + "name": "SM00159", + "regions": [ + { + "end": 1627, + "start": 1420 + } + ] + }, + { + "name": "PF00354", + "regions": [ + { + "end": 1620, + "start": 1442 + } + ] + }, + { + "name": "PF07699", + "regions": [ + { + "end": 360, + "start": 310 + }, + { + "end": 1052, + "start": 1005 + }, + { + "end": 1106, + "start": 1059 + }, + { + "end": 1160, + "start": 1113 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 1409, + "start": 1197 + }, + { + "end": 3554, + "start": 3468 + } + ] + }, + { + "name": "PS50825", + "regions": [ + { + "end": 642, + "start": 560 + }, + { + "end": 724, + "start": 643 + } + ] + }, + { + "name": "PF00092", + "regions": [ + { + "end": 252, + "start": 84 + } + ] + }, + { + "name": "SSF57196", + "regions": [ + { + "end": 1267, + "start": 1189 + }, + { + "end": 1305, + "start": 1268 + }, + { + "end": 1342, + "start": 1306 + }, + { + "end": 1423, + "start": 1344 + }, + { + "end": 1786, + "start": 1735 + }, + { + "end": 3506, + "start": 3463 + }, + { + "end": 3535, + "start": 3507 + }, + { + "end": 3570, + "start": 3537 + } + ] + }, + { + "name": "PS50026", + "regions": [ + { + "end": 1229, + "start": 1193 + }, + { + "end": 1267, + "start": 1231 + }, + { + 
"end": 1305, + "start": 1269 + }, + { + "end": 1343, + "start": 1307 + }, + { + "end": 1381, + "start": 1345 + }, + { + "end": 1419, + "start": 1383 + }, + { + "end": 1784, + "start": 1745 + }, + { + "end": 3532, + "start": 3500 + }, + { + "end": 3564, + "start": 3533 + } + ] + }, + { + "name": "SM00181", + "regions": [ + { + "end": 1229, + "start": 1196 + }, + { + "end": 1267, + "start": 1234 + }, + { + "end": 1305, + "start": 1272 + }, + { + "end": 1343, + "start": 1310 + }, + { + "end": 1381, + "start": 1348 + }, + { + "end": 1419, + "start": 1386 + }, + { + "end": 1784, + "start": 1748 + }, + { + "end": 3500, + "start": 3471 + }, + { + "end": 3532, + "start": 3503 + }, + { + "end": 3564, + "start": 3535 + } + ] + }, + { + "name": "SM00179", + "regions": [ + { + "end": 1229, + "start": 1196 + }, + { + "end": 1267, + "start": 1231 + }, + { + "end": 1305, + "start": 1269 + }, + { + "end": 1343, + "start": 1307 + }, + { + "end": 1381, + "start": 1345 + }, + { + "end": 1419, + "start": 1383 + }, + { + "end": 1784, + "start": 1745 + }, + { + "end": 3532, + "start": 3504 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 440, + "start": 269 + }, + { + "end": 1144, + "start": 988 + } + ] + }, + { + "name": "PF07645", + "regions": [ + { + "end": 1783, + "start": 1745 + } + ] + }, + { + "name": "PS50923", + "regions": [ + { + "end": 435, + "start": 376 + }, + { + "end": 495, + "start": 436 + }, + { + "end": 561, + "start": 496 + }, + { + "end": 789, + "start": 725 + }, + { + "end": 1687, + "start": 1629 + }, + { + "end": 1745, + "start": 1688 + }, + { + "end": 1844, + "start": 1787 + }, + { + "end": 1902, + "start": 1845 + }, + { + "end": 1960, + "start": 1903 + }, + { + "end": 2018, + "start": 1961 + }, + { + "end": 2080, + "start": 2019 + }, + { + "end": 2143, + "start": 2081 + }, + { + "end": 2201, + "start": 2144 + }, + { + "end": 2261, + "start": 2202 + }, + { + "end": 2320, + "start": 2262 + }, + { + "end": 2378, + "start": 2321 + }, + { + "end": 
2437, + "start": 2379 + }, + { + "end": 2495, + "start": 2438 + }, + { + "end": 2553, + "start": 2496 + }, + { + "end": 2610, + "start": 2554 + }, + { + "end": 2714, + "start": 2663 + }, + { + "end": 2772, + "start": 2715 + }, + { + "end": 2830, + "start": 2773 + }, + { + "end": 2888, + "start": 2831 + }, + { + "end": 2946, + "start": 2889 + }, + { + "end": 3004, + "start": 2947 + }, + { + "end": 3061, + "start": 3005 + }, + { + "end": 3119, + "start": 3062 + }, + { + "end": 3178, + "start": 3120 + }, + { + "end": 3238, + "start": 3179 + }, + { + "end": 3296, + "start": 3239 + }, + { + "end": 3354, + "start": 3297 + }, + { + "end": 3413, + "start": 3355 + }, + { + "end": 3470, + "start": 3414 + } + ] + }, + { + "name": "SM00327", + "regions": [ + { + "end": 260, + "start": 81 + } + ] + }, + { + "name": "PF00008", + "regions": [ + { + "end": 1226, + "start": 1197 + }, + { + "end": 1265, + "start": 1235 + }, + { + "end": 1302, + "start": 1273 + }, + { + "end": 1379, + "start": 1349 + }, + { + "end": 1417, + "start": 1387 + } + ] + }, + { + "name": "PS50234", + "regions": [ + { + "end": 264, + "start": 83 + } + ] + }, + { + "name": "PF07974", + "regions": [ + { + "end": 1266, + "start": 1235 + }, + { + "end": 3499, + "start": 3475 + }, + { + "end": 3531, + "start": 3507 + }, + { + "end": 3563, + "start": 3536 + } + ] + }, + { + "name": "SSF53300", + "regions": [ + { + "end": 262, + "start": 79 + } + ] + }, + { + "name": "PF00084", + "regions": [ + { + "end": 430, + "start": 378 + }, + { + "end": 493, + "start": 438 + }, + { + "end": 1685, + "start": 1628 + }, + { + "end": 1743, + "start": 1690 + }, + { + "end": 1842, + "start": 1789 + }, + { + "end": 1900, + "start": 1847 + }, + { + "end": 1958, + "start": 1905 + }, + { + "end": 2016, + "start": 1963 + }, + { + "end": 2078, + "start": 2021 + }, + { + "end": 2136, + "start": 2083 + }, + { + "end": 2199, + "start": 2146 + }, + { + "end": 2259, + "start": 2204 + }, + { + "end": 2318, + "start": 2264 + }, + { + "end": 
2376, + "start": 2323 + }, + { + "end": 2435, + "start": 2381 + }, + { + "end": 2493, + "start": 2440 + }, + { + "end": 2551, + "start": 2498 + }, + { + "end": 2608, + "start": 2556 + }, + { + "end": 2712, + "start": 2667 + }, + { + "end": 2770, + "start": 2717 + }, + { + "end": 2828, + "start": 2775 + }, + { + "end": 2886, + "start": 2833 + }, + { + "end": 2944, + "start": 2891 + }, + { + "end": 3002, + "start": 2949 + }, + { + "end": 3059, + "start": 3007 + }, + { + "end": 3117, + "start": 3084 + }, + { + "end": 3172, + "start": 3122 + }, + { + "end": 3236, + "start": 3181 + }, + { + "end": 3290, + "start": 3241 + }, + { + "end": 3352, + "start": 3299 + }, + { + "end": 3411, + "start": 3357 + }, + { + "end": 3468, + "start": 3416 + } + ] + } + ], + "end": 113342160, + "exons": [ + { + "end": 113128840, + "start": 113127531 + }, + { + "end": 113132296, + "start": 113132203 + }, + { + "end": 113137743, + "start": 113137648 + }, + { + "end": 113139646, + "start": 113139551 + }, + { + "end": 113141797, + "start": 113141627 + }, + { + "end": 113148354, + "start": 113148178 + }, + { + "end": 113149738, + "start": 113149565 + }, + { + "end": 113151867, + "start": 113151804 + }, + { + "end": 113163289, + "start": 113163134 + }, + { + "end": 113166832, + "start": 113166607 + }, + { + "end": 113171231, + "start": 113168440 + }, + { + "end": 113174015, + "start": 113173343 + }, + { + "end": 113190038, + "start": 113189871 + }, + { + "end": 113191614, + "start": 113191423 + }, + { + "end": 113192284, + "start": 113192200 + }, + { + "end": 113192730, + "start": 113192554 + }, + { + "end": 113194314, + "start": 113194195 + }, + { + "end": 113194915, + "start": 113194742 + }, + { + "end": 113196786, + "start": 113196616 + }, + { + "end": 113197644, + "start": 113197521 + }, + { + "end": 113198784, + "start": 113198660 + }, + { + "end": 113206000, + "start": 113205825 + }, + { + "end": 113208318, + "start": 113208117 + }, + { + "end": 113209337, + "start": 113209180 + }, + { + 
"end": 113212540, + "start": 113212339 + }, + { + "end": 113213682, + "start": 113213569 + }, + { + "end": 113217983, + "start": 113217870 + }, + { + "end": 113219632, + "start": 113219536 + }, + { + "end": 113220842, + "start": 113220751 + }, + { + "end": 113221393, + "start": 113221232 + }, + { + "end": 113228306, + "start": 113228145 + }, + { + "end": 113231381, + "start": 113231220 + }, + { + "end": 113233877, + "start": 113233644 + }, + { + "end": 113234603, + "start": 113234439 + }, + { + "end": 113238595, + "start": 113238484 + }, + { + "end": 113242036, + "start": 113241915 + }, + { + "end": 113243716, + "start": 113243522 + }, + { + "end": 113244772, + "start": 113244641 + }, + { + "end": 113245973, + "start": 113245866 + }, + { + "end": 113252059, + "start": 113251930 + }, + { + "end": 113259213, + "start": 113259095 + }, + { + "end": 113261518, + "start": 113261321 + }, + { + "end": 113265497, + "start": 113265318 + }, + { + "end": 113275385, + "start": 113275206 + }, + { + "end": 113276386, + "start": 113276228 + }, + { + "end": 113308571, + "start": 113308395 + }, + { + "end": 113312384, + "start": 113312129 + }, + { + "end": 113342160, + "start": 113341293 + } + ], + "is_best_transcript": true, + "name": "ENST00000401783", + "start": 113127531 + }, + { + "cdna_coding_end": 4909, + "cdna_coding_start": 416, + "domains": [ + { + "name": "PF00084", + "regions": [ + { + "end": 62, + "start": 9 + }, + { + "end": 125, + "start": 72 + }, + { + "end": 185, + "start": 130 + }, + { + "end": 244, + "start": 190 + }, + { + "end": 302, + "start": 249 + }, + { + "end": 361, + "start": 307 + }, + { + "end": 419, + "start": 366 + }, + { + "end": 477, + "start": 424 + }, + { + "end": 534, + "start": 482 + }, + { + "end": 638, + "start": 593 + }, + { + "end": 696, + "start": 643 + }, + { + "end": 754, + "start": 701 + }, + { + "end": 812, + "start": 759 + }, + { + "end": 870, + "start": 817 + }, + { + "end": 928, + "start": 875 + }, + { + "end": 985, + "start": 933 + 
}, + { + "end": 1043, + "start": 1010 + }, + { + "end": 1098, + "start": 1048 + }, + { + "end": 1162, + "start": 1107 + }, + { + "end": 1216, + "start": 1167 + }, + { + "end": 1278, + "start": 1225 + }, + { + "end": 1337, + "start": 1283 + }, + { + "end": 1394, + "start": 1342 + } + ] + }, + { + "name": "PF07974", + "regions": [ + { + "end": 1425, + "start": 1401 + }, + { + "end": 1457, + "start": 1433 + }, + { + "end": 1489, + "start": 1462 + } + ] + }, + { + "name": "PF00008", + "regions": [ + { + "end": 1456, + "start": 1427 + } + ] + }, + { + "name": "PS50923", + "regions": [ + { + "end": 69, + "start": 7 + }, + { + "end": 127, + "start": 70 + }, + { + "end": 187, + "start": 128 + }, + { + "end": 246, + "start": 188 + }, + { + "end": 304, + "start": 247 + }, + { + "end": 363, + "start": 305 + }, + { + "end": 421, + "start": 364 + }, + { + "end": 479, + "start": 422 + }, + { + "end": 536, + "start": 480 + }, + { + "end": 640, + "start": 589 + }, + { + "end": 698, + "start": 641 + }, + { + "end": 756, + "start": 699 + }, + { + "end": 814, + "start": 757 + }, + { + "end": 872, + "start": 815 + }, + { + "end": 930, + "start": 873 + }, + { + "end": 987, + "start": 931 + }, + { + "end": 1045, + "start": 988 + }, + { + "end": 1104, + "start": 1046 + }, + { + "end": 1164, + "start": 1105 + }, + { + "end": 1222, + "start": 1165 + }, + { + "end": 1280, + "start": 1223 + }, + { + "end": 1339, + "start": 1281 + }, + { + "end": 1396, + "start": 1340 + } + ] + }, + { + "name": "SM00181", + "regions": [ + { + "end": 1426, + "start": 1397 + }, + { + "end": 1458, + "start": 1429 + }, + { + "end": 1490, + "start": 1461 + } + ] + }, + { + "name": "SSF57196", + "regions": [ + { + "end": 1432, + "start": 1389 + }, + { + "end": 1461, + "start": 1433 + }, + { + "end": 1496, + "start": 1463 + } + ] + }, + { + "name": "PS50026", + "regions": [ + { + "end": 1458, + "start": 1426 + }, + { + "end": 1490, + "start": 1459 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 
1480, + "start": 1394 + } + ] + }, + { + "name": "SSF57535", + "regions": [ + { + "end": 125, + "start": 7 + }, + { + "end": 244, + "start": 128 + }, + { + "end": 303, + "start": 247 + }, + { + "end": 363, + "start": 305 + }, + { + "end": 477, + "start": 364 + }, + { + "end": 542, + "start": 478 + }, + { + "end": 638, + "start": 569 + }, + { + "end": 754, + "start": 641 + }, + { + "end": 812, + "start": 755 + }, + { + "end": 870, + "start": 813 + }, + { + "end": 1043, + "start": 871 + }, + { + "end": 1102, + "start": 1044 + }, + { + "end": 1155, + "start": 1103 + }, + { + "end": 1401, + "start": 1165 + } + ] + }, + { + "name": "SM00032", + "regions": [ + { + "end": 67, + "start": 9 + }, + { + "end": 125, + "start": 72 + }, + { + "end": 185, + "start": 130 + }, + { + "end": 244, + "start": 190 + }, + { + "end": 302, + "start": 249 + }, + { + "end": 361, + "start": 307 + }, + { + "end": 419, + "start": 366 + }, + { + "end": 477, + "start": 424 + }, + { + "end": 534, + "start": 482 + }, + { + "end": 638, + "start": 580 + }, + { + "end": 696, + "start": 643 + }, + { + "end": 754, + "start": 701 + }, + { + "end": 812, + "start": 759 + }, + { + "end": 870, + "start": 817 + }, + { + "end": 928, + "start": 875 + }, + { + "end": 985, + "start": 933 + }, + { + "end": 1043, + "start": 990 + }, + { + "end": 1102, + "start": 1048 + }, + { + "end": 1162, + "start": 1107 + }, + { + "end": 1220, + "start": 1167 + }, + { + "end": 1278, + "start": 1225 + }, + { + "end": 1337, + "start": 1283 + }, + { + "end": 1394, + "start": 1342 + } + ] + } + ], + "end": 113190038, + "exons": [ + { + "end": 113128840, + "start": 113127536 + }, + { + "end": 113132296, + "start": 113132203 + }, + { + "end": 113137743, + "start": 113137648 + }, + { + "end": 113139646, + "start": 113139551 + }, + { + "end": 113141797, + "start": 113141627 + }, + { + "end": 113148354, + "start": 113148178 + }, + { + "end": 113149738, + "start": 113149565 + }, + { + "end": 113151867, + "start": 113151804 + }, + { + 
"end": 113163289, + "start": 113163134 + }, + { + "end": 113166832, + "start": 113166607 + }, + { + "end": 113171231, + "start": 113168440 + }, + { + "end": 113174015, + "start": 113173343 + }, + { + "end": 113190038, + "start": 113189871 + } + ], + "is_best_transcript": false, + "name": "ENST00000297826", + "start": 113127536 + }, + { + "cdna_coding_end": 10911, + "cdna_coding_start": 265, + "domains": [ + { + "name": "SSF57535", + "regions": [ + { + "end": 410, + "start": 351 + }, + { + "end": 470, + "start": 411 + }, + { + "end": 537, + "start": 471 + }, + { + "end": 767, + "start": 704 + }, + { + "end": 1723, + "start": 1603 + }, + { + "end": 1819, + "start": 1762 + }, + { + "end": 1877, + "start": 1820 + }, + { + "end": 1935, + "start": 1878 + }, + { + "end": 1993, + "start": 1936 + }, + { + "end": 2055, + "start": 1994 + }, + { + "end": 2176, + "start": 2058 + }, + { + "end": 2295, + "start": 2179 + }, + { + "end": 2354, + "start": 2298 + }, + { + "end": 2414, + "start": 2356 + }, + { + "end": 2528, + "start": 2415 + }, + { + "end": 2593, + "start": 2529 + }, + { + "end": 2689, + "start": 2620 + }, + { + "end": 2805, + "start": 2692 + }, + { + "end": 2863, + "start": 2806 + }, + { + "end": 2921, + "start": 2864 + }, + { + "end": 3094, + "start": 2922 + }, + { + "end": 3153, + "start": 3095 + }, + { + "end": 3206, + "start": 3154 + }, + { + "end": 3452, + "start": 3216 + } + ] + }, + { + "name": "SSF49899", + "regions": [ + { + "end": 1609, + "start": 1398 + } + ] + }, + { + "name": "SM00159", + "regions": [ + { + "end": 1604, + "start": 1397 + } + ] + }, + { + "name": "PF00354", + "regions": [ + { + "end": 1597, + "start": 1419 + } + ] + }, + { + "name": "PR00895", + "regions": [ + { + "end": 1507, + "start": 1489 + }, + { + "end": 1535, + "start": 1516 + }, + { + "end": 1569, + "start": 1536 + } + ] + }, + { + "name": "PF02494", + "regions": [ + { + "end": 619, + "start": 538 + }, + { + "end": 698, + "start": 621 + } + ] + }, + { + "name": "SM00032", + 
"regions": [ + { + "end": 410, + "start": 355 + }, + { + "end": 470, + "start": 415 + }, + { + "end": 536, + "start": 475 + }, + { + "end": 764, + "start": 704 + }, + { + "end": 1662, + "start": 1608 + }, + { + "end": 1720, + "start": 1667 + }, + { + "end": 1819, + "start": 1766 + }, + { + "end": 1877, + "start": 1824 + }, + { + "end": 1935, + "start": 1882 + }, + { + "end": 1993, + "start": 1940 + }, + { + "end": 2055, + "start": 1998 + }, + { + "end": 2118, + "start": 2060 + }, + { + "end": 2176, + "start": 2123 + }, + { + "end": 2236, + "start": 2181 + }, + { + "end": 2295, + "start": 2241 + }, + { + "end": 2353, + "start": 2300 + }, + { + "end": 2412, + "start": 2358 + }, + { + "end": 2470, + "start": 2417 + }, + { + "end": 2528, + "start": 2475 + }, + { + "end": 2585, + "start": 2533 + }, + { + "end": 2689, + "start": 2631 + }, + { + "end": 2747, + "start": 2694 + }, + { + "end": 2805, + "start": 2752 + }, + { + "end": 2863, + "start": 2810 + }, + { + "end": 2921, + "start": 2868 + }, + { + "end": 2979, + "start": 2926 + }, + { + "end": 3036, + "start": 2984 + }, + { + "end": 3094, + "start": 3041 + }, + { + "end": 3153, + "start": 3099 + }, + { + "end": 3213, + "start": 3158 + }, + { + "end": 3271, + "start": 3218 + }, + { + "end": 3329, + "start": 3276 + }, + { + "end": 3388, + "start": 3334 + }, + { + "end": 3445, + "start": 3393 + } + ] + }, + { + "name": "SM00179", + "regions": [ + { + "end": 1206, + "start": 1173 + }, + { + "end": 1244, + "start": 1208 + }, + { + "end": 1282, + "start": 1246 + }, + { + "end": 1320, + "start": 1284 + }, + { + "end": 1358, + "start": 1322 + }, + { + "end": 1396, + "start": 1360 + }, + { + "end": 1761, + "start": 1722 + }, + { + "end": 3509, + "start": 3481 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 417, + "start": 246 + }, + { + "end": 1121, + "start": 965 + } + ] + }, + { + "name": "SSF57196", + "regions": [ + { + "end": 1244, + "start": 1166 + }, + { + "end": 1282, + "start": 1245 + }, + { + 
"end": 1319, + "start": 1283 + }, + { + "end": 1400, + "start": 1321 + }, + { + "end": 1763, + "start": 1712 + }, + { + "end": 3483, + "start": 3440 + }, + { + "end": 3512, + "start": 3484 + }, + { + "end": 3547, + "start": 3514 + } + ] + }, + { + "name": "PS50026", + "regions": [ + { + "end": 1206, + "start": 1170 + }, + { + "end": 1244, + "start": 1208 + }, + { + "end": 1282, + "start": 1246 + }, + { + "end": 1320, + "start": 1284 + }, + { + "end": 1358, + "start": 1322 + }, + { + "end": 1396, + "start": 1360 + }, + { + "end": 1761, + "start": 1722 + }, + { + "end": 3509, + "start": 3477 + }, + { + "end": 3541, + "start": 3510 + } + ] + }, + { + "name": "SM00181", + "regions": [ + { + "end": 1206, + "start": 1173 + }, + { + "end": 1244, + "start": 1211 + }, + { + "end": 1282, + "start": 1249 + }, + { + "end": 1320, + "start": 1287 + }, + { + "end": 1358, + "start": 1325 + }, + { + "end": 1396, + "start": 1363 + }, + { + "end": 1761, + "start": 1725 + }, + { + "end": 3477, + "start": 3448 + }, + { + "end": 3509, + "start": 3480 + }, + { + "end": 3541, + "start": 3512 + } + ] + }, + { + "name": "PF00092", + "regions": [ + { + "end": 229, + "start": 61 + } + ] + }, + { + "name": "PS50825", + "regions": [ + { + "end": 619, + "start": 537 + }, + { + "end": 701, + "start": 620 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 1386, + "start": 1174 + }, + { + "end": 3531, + "start": 3445 + } + ] + }, + { + "name": "PF07699", + "regions": [ + { + "end": 337, + "start": 287 + }, + { + "end": 1029, + "start": 982 + }, + { + "end": 1083, + "start": 1036 + }, + { + "end": 1137, + "start": 1090 + } + ] + }, + { + "name": "PF00008", + "regions": [ + { + "end": 1203, + "start": 1174 + }, + { + "end": 1242, + "start": 1212 + }, + { + "end": 1279, + "start": 1250 + }, + { + "end": 1356, + "start": 1326 + }, + { + "end": 1394, + "start": 1364 + } + ] + }, + { + "name": "SM00327", + "regions": [ + { + "end": 237, + "start": 58 + } + ] + }, + { + "name": "PS50923", + 
"regions": [ + { + "end": 412, + "start": 353 + }, + { + "end": 472, + "start": 413 + }, + { + "end": 538, + "start": 473 + }, + { + "end": 766, + "start": 702 + }, + { + "end": 1664, + "start": 1606 + }, + { + "end": 1722, + "start": 1665 + }, + { + "end": 1821, + "start": 1764 + }, + { + "end": 1879, + "start": 1822 + }, + { + "end": 1937, + "start": 1880 + }, + { + "end": 1995, + "start": 1938 + }, + { + "end": 2057, + "start": 1996 + }, + { + "end": 2120, + "start": 2058 + }, + { + "end": 2178, + "start": 2121 + }, + { + "end": 2238, + "start": 2179 + }, + { + "end": 2297, + "start": 2239 + }, + { + "end": 2355, + "start": 2298 + }, + { + "end": 2414, + "start": 2356 + }, + { + "end": 2472, + "start": 2415 + }, + { + "end": 2530, + "start": 2473 + }, + { + "end": 2587, + "start": 2531 + }, + { + "end": 2691, + "start": 2640 + }, + { + "end": 2749, + "start": 2692 + }, + { + "end": 2807, + "start": 2750 + }, + { + "end": 2865, + "start": 2808 + }, + { + "end": 2923, + "start": 2866 + }, + { + "end": 2981, + "start": 2924 + }, + { + "end": 3038, + "start": 2982 + }, + { + "end": 3096, + "start": 3039 + }, + { + "end": 3155, + "start": 3097 + }, + { + "end": 3215, + "start": 3156 + }, + { + "end": 3273, + "start": 3216 + }, + { + "end": 3331, + "start": 3274 + }, + { + "end": 3390, + "start": 3332 + }, + { + "end": 3447, + "start": 3391 + } + ] + }, + { + "name": "PF07645", + "regions": [ + { + "end": 1760, + "start": 1722 + } + ] + }, + { + "name": "SSF53300", + "regions": [ + { + "end": 239, + "start": 56 + } + ] + }, + { + "name": "PF00084", + "regions": [ + { + "end": 407, + "start": 355 + }, + { + "end": 470, + "start": 415 + }, + { + "end": 1662, + "start": 1605 + }, + { + "end": 1720, + "start": 1667 + }, + { + "end": 1819, + "start": 1766 + }, + { + "end": 1877, + "start": 1824 + }, + { + "end": 1935, + "start": 1882 + }, + { + "end": 1993, + "start": 1940 + }, + { + "end": 2055, + "start": 1998 + }, + { + "end": 2113, + "start": 2060 + }, + { + "end": 
2176, + "start": 2123 + }, + { + "end": 2236, + "start": 2181 + }, + { + "end": 2295, + "start": 2241 + }, + { + "end": 2353, + "start": 2300 + }, + { + "end": 2412, + "start": 2358 + }, + { + "end": 2470, + "start": 2417 + }, + { + "end": 2528, + "start": 2475 + }, + { + "end": 2585, + "start": 2533 + }, + { + "end": 2689, + "start": 2644 + }, + { + "end": 2747, + "start": 2694 + }, + { + "end": 2805, + "start": 2752 + }, + { + "end": 2863, + "start": 2810 + }, + { + "end": 2921, + "start": 2868 + }, + { + "end": 2979, + "start": 2926 + }, + { + "end": 3036, + "start": 2984 + }, + { + "end": 3094, + "start": 3061 + }, + { + "end": 3149, + "start": 3099 + }, + { + "end": 3213, + "start": 3158 + }, + { + "end": 3267, + "start": 3218 + }, + { + "end": 3329, + "start": 3276 + }, + { + "end": 3388, + "start": 3334 + }, + { + "end": 3445, + "start": 3393 + } + ] + }, + { + "name": "PF07974", + "regions": [ + { + "end": 1243, + "start": 1212 + }, + { + "end": 3476, + "start": 3452 + }, + { + "end": 3508, + "start": 3484 + }, + { + "end": 3540, + "start": 3513 + } + ] + }, + { + "name": "PS50234", + "regions": [ + { + "end": 241, + "start": 60 + } + ] + } + ], + "end": 113342018, + "exons": [ + { + "end": 113128840, + "start": 113127536 + }, + { + "end": 113132296, + "start": 113132203 + }, + { + "end": 113137743, + "start": 113137648 + }, + { + "end": 113139646, + "start": 113139551 + }, + { + "end": 113141797, + "start": 113141627 + }, + { + "end": 113148354, + "start": 113148178 + }, + { + "end": 113149738, + "start": 113149565 + }, + { + "end": 113151867, + "start": 113151804 + }, + { + "end": 113163289, + "start": 113163134 + }, + { + "end": 113166832, + "start": 113166607 + }, + { + "end": 113171231, + "start": 113168440 + }, + { + "end": 113174015, + "start": 113173343 + }, + { + "end": 113190038, + "start": 113189871 + }, + { + "end": 113191614, + "start": 113191423 + }, + { + "end": 113192284, + "start": 113192200 + }, + { + "end": 113192730, + "start": 113192554 
+ }, + { + "end": 113194314, + "start": 113194195 + }, + { + "end": 113194915, + "start": 113194742 + }, + { + "end": 113196786, + "start": 113196616 + }, + { + "end": 113197644, + "start": 113197521 + }, + { + "end": 113198784, + "start": 113198660 + }, + { + "end": 113206000, + "start": 113205825 + }, + { + "end": 113208318, + "start": 113208117 + }, + { + "end": 113209337, + "start": 113209180 + }, + { + "end": 113212540, + "start": 113212339 + }, + { + "end": 113213682, + "start": 113213569 + }, + { + "end": 113217983, + "start": 113217870 + }, + { + "end": 113219632, + "start": 113219536 + }, + { + "end": 113220842, + "start": 113220751 + }, + { + "end": 113221393, + "start": 113221232 + }, + { + "end": 113228306, + "start": 113228145 + }, + { + "end": 113231381, + "start": 113231220 + }, + { + "end": 113233877, + "start": 113233644 + }, + { + "end": 113234603, + "start": 113234439 + }, + { + "end": 113238595, + "start": 113238484 + }, + { + "end": 113242036, + "start": 113241915 + }, + { + "end": 113243716, + "start": 113243522 + }, + { + "end": 113244772, + "start": 113244641 + }, + { + "end": 113245973, + "start": 113245866 + }, + { + "end": 113252059, + "start": 113251930 + }, + { + "end": 113259213, + "start": 113259095 + }, + { + "end": 113261518, + "start": 113261321 + }, + { + "end": 113265497, + "start": 113265318 + }, + { + "end": 113275385, + "start": 113275206 + }, + { + "end": 113276386, + "start": 113276228 + }, + { + "end": 113308571, + "start": 113308395 + }, + { + "end": 113312384, + "start": 113312129 + }, + { + "end": 113342018, + "start": 113341293 + } + ], + "is_best_transcript": false, + "name": "ENST00000374469", + "start": 113127536 + }, + { + "cdna_coding_end": 4650, + "cdna_coding_start": 1, + "domains": [ + { + "name": "PS50825", + "regions": [ + { + "end": 642, + "start": 560 + }, + { + "end": 724, + "start": 643 + } + ] + }, + { + "name": "PF07699", + "regions": [ + { + "end": 360, + "start": 310 + }, + { + "end": 1052, + "start": 
1005 + }, + { + "end": 1106, + "start": 1059 + }, + { + "end": 1160, + "start": 1113 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 1409, + "start": 1197 + } + ] + }, + { + "name": "SM00181", + "regions": [ + { + "end": 1229, + "start": 1196 + }, + { + "end": 1267, + "start": 1234 + }, + { + "end": 1305, + "start": 1272 + }, + { + "end": 1343, + "start": 1310 + }, + { + "end": 1381, + "start": 1348 + }, + { + "end": 1419, + "start": 1386 + } + ] + }, + { + "name": "SSF57196", + "regions": [ + { + "end": 1267, + "start": 1189 + }, + { + "end": 1305, + "start": 1268 + }, + { + "end": 1342, + "start": 1306 + }, + { + "end": 1423, + "start": 1344 + } + ] + }, + { + "name": "PS50026", + "regions": [ + { + "end": 1229, + "start": 1193 + }, + { + "end": 1267, + "start": 1231 + }, + { + "end": 1305, + "start": 1269 + }, + { + "end": 1343, + "start": 1307 + }, + { + "end": 1381, + "start": 1345 + }, + { + "end": 1419, + "start": 1383 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 440, + "start": 269 + }, + { + "end": 1144, + "start": 988 + } + ] + }, + { + "name": "SM00179", + "regions": [ + { + "end": 1229, + "start": 1196 + }, + { + "end": 1267, + "start": 1231 + }, + { + "end": 1305, + "start": 1269 + }, + { + "end": 1343, + "start": 1307 + }, + { + "end": 1381, + "start": 1345 + }, + { + "end": 1419, + "start": 1383 + } + ] + }, + { + "name": "PF00092", + "regions": [ + { + "end": 252, + "start": 84 + } + ] + }, + { + "name": "SM00032", + "regions": [ + { + "end": 433, + "start": 378 + }, + { + "end": 493, + "start": 438 + }, + { + "end": 559, + "start": 498 + }, + { + "end": 787, + "start": 727 + } + ] + }, + { + "name": "PF02494", + "regions": [ + { + "end": 642, + "start": 561 + }, + { + "end": 721, + "start": 644 + } + ] + }, + { + "name": "PR00010", + "regions": [ + { + "end": 1318, + "start": 1307 + }, + { + "end": 1364, + "start": 1357 + }, + { + "end": 1413, + "start": 1403 + }, + { + "end": 1420, + "start": 1414 + } + ] + 
}, + { + "name": "PF00354", + "regions": [ + { + "end": 1532, + "start": 1442 + } + ] + }, + { + "name": "SSF57535", + "regions": [ + { + "end": 433, + "start": 374 + }, + { + "end": 493, + "start": 434 + }, + { + "end": 560, + "start": 494 + }, + { + "end": 790, + "start": 727 + } + ] + }, + { + "name": "SSF49899", + "regions": [ + { + "end": 1547, + "start": 1421 + } + ] + }, + { + "name": "PS50234", + "regions": [ + { + "end": 264, + "start": 83 + } + ] + }, + { + "name": "SSF53300", + "regions": [ + { + "end": 262, + "start": 79 + } + ] + }, + { + "name": "PF00084", + "regions": [ + { + "end": 430, + "start": 378 + }, + { + "end": 493, + "start": 438 + } + ] + }, + { + "name": "PS50923", + "regions": [ + { + "end": 435, + "start": 376 + }, + { + "end": 495, + "start": 436 + }, + { + "end": 561, + "start": 496 + }, + { + "end": 789, + "start": 725 + } + ] + }, + { + "name": "PF07645", + "regions": [ + { + "end": 1262, + "start": 1231 + }, + { + "end": 1338, + "start": 1308 + } + ] + }, + { + "name": "PF00008", + "regions": [ + { + "end": 1226, + "start": 1197 + }, + { + "end": 1265, + "start": 1235 + }, + { + "end": 1302, + "start": 1273 + }, + { + "end": 1337, + "start": 1311 + }, + { + "end": 1379, + "start": 1349 + }, + { + "end": 1417, + "start": 1387 + } + ] + }, + { + "name": "SM00327", + "regions": [ + { + "end": 260, + "start": 81 + } + ] + } + ], + "end": 113341823, + "exons": [ + { + "end": 113206000, + "start": 113204759 + }, + { + "end": 113208318, + "start": 113208117 + }, + { + "end": 113209337, + "start": 113209180 + }, + { + "end": 113212540, + "start": 113212339 + }, + { + "end": 113213682, + "start": 113213569 + }, + { + "end": 113217983, + "start": 113217870 + }, + { + "end": 113219632, + "start": 113219536 + }, + { + "end": 113220399, + "start": 113220395 + }, + { + "end": 113220842, + "start": 113220756 + }, + { + "end": 113221393, + "start": 113221232 + }, + { + "end": 113228306, + "start": 113228145 + }, + { + "end": 113231381, + "start": 
113231220 + }, + { + "end": 113233877, + "start": 113233644 + }, + { + "end": 113234603, + "start": 113234439 + }, + { + "end": 113238595, + "start": 113238484 + }, + { + "end": 113242036, + "start": 113241915 + }, + { + "end": 113243716, + "start": 113243522 + }, + { + "end": 113244772, + "start": 113244641 + }, + { + "end": 113245973, + "start": 113245866 + }, + { + "end": 113252059, + "start": 113251930 + }, + { + "end": 113259213, + "start": 113259095 + }, + { + "end": 113261518, + "start": 113261321 + }, + { + "end": 113265497, + "start": 113265318 + }, + { + "end": 113275385, + "start": 113275206 + }, + { + "end": 113276386, + "start": 113276228 + }, + { + "end": 113308571, + "start": 113308395 + }, + { + "end": 113312384, + "start": 113312129 + }, + { + "end": 113341823, + "start": 113341293 + } + ], + "is_best_transcript": false, + "name": "ENST00000302728", + "start": 113204759 + }, + { + "cdna_coding_end": 2944, + "cdna_coding_start": 407, + "domains": [ + { + "name": "PF02494", + "regions": [ + { + "end": 619, + "start": 538 + }, + { + "end": 698, + "start": 621 + } + ] + }, + { + "name": "SM00032", + "regions": [ + { + "end": 410, + "start": 355 + }, + { + "end": 470, + "start": 415 + }, + { + "end": 536, + "start": 475 + }, + { + "end": 764, + "start": 704 + } + ] + }, + { + "name": "SSF57535", + "regions": [ + { + "end": 410, + "start": 351 + }, + { + "end": 470, + "start": 411 + }, + { + "end": 537, + "start": 471 + }, + { + "end": 767, + "start": 704 + } + ] + }, + { + "name": "PF07699", + "regions": [ + { + "end": 337, + "start": 287 + } + ] + }, + { + "name": "PS50825", + "regions": [ + { + "end": 619, + "start": 537 + }, + { + "end": 701, + "start": 620 + } + ] + }, + { + "name": "PF00092", + "regions": [ + { + "end": 229, + "start": 61 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 417, + "start": 246 + } + ] + }, + { + "name": "PS50923", + "regions": [ + { + "end": 412, + "start": 353 + }, + { + "end": 472, + "start": 413 + 
}, + { + "end": 538, + "start": 473 + }, + { + "end": 766, + "start": 702 + } + ] + }, + { + "name": "SM00327", + "regions": [ + { + "end": 237, + "start": 58 + } + ] + }, + { + "name": "PS50234", + "regions": [ + { + "end": 241, + "start": 60 + } + ] + }, + { + "name": "SSF53300", + "regions": [ + { + "end": 239, + "start": 56 + } + ] + }, + { + "name": "PF00084", + "regions": [ + { + "end": 407, + "start": 355 + }, + { + "end": 470, + "start": 415 + } + ] + } + ], + "end": 113342160, + "exons": [ + { + "end": 113238595, + "start": 113238163 + }, + { + "end": 113242036, + "start": 113241915 + }, + { + "end": 113243716, + "start": 113243522 + }, + { + "end": 113244772, + "start": 113244641 + }, + { + "end": 113245973, + "start": 113245866 + }, + { + "end": 113252059, + "start": 113251930 + }, + { + "end": 113259213, + "start": 113259095 + }, + { + "end": 113261518, + "start": 113261321 + }, + { + "end": 113265497, + "start": 113265318 + }, + { + "end": 113275385, + "start": 113275206 + }, + { + "end": 113276386, + "start": 113276228 + }, + { + "end": 113308571, + "start": 113308395 + }, + { + "end": 113312384, + "start": 113312129 + }, + { + "end": 113342160, + "start": 113341293 + } + ], + "is_best_transcript": false, + "name": "ENST00000374461", + "start": 113238163 + } + ] + }, + { + "aliases": [ + "ARID1B" + ], + "chr": "6", + "end": 157530401, + "name": "ENSG00000049618", + "start": 157099063, + "strand": "+", + "transcripts": [ + { + "cdna_coding_end": 6751, + "cdna_coding_start": 2, + "domains": [ + { + "name": "PF12031", + "regions": [ + { + "end": 2195, + "start": 1939 + } + ] + }, + { + "name": "PS50324", + "regions": [ + { + "end": 57, + "start": 35 + }, + { + "end": 784, + "start": 697 + } + ] + }, + { + "name": "PF01388", + "regions": [ + { + "end": 1153, + "start": 1065 + } + ] + }, + { + "name": "PS50099", + "regions": [ + { + "end": 820, + "start": 715 + }, + { + "end": 1610, + "start": 1472 + } + ] + }, + { + "name": "SSF48371", + "regions": [ + { 
+ "end": 2220, + "start": 2075 + } + ] + }, + { + "name": "PS50316", + "regions": [ + { + "end": 104, + "start": 81 + } + ] + }, + { + "name": "PS50322", + "regions": [ + { + "end": 131, + "start": 107 + }, + { + "end": 646, + "start": 574 + } + ] + }, + { + "name": "PS51011", + "regions": [ + { + "end": 1157, + "start": 1066 + } + ] + }, + { + "name": "PS50310", + "regions": [ + { + "end": 47, + "start": 2 + }, + { + "end": 493, + "start": 329 + } + ] + }, + { + "name": "PS50315", + "regions": [ + { + "end": 401, + "start": 141 + } + ] + }, + { + "name": "SSF46774", + "regions": [ + { + "end": 1168, + "start": 1049 + } + ] + }, + { + "name": "SM00501", + "regions": [ + { + "end": 1158, + "start": 1067 + } + ] + } + ], + "end": 157529495, + "exons": [ + { + "end": 157100605, + "start": 157099063 + }, + { + "end": 157150555, + "start": 157150361 + }, + { + "end": 157192786, + "start": 157192748 + }, + { + "end": 157222659, + "start": 157222510 + }, + { + "end": 157256710, + "start": 157256600 + }, + { + "end": 157406039, + "start": 157405796 + }, + { + "end": 157431695, + "start": 157431606 + }, + { + "end": 157454341, + "start": 157454162 + }, + { + "end": 157470085, + "start": 157469758 + }, + { + "end": 157488319, + "start": 157488174 + }, + { + "end": 157495251, + "start": 157495142 + }, + { + "end": 157502312, + "start": 157502103 + }, + { + "end": 157505569, + "start": 157505365 + }, + { + "end": 157510914, + "start": 157510776 + }, + { + "end": 157511344, + "start": 157511172 + }, + { + "end": 157517449, + "start": 157517299 + }, + { + "end": 157520041, + "start": 157519945 + }, + { + "end": 157522622, + "start": 157521839 + }, + { + "end": 157525130, + "start": 157525000 + }, + { + "end": 157529495, + "start": 157527301 + } + ], + "is_best_transcript": true, + "name": "ENST00000346085", + "start": 157099063 + } + ] + } + ] +} diff --git a/tests/tools/data/example_genes.v3.json b/tests/tools/data/example_genes.v3.json new file mode 100644 index 
00000000..6a590488 --- /dev/null +++ b/tests/tools/data/example_genes.v3.json @@ -0,0 +1 @@ +{"genes": [{"aliases": ["EGFR"], "chr": "7", "end": 55324313, "name": "ENSG00000146648", "start": 55086714, "strand": "+", "transcripts": [{"end": 55270769, "exons": [{"end": 55087058, "start": 55086714}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55238906, "start": 55238868}, {"end": 55240817, "start": 55240676}, {"end": 55241736, "start": 55241614}, {"end": 55242513, "start": 55242415}, {"end": 55249171, "start": 55248986}, {"end": 55259567, "start": 55259412}, {"end": 55260534, "start": 55260459}, {"end": 55266556, "start": 55266410}, {"end": 55268106, "start": 55268009}, {"end": 55269048, "start": 55268881}, {"end": 55269475, "start": 55269428}, {"end": 55270769, "start": 55270210}], "is_best_transcript": false, "name": "ENST00000455089", "start": 55086714, "translations": [{"cdna_coding_end": 3533, "cdna_coding_start": 258, "domains": [{"name": "PIRSF000619", "regions": [{"end": 1090, "start": 1}]}, {"name": "PF07714", "regions": [{"end": 920, "start": 669}]}, {"name": "SSF52058", "regions": [{"end": 191, "start": 28}, {"end": 475, "start": 283}]}, {"name": "PF00757", "regions": [{"end": 293, "start": 141}]}, {"name": "PS50011", "regions": [{"end": 934, "start": 667}]}, {"name": "PS50311", "regions": [{"end": 219, "start": 145}]}, {"name": "SSF57184", "regions": [{"end": 290, "start": 142}, {"end": 593, "start": 460}]}, {"name": "PR00109", "regions": [{"end": 758, "start": 745}, {"end": 800, "start": 782}, {"end": 841, 
"start": 831}, {"end": 872, "start": 850}, {"end": 916, "start": 894}]}, {"name": "SSF56112", "regions": [{"end": 975, "start": 651}]}, {"name": "PF01030", "regions": [{"end": 141, "start": 57}, {"end": 435, "start": 316}]}, {"name": "SM00220", "regions": [{"end": 924, "start": 667}]}, {"name": "SM00261", "regions": [{"end": 225, "start": 183}, {"end": 502, "start": 451}, {"end": 556, "start": 507}]}, {"name": "SM00219", "regions": [{"end": 923, "start": 667}]}, {"name": "PF00069", "regions": [{"end": 919, "start": 667}]}]}]}, {"end": 55236328, "exons": [{"end": 55087058, "start": 55086725}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55236328, "start": 55236216}], "is_best_transcript": false, "name": "ENST00000342916", "start": 55086725, "translations": [{"cdna_coding_end": 2133, "cdna_coding_start": 247, "domains": [{"name": "PS50311", "regions": [{"end": 264, "start": 187}]}, {"name": "SSF57184", "regions": [{"end": 339, "start": 182}, {"end": 624, "start": 505}]}, {"name": "SSF52058", "regions": [{"end": 211, "start": 29}, {"end": 520, "start": 328}]}, {"name": "PF00757", "regions": [{"end": 338, "start": 185}]}, {"name": "SM00261", "regions": [{"end": 270, "start": 228}, {"end": 547, "start": 496}, {"end": 601, "start": 552}]}, {"name": "PF01030", "regions": [{"end": 167, "start": 57}, {"end": 480, "start": 361}]}]}]}, {"end": 55238738, "exons": [{"end": 55087058, "start": 55086726}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 
55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55238738, "start": 55238000}], "is_best_transcript": false, "name": "ENST00000344576", "start": 55086726, "translations": [{"cdna_coding_end": 2363, "cdna_coding_start": 246, "domains": [{"name": "SSF57184", "regions": [{"end": 339, "start": 182}, {"end": 624, "start": 505}]}, {"name": "PS50311", "regions": [{"end": 264, "start": 187}]}, {"name": "PF00757", "regions": [{"end": 338, "start": 185}]}, {"name": "SSF52058", "regions": [{"end": 211, "start": 29}, {"end": 520, "start": 328}]}, {"name": "SM00261", "regions": [{"end": 270, "start": 228}, {"end": 547, "start": 496}, {"end": 601, "start": 552}]}, {"name": "PF01030", "regions": [{"end": 167, "start": 57}, {"end": 480, "start": 361}]}]}]}, {"end": 55224644, "exons": [{"end": 55087058, "start": 55086727}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224644, "start": 55224452}], "is_best_transcript": false, "name": "ENST00000420316", "start": 55086727, "translations": [{"cdna_coding_end": 1462, "cdna_coding_start": 245, "domains": [{"name": "SSF57184", "regions": [{"end": 339, "start": 182}]}, {"name": "PS50311", "regions": [{"end": 264, "start": 187}]}, {"name": "PF00757", "regions": [{"end": 338, "start": 185}]}, {"name": "SSF52058", "regions": [{"end": 211, "start": 29}, {"end": 403, "start": 
328}]}, {"name": "SM00261", "regions": [{"end": 270, "start": 228}]}, {"name": "PF01030", "regions": [{"end": 167, "start": 57}]}]}]}, {"end": 55279321, "exons": [{"end": 55087058, "start": 55086794}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55238906, "start": 55238868}, {"end": 55240817, "start": 55240676}, {"end": 55241736, "start": 55241614}, {"end": 55242513, "start": 55242415}, {"end": 55249171, "start": 55248986}, {"end": 55259567, "start": 55259412}, {"end": 55260534, "start": 55260459}, {"end": 55266556, "start": 55266410}, {"end": 55268106, "start": 55268009}, {"end": 55269048, "start": 55268881}, {"end": 55269475, "start": 55269428}, {"end": 55270318, "start": 55270210}, {"end": 55279321, "start": 55272949}], "is_best_transcript": true, "name": "ENST00000275493", "start": 55086794, "translations": [{"cdna_coding_end": 3810, "cdna_coding_start": 178, "domains": [{"name": "SM00220", "regions": [{"end": 969, "start": 712}]}, {"name": "PF01030", "regions": [{"end": 167, "start": 57}, {"end": 480, "start": 361}]}, {"name": "SSF56112", "regions": [{"end": 1020, "start": 696}]}, {"name": "PF00069", "regions": [{"end": 964, "start": 712}]}, {"name": "SM00219", "regions": [{"end": 968, "start": 712}]}, {"name": "SM00261", "regions": [{"end": 270, "start": 228}, {"end": 547, "start": 496}, {"end": 601, "start": 552}]}, {"name": "PF00757", "regions": [{"end": 338, "start": 185}]}, {"name": "SSF52058", "regions": [{"end": 211, "start": 29}, {"end": 520, "start": 328}]}, 
{"name": "PF07714", "regions": [{"end": 965, "start": 714}]}, {"name": "PIRSF000619", "regions": [{"end": 1210, "start": 1}]}, {"name": "PR00109", "regions": [{"end": 803, "start": 790}, {"end": 845, "start": 827}, {"end": 886, "start": 876}, {"end": 917, "start": 895}, {"end": 961, "start": 939}]}, {"name": "SSF57184", "regions": [{"end": 339, "start": 182}, {"end": 638, "start": 505}]}, {"name": "PS50311", "regions": [{"end": 264, "start": 187}]}, {"name": "PS50011", "regions": [{"end": 979, "start": 712}]}]}]}, {"end": 55324313, "exons": [{"end": 55087058, "start": 55086811}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55238906, "start": 55238868}, {"end": 55240621, "start": 55240539}, {"end": 55324313, "start": 55323947}], "is_best_transcript": false, "name": "ENST00000442591", "start": 55086811, "translations": [{"cdna_coding_end": 2134, "cdna_coding_start": 161, "domains": [{"name": "PF01030", "regions": [{"end": 167, "start": 57}, {"end": 480, "start": 361}]}, {"name": "SM00261", "regions": [{"end": 270, "start": 228}, {"end": 547, "start": 496}, {"end": 601, "start": 552}, {"end": 653, "start": 614}]}, {"name": "SSF52058", "regions": [{"end": 211, "start": 29}, {"end": 520, "start": 328}]}, {"name": "PF00757", "regions": [{"end": 338, "start": 185}]}, {"name": "PS50311", "regions": [{"end": 264, "start": 187}]}, {"name": "SSF57184", "regions": [{"end": 339, "start": 182}, {"end": 638, "start": 505}]}]}]}, {"end": 55214417, "exons": [{"end": 55177651, "start": 55177416}, 
{"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214417, "start": 55214299}], "is_best_transcript": false, "name": "ENST00000450046", "start": 55177416, "translations": [{"cdna_coding_end": 691, "cdna_coding_start": 308, "domains": [{"name": "SSF52058", "regions": [{"end": 127, "start": 1}]}, {"name": "PF01030", "regions": [{"end": 114, "start": 4}]}]}]}, {"end": 55273591, "exons": [{"end": 55177651, "start": 55177540}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55238906, "start": 55238868}, {"end": 55240817, "start": 55240676}, {"end": 55241736, "start": 55241614}, {"end": 55242513, "start": 55242415}, {"end": 55249171, "start": 55248986}, {"end": 55259567, "start": 55259412}, {"end": 55260534, "start": 55260459}, {"end": 55266556, "start": 55266410}, {"end": 55268106, "start": 55268009}, {"end": 55269048, "start": 55268881}, {"end": 55269475, "start": 55269428}, {"end": 55270318, "start": 55270210}, {"end": 55273591, "start": 55272949}], "is_best_transcript": false, "name": "ENST00000454757", "start": 55177540, "translations": [{"cdna_coding_end": 3657, "cdna_coding_start": 184, "domains": [{"name": "SM00261", "regions": [{"end": 217, "start": 175}, {"end": 494, "start": 443}, {"end": 548, "start": 499}]}, {"name": "PF00069", "regions": [{"end": 911, "start": 659}]}, {"name": "SM00219", "regions": [{"end": 915, "start": 659}]}, {"name": "SSF56112", "regions": [{"end": 967, "start": 643}]}, {"name": "SM00220", "regions": [{"end": 916, 
"start": 659}]}, {"name": "PF01030", "regions": [{"end": 114, "start": 4}, {"end": 427, "start": 308}]}, {"name": "PS50311", "regions": [{"end": 211, "start": 134}]}, {"name": "PS50011", "regions": [{"end": 926, "start": 659}]}, {"name": "PR00109", "regions": [{"end": 750, "start": 737}, {"end": 792, "start": 774}, {"end": 833, "start": 823}, {"end": 864, "start": 842}, {"end": 908, "start": 886}]}, {"name": "SSF57184", "regions": [{"end": 286, "start": 129}, {"end": 585, "start": 452}]}, {"name": "PIRSF000619", "regions": [{"end": 1157, "start": 1}]}, {"name": "PF07714", "regions": [{"end": 912, "start": 661}]}, {"name": "SSF52058", "regions": [{"end": 158, "start": 1}, {"end": 467, "start": 275}]}, {"name": "PF00757", "regions": [{"end": 285, "start": 132}]}]}]}]}, {"aliases": ["DSTYK"], "chr": "1", "end": 205180727, "name": "ENSG00000133059", "start": 205111632, "strand": "-", "transcripts": [{"end": 205180727, "exons": [{"end": 205116873, "start": 205111632}, {"end": 205117467, "start": 205117333}, {"end": 205119898, "start": 205119808}, {"end": 205133083, "start": 205133055}, {"end": 205138960, "start": 205138291}, {"end": 205156934, "start": 205156546}, {"end": 205180727, "start": 205180399}], "is_best_transcript": false, "name": "ENST00000367160", "start": 205111632, "translations": [{"cdna_coding_end": 65, "cdna_coding_start": 1831, "domains": [{"name": "SM00220", "regions": [{"end": 565, "start": 337}]}, {"name": "SSF56112", "regions": [{"end": 585, "start": 452}]}, {"name": "PF00069", "regions": [{"end": 556, "start": 451}]}, {"name": "PF07714", "regions": [{"end": 558, "start": 471}]}, {"name": "PS50011", "regions": [{"end": 565, "start": 312}]}]}]}, {"end": 205180694, "exons": [{"end": 205116873, "start": 205111633}, {"end": 205119922, "start": 205119808}, {"end": 205126514, "start": 205126401}, {"end": 205128807, "start": 205128675}, {"end": 205129398, "start": 205129242}, {"end": 205130515, "start": 205130386}, {"end": 205131340, "start": 205131164}, 
{"end": 205132134, "start": 205132051}, {"end": 205133083, "start": 205132851}, {"end": 205138960, "start": 205138291}, {"end": 205156934, "start": 205156546}, {"end": 205180694, "start": 205180399}], "is_best_transcript": false, "name": "ENST00000367161", "start": 205111633, "translations": [{"cdna_coding_end": 32, "cdna_coding_start": 2686, "domains": [{"name": "PF07714", "regions": [{"end": 820, "start": 654}]}, {"name": "PS50011", "regions": [{"end": 884, "start": 652}]}, {"name": "SSF56112", "regions": [{"end": 853, "start": 627}]}, {"name": "SM00220", "regions": [{"end": 861, "start": 652}]}, {"name": "PF00069", "regions": [{"end": 824, "start": 654}]}, {"name": "SM00219", "regions": [{"end": 861, "start": 652}]}]}]}, {"end": 205180694, "exons": [{"end": 205116873, "start": 205111633}, {"end": 205117467, "start": 205117333}, {"end": 205119922, "start": 205119808}, {"end": 205126514, "start": 205126401}, {"end": 205128807, "start": 205128675}, {"end": 205129398, "start": 205129242}, {"end": 205130515, "start": 205130386}, {"end": 205131340, "start": 205131164}, {"end": 205132134, "start": 205132051}, {"end": 205133083, "start": 205132851}, {"end": 205138960, "start": 205138291}, {"end": 205156934, "start": 205156546}, {"end": 205180694, "start": 205180399}], "is_best_transcript": true, "name": "ENST00000367162", "start": 205111633, "translations": [{"cdna_coding_end": 32, "cdna_coding_start": 2821, "domains": [{"name": "PF07714", "regions": [{"end": 899, "start": 654}]}, {"name": "PS50011", "regions": [{"end": 906, "start": 652}]}, {"name": "SSF56112", "regions": [{"end": 897, "start": 638}]}, {"name": "SM00220", "regions": [{"end": 906, "start": 652}]}, {"name": "SM00219", "regions": [{"end": 906, "start": 652}]}, {"name": "PF00069", "regions": [{"end": 897, "start": 654}]}]}]}]}, {"aliases": ["NDUFA12"], "chr": "12", "end": 95397546, "name": "ENSG00000184752", "start": 95290831, "strand": "-", "transcripts": [{"end": 95397436, "exons": [{"end": 95291086, 
"start": 95290831}, {"end": 95318582, "start": 95318422}, {"end": 95322039, "start": 95321793}, {"end": 95396597, "start": 95396515}, {"end": 95397436, "start": 95397371}], "is_best_transcript": false, "name": "ENST00000552205", "start": 95290831}, {"end": 95397476, "exons": [{"end": 95365261, "start": 95365108}, {"end": 95396597, "start": 95396582}, {"end": 95397476, "start": 95397371}], "is_best_transcript": false, "name": "ENST00000547157", "start": 95365108, "translations": [{"cdna_coding_end": 21, "cdna_coding_start": 188}]}, {"end": 95397384, "exons": [{"end": 95365396, "start": 95365109}, {"end": 95388033, "start": 95387946}, {"end": 95390752, "start": 95390680}, {"end": 95396597, "start": 95396515}, {"end": 95397384, "start": 95397371}], "is_best_transcript": false, "name": "ENST00000551991", "start": 95365109, "translations": [{"cdna_coding_end": 1, "cdna_coding_start": 144, "domains": [{"name": "PF05071", "regions": [{"end": 33, "start": 12}]}]}]}, {"end": 95397546, "exons": [{"end": 95365396, "start": 95365109}, {"end": 95388033, "start": 95387946}, {"end": 95396597, "start": 95396515}, {"end": 95397546, "start": 95397371}], "is_best_transcript": true, "name": "ENST00000327772", "start": 95365109, "translations": [{"cdna_coding_end": 91, "cdna_coding_start": 528, "domains": [{"name": "PF05071", "regions": [{"end": 137, "start": 36}]}]}]}, {"end": 95397489, "exons": [{"end": 95365396, "start": 95365112}, {"end": 95396597, "start": 95396515}, {"end": 95397489, "start": 95397371}], "is_best_transcript": false, "name": "ENST00000547986", "start": 95365112, "translations": [{"cdna_coding_end": 34, "cdna_coding_start": 225, "domains": [{"name": "PF05071", "regions": [{"end": 53, "start": 36}]}]}]}, {"end": 95397524, "exons": [{"end": 95365396, "start": 95365254}, {"end": 95366265, "start": 95366171}, {"end": 95388033, "start": 95387946}, {"end": 95396597, "start": 95396515}, {"end": 95397524, "start": 95397371}], "is_best_transcript": false, "name": 
"ENST00000546788", "start": 95365254, "translations": [{"cdna_coding_end": 69, "cdna_coding_start": 368, "domains": [{"name": "PF05071", "regions": [{"end": 87, "start": 36}]}]}]}]}, {"aliases": ["FRMD6"], "chr": "14", "end": 52197445, "name": "ENSG00000139926", "start": 51955818, "strand": "+", "transcripts": [{"end": 52197177, "exons": [{"end": 51956138, "start": 51955855}, {"end": 52037128, "start": 52037066}, {"end": 52156653, "start": 52156409}, {"end": 52164950, "start": 52164860}, {"end": 52167853, "start": 52167774}, {"end": 52169306, "start": 52169230}, {"end": 52171653, "start": 52171467}, {"end": 52174951, "start": 52174796}, {"end": 52178314, "start": 52178249}, {"end": 52179269, "start": 52179201}, {"end": 52182217, "start": 52182043}, {"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188667}, {"end": 52192588, "start": 52192497}, {"end": 52197177, "start": 52194463}], "is_best_transcript": false, "name": "ENST00000356218", "start": 51955855, "translations": [{"cdna_coding_end": 2338, "cdna_coding_start": 494, "domains": [{"name": "PF09379", "regions": [{"end": 109, "start": 20}]}, {"name": "PF09380", "regions": [{"end": 322, "start": 237}]}, {"name": "SSF50729", "regions": [{"end": 375, "start": 219}]}, {"name": "SM00295", "regions": [{"end": 226, "start": 12}]}, {"name": "PS50057", "regions": [{"end": 320, "start": 16}]}, {"name": "PF00373", "regions": [{"end": 226, "start": 115}]}, {"name": "SSF47031", "regions": [{"end": 218, "start": 110}]}, {"name": "SSF54236", "regions": [{"end": 110, "start": 14}]}]}]}, {"end": 52197445, "exons": [{"end": 52118714, "start": 52118576}, {"end": 52156653, "start": 52156409}, {"end": 52164950, "start": 52164860}, {"end": 52167853, "start": 52167774}, {"end": 52169306, "start": 52169230}, {"end": 52171653, "start": 52171467}, {"end": 52174951, "start": 52174796}, {"end": 52178314, "start": 52178249}, {"end": 52179269, "start": 52179201}, {"end": 52182217, "start": 52182043}, {"end": 52187108, 
"start": 52186773}, {"end": 52188798, "start": 52188667}, {"end": 52192588, "start": 52192497}, {"end": 52197445, "start": 52194463}], "is_best_transcript": true, "name": "ENST00000395718", "start": 52118576, "translations": [{"cdna_coding_end": 2130, "cdna_coding_start": 286, "domains": [{"name": "PF00373", "regions": [{"end": 226, "start": 115}]}, {"name": "SSF47031", "regions": [{"end": 218, "start": 110}]}, {"name": "SSF54236", "regions": [{"end": 110, "start": 14}]}, {"name": "PS50057", "regions": [{"end": 320, "start": 16}]}, {"name": "SM00295", "regions": [{"end": 226, "start": 12}]}, {"name": "SSF50729", "regions": [{"end": 375, "start": 219}]}, {"name": "PF09380", "regions": [{"end": 322, "start": 237}]}, {"name": "PF09379", "regions": [{"end": 109, "start": 20}]}]}]}, {"end": 52195654, "exons": [{"end": 52118714, "start": 52118665}, {"end": 52156653, "start": 52156409}, {"end": 52164950, "start": 52164860}, {"end": 52167877, "start": 52167774}, {"end": 52169306, "start": 52169230}, {"end": 52171653, "start": 52171467}, {"end": 52174951, "start": 52174796}, {"end": 52178314, "start": 52178249}, {"end": 52179269, "start": 52179201}, {"end": 52182217, "start": 52182043}, {"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188667}, {"end": 52192588, "start": 52192497}, {"end": 52195654, "start": 52194463}], "is_best_transcript": false, "name": "ENST00000344768", "start": 52118665, "translations": [{"cdna_coding_end": 2065, "cdna_coding_start": 197, "domains": [{"name": "PF09380", "regions": [{"end": 330, "start": 245}]}, {"name": "PF09379", "regions": [{"end": 117, "start": 20}]}, {"name": "SSF47031", "regions": [{"end": 226, "start": 118}]}, {"name": "PF00373", "regions": [{"end": 234, "start": 123}]}, {"name": "SSF54236", "regions": [{"end": 118, "start": 14}]}, {"name": "PS50057", "regions": [{"end": 328, "start": 16}]}, {"name": "SM00295", "regions": [{"end": 234, "start": 12}]}, {"name": "SSF50729", "regions": [{"end": 383, "start": 
227}]}]}]}, {"end": 52164945, "exons": [{"end": 52118935, "start": 52118698}, {"end": 52156653, "start": 52156409}, {"end": 52164945, "start": 52164860}], "is_best_transcript": false, "name": "ENST00000554778", "start": 52118698}, {"end": 52174806, "exons": [{"end": 52164950, "start": 52164706}, {"end": 52167877, "start": 52167774}, {"end": 52169306, "start": 52169230}, {"end": 52171653, "start": 52171467}, {"end": 52174806, "start": 52174796}], "is_best_transcript": false, "name": "ENST00000555936", "start": 52164706}, {"end": 52197148, "exons": [{"end": 52164950, "start": 52164831}, {"end": 52167853, "start": 52167774}, {"end": 52169306, "start": 52169230}, {"end": 52171653, "start": 52171467}, {"end": 52174951, "start": 52174796}, {"end": 52178314, "start": 52178249}, {"end": 52179269, "start": 52179201}, {"end": 52182217, "start": 52182043}, {"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188667}, {"end": 52192588, "start": 52192497}, {"end": 52197148, "start": 52194463}], "is_best_transcript": false, "name": "ENST00000554167", "start": 52164831, "translations": [{"cdna_coding_end": 1775, "cdna_coding_start": 138, "domains": [{"name": "SSF50729", "regions": [{"end": 306, "start": 150}]}, {"name": "PS50057", "regions": [{"end": 251, "start": 1}]}, {"name": "SSF54236", "regions": [{"end": 41, "start": 1}]}, {"name": "SSF47031", "regions": [{"end": 149, "start": 41}]}, {"name": "PF00373", "regions": [{"end": 157, "start": 46}]}, {"name": "PF09380", "regions": [{"end": 253, "start": 168}]}]}]}, {"end": 52175062, "exons": [{"end": 52169306, "start": 52169266}, {"end": 52171653, "start": 52171467}, {"end": 52175062, "start": 52174796}], "is_best_transcript": false, "name": "ENST00000557405", "start": 52169266, "translations": [{"cdna_coding_end": 390, "cdna_coding_start": 1, "domains": [{"name": "PS50057", "regions": [{"end": 129, "start": 1}]}, {"name": "PF00373", "regions": [{"end": 124, "start": 13}]}, {"name": "SSF47031", "regions": [{"end": 
116, "start": 8}]}]}]}, {"end": 52187243, "exons": [{"end": 52179269, "start": 52179231}, {"end": 52182217, "start": 52182043}, {"end": 52187243, "start": 52186773}], "is_best_transcript": false, "name": "ENST00000555197", "start": 52179231, "translations": [{"cdna_coding_end": 618, "cdna_coding_start": 1, "domains": [{"name": "PF09380", "regions": [{"end": 60, "start": 2}]}, {"name": "PS50057", "regions": [{"end": 58, "start": 1}]}, {"name": "SSF50729", "regions": [{"end": 113, "start": 2}]}]}]}, {"end": 52192513, "exons": [{"end": 52184066, "start": 52183973}, {"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188673}, {"end": 52192513, "start": 52192497}], "is_best_transcript": false, "name": "ENST00000555703", "start": 52183973, "translations": [{"cdna_coding_end": 573, "cdna_coding_start": 145}]}, {"end": 52195487, "exons": [{"end": 52184066, "start": 52183973}, {"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188667}, {"end": 52192588, "start": 52192497}, {"end": 52195487, "start": 52194463}], "is_best_transcript": false, "name": "ENST00000553556", "start": 52183973, "translations": [{"cdna_coding_end": 939, "cdna_coding_start": 145}]}]}, {"aliases": ["PRKCB"], "chr": "16", "end": 24231932, "name": "ENSG00000166501", "start": 23847322, "strand": "+", "transcripts": [{"end": 24231932, "exons": [{"end": 23847669, "start": 23847322}, {"end": 23848727, "start": 23848696}, {"end": 23999911, "start": 23999829}, {"end": 24043568, "start": 24043457}, {"end": 24046868, "start": 24046740}, {"end": 24104268, "start": 24104112}, {"end": 24105618, "start": 24105484}, {"end": 24124390, "start": 24124294}, {"end": 24135302, "start": 24135156}, {"end": 24166178, "start": 24166005}, {"end": 24183682, "start": 24183591}, {"end": 24185901, "start": 24185839}, {"end": 24192249, "start": 24192111}, {"end": 24196512, "start": 24196432}, {"end": 24196888, "start": 24196781}, {"end": 24202551, "start": 24202411}, {"end": 24231932, "start": 
24231282}], "is_best_transcript": true, "name": "ENST00000321728", "start": 23847322, "translations": [{"cdna_coding_end": 2191, "cdna_coding_start": 176, "domains": [{"name": "SM00239", "regions": [{"end": 275, "start": 172}]}, {"name": "PF07714", "regions": [{"end": 583, "start": 344}]}, {"name": "SSF49562", "regions": [{"end": 288, "start": 157}]}, {"name": "SM00109", "regions": [{"end": 86, "start": 37}, {"end": 151, "start": 102}]}, {"name": "PS50011", "regions": [{"end": 600, "start": 342}]}, {"name": "PR00008", "regions": [{"end": 48, "start": 34}, {"end": 59, "start": 50}, {"end": 74, "start": 63}, {"end": 152, "start": 140}]}, {"name": "PF00433", "regions": [{"end": 666, "start": 623}]}, {"name": "SM00220", "regions": [{"end": 600, "start": 342}]}, {"name": "PF00168", "regions": [{"end": 259, "start": 175}]}, {"name": "SSF57889", "regions": [{"end": 92, "start": 6}, {"end": 157, "start": 101}]}, {"name": "PF00130", "regions": [{"end": 87, "start": 37}, {"end": 153, "start": 102}]}, {"name": "PS50081", "regions": [{"end": 86, "start": 36}, {"end": 151, "start": 101}]}, {"name": "SSF56112", "regions": [{"end": 627, "start": 317}]}, {"name": "PF00069", "regions": [{"end": 586, "start": 343}]}, {"name": "SM00219", "regions": [{"end": 576, "start": 342}]}, {"name": "PR00360", "regions": [{"end": 200, "start": 188}, {"end": 230, "start": 217}, {"end": 248, "start": 240}]}, {"name": "SM00133", "regions": [{"end": 664, "start": 601}]}, {"name": "PS50004", "regions": [{"end": 260, "start": 173}]}, {"name": "PIRSF000550", "regions": [{"end": 671, "start": 1}]}]}]}, {"end": 24231932, "exons": [{"end": 23847669, "start": 23847345}, {"end": 23848727, "start": 23848696}, {"end": 23999911, "start": 23999829}, {"end": 24043568, "start": 24043457}, {"end": 24046868, "start": 24046740}, {"end": 24104268, "start": 24104112}, {"end": 24105618, "start": 24105484}, {"end": 24124390, "start": 24124294}, {"end": 24135302, "start": 24135156}, {"end": 24166178, "start": 24166005}, 
{"end": 24183682, "start": 24183591}, {"end": 24185901, "start": 24185839}, {"end": 24192249, "start": 24192111}, {"end": 24196512, "start": 24196432}, {"end": 24196888, "start": 24196781}, {"end": 24202551, "start": 24202411}, {"end": 24231932, "start": 24225979}], "is_best_transcript": false, "name": "ENST00000303531", "start": 23847345, "translations": [{"cdna_coding_end": 2174, "cdna_coding_start": 153, "domains": [{"name": "SM00133", "regions": [{"end": 663, "start": 601}]}, {"name": "PS50004", "regions": [{"end": 260, "start": 173}]}, {"name": "PIRSF000550", "regions": [{"end": 672, "start": 1}]}, {"name": "PF00069", "regions": [{"end": 586, "start": 343}]}, {"name": "PR00360", "regions": [{"end": 200, "start": 188}, {"end": 230, "start": 217}, {"end": 248, "start": 240}]}, {"name": "SM00219", "regions": [{"end": 576, "start": 342}]}, {"name": "PS50081", "regions": [{"end": 86, "start": 36}, {"end": 151, "start": 101}]}, {"name": "SSF56112", "regions": [{"end": 627, "start": 317}]}, {"name": "SM00220", "regions": [{"end": 600, "start": 342}]}, {"name": "PF00433", "regions": [{"end": 664, "start": 627}]}, {"name": "PF00130", "regions": [{"end": 87, "start": 37}, {"end": 153, "start": 102}]}, {"name": "PF00168", "regions": [{"end": 259, "start": 175}]}, {"name": "SSF57889", "regions": [{"end": 92, "start": 6}, {"end": 157, "start": 101}]}, {"name": "PR00008", "regions": [{"end": 48, "start": 34}, {"end": 59, "start": 50}, {"end": 74, "start": 63}, {"end": 152, "start": 140}]}, {"name": "PS50011", "regions": [{"end": 600, "start": 342}]}, {"name": "SM00109", "regions": [{"end": 86, "start": 37}, {"end": 151, "start": 102}]}, {"name": "PF07714", "regions": [{"end": 583, "start": 344}]}, {"name": "SSF49562", "regions": [{"end": 288, "start": 157}]}, {"name": "SM00239", "regions": [{"end": 275, "start": 172}]}]}]}, {"end": 23880647, "exons": [{"end": 23847669, "start": 23847403}, {"end": 23880647, "start": 23880435}], "is_best_transcript": false, "name": 
"ENST00000498058", "start": 23847403, "translations": [{"cdna_coding_end": 268, "cdna_coding_start": 95, "domains": [{"name": "PR00008", "regions": [{"end": 48, "start": 34}, {"end": 57, "start": 50}]}, {"name": "PS50081", "regions": [{"end": 57, "start": 36}]}, {"name": "SSF57889", "regions": [{"end": 57, "start": 6}]}]}]}, {"end": 24124386, "exons": [{"end": 23848727, "start": 23848544}, {"end": 24104268, "start": 24104112}, {"end": 24105618, "start": 24105484}, {"end": 24124386, "start": 24124294}], "is_best_transcript": false, "name": "ENST00000498739", "start": 23848544}, {"end": 24192166, "exons": [{"end": 24163176, "start": 24163006}, {"end": 24166178, "start": 24166005}, {"end": 24183682, "start": 24183591}, {"end": 24185901, "start": 24185839}, {"end": 24192166, "start": 24192111}], "is_best_transcript": false, "name": "ENST00000472066", "start": 24163006}, {"end": 24202909, "exons": [{"end": 24196888, "start": 24196852}, {"end": 24202909, "start": 24202411}], "is_best_transcript": false, "name": "ENST00000466124", "start": 24196852}]}, {"aliases": ["GIMAP4"], "chr": "7", "end": 150271041, "name": "ENSG00000133574", "start": 150264365, "strand": "+", "transcripts": [{"end": 150271041, "exons": [{"end": 150264525, "start": 150264365}, {"end": 150267047, "start": 150266976}, {"end": 150271041, "start": 150269217}], "is_best_transcript": true, "name": "ENST00000255945", "start": 150264365, "translations": [{"cdna_coding_end": 1165, "cdna_coding_start": 176, "domains": [{"name": "PF04548", "regions": [{"end": 238, "start": 31}]}, {"name": "SSF52540", "regions": [{"end": 288, "start": 24}]}]}]}, {"end": 150270602, "exons": [{"end": 150264525, "start": 150264457}, {"end": 150267089, "start": 150266976}, {"end": 150270602, "start": 150269217}], "is_best_transcript": false, "name": "ENST00000461940", "start": 150264457, "translations": [{"cdna_coding_end": 1115, "cdna_coding_start": 84, "domains": [{"name": "PF04548", "regions": [{"end": 252, "start": 45}]}, 
{"name": "SSF52540", "regions": [{"end": 302, "start": 38}]}]}]}, {"end": 150269569, "exons": [{"end": 150264608, "start": 150264524}, {"end": 150267089, "start": 150266976}, {"end": 150269569, "start": 150269217}], "is_best_transcript": false, "name": "ENST00000479232", "start": 150264524, "translations": [{"cdna_coding_end": 552, "cdna_coding_start": 100, "domains": [{"name": "SSF52540", "regions": [{"end": 151, "start": 38}]}, {"name": "PF04548", "regions": [{"end": 151, "start": 45}]}]}]}]}, {"aliases": ["IL7"], "chr": "8", "end": 79717758, "name": "ENSG00000104432", "start": 79587978, "strand": "-", "transcripts": [{"end": 79717758, "exons": [{"end": 79646067, "start": 79645007}, {"end": 79648762, "start": 79648709}, {"end": 79650870, "start": 79650739}, {"end": 79652317, "start": 79652237}, {"end": 79710443, "start": 79710307}, {"end": 79717758, "start": 79717148}], "is_best_transcript": true, "name": "ENST00000263851", "start": 79645007, "translations": [{"cdna_coding_end": 602, "cdna_coding_start": 1135, "domains": [{"name": "PIRSF001942", "regions": [{"end": 177, "start": 1}]}, {"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 48, "start": 26}, {"end": 77, "start": 57}, {"end": 98, "start": 78}, {"end": 118, "start": 99}, {"end": 173, "start": 151}]}, {"name": "PF01415", "regions": [{"end": 173, "start": 28}]}, {"name": "SM00127", "regions": [{"end": 173, "start": 27}]}]}]}, {"end": 79717699, "exons": [{"end": 79646063, "start": 79645283}, {"end": 79648762, "start": 79648709}, {"end": 79650870, "start": 79650739}, {"end": 79652317, "start": 79652237}, {"end": 79659331, "start": 79659129}, {"end": 79710443, "start": 79710307}, {"end": 79717699, "start": 79717148}], "is_best_transcript": false, "name": "ENST00000518982", "start": 79645283, "translations": [{"cdna_coding_end": 543, "cdna_coding_start": 758, "domains": [{"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 48, "start": 26}]}, {"name": "PF01415", "regions": [{"end": 
54, "start": 28}]}]}]}, {"end": 79717163, "exons": [{"end": 79646067, "start": 79645900}, {"end": 79648762, "start": 79648709}, {"end": 79652317, "start": 79652237}, {"end": 79710443, "start": 79710307}, {"end": 79717163, "start": 79717148}], "is_best_transcript": false, "name": "ENST00000520269", "start": 79645900, "translations": [{"cdna_coding_end": 7, "cdna_coding_start": 408, "domains": [{"name": "PF01415", "regions": [{"end": 77, "start": 28}, {"end": 129, "start": 91}]}, {"name": "SM00127", "regions": [{"end": 129, "start": 27}]}, {"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 48, "start": 26}, {"end": 77, "start": 57}]}, {"name": "PIRSF001942", "regions": [{"end": 133, "start": 1}]}]}]}, {"end": 79717163, "exons": [{"end": 79646067, "start": 79645900}, {"end": 79648762, "start": 79648709}, {"end": 79652317, "start": 79652237}, {"end": 79710443, "start": 79710363}, {"end": 79717163, "start": 79717148}], "is_best_transcript": false, "name": "ENST00000520215", "start": 79645900, "translations": [{"cdna_coding_end": 7, "cdna_coding_start": 120, "domains": [{"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 37, "start": 26}]}]}]}, {"end": 79717686, "exons": [{"end": 79646067, "start": 79645900}, {"end": 79648762, "start": 79648709}, {"end": 79650870, "start": 79650739}, {"end": 79652317, "start": 79652237}, {"end": 79710443, "start": 79710363}, {"end": 79717686, "start": 79717148}], "is_best_transcript": false, "name": "ENST00000520317", "start": 79645900, "translations": [{"cdna_coding_end": 530, "cdna_coding_start": 643, "domains": [{"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 37, "start": 26}]}]}]}, {"end": 79652311, "exons": [{"end": 79646067, "start": 79645948}, {"end": 79652311, "start": 79652237}], "is_best_transcript": false, "name": "ENST00000541183", "start": 79645948, "translations": [{"cdna_coding_end": 1, "cdna_coding_start": 195, "domains": [{"name": "SM00127", "regions": [{"end": 60, "start": 
1}]}, {"name": "PF01415", "regions": [{"end": 60, "start": 1}]}]}]}, {"end": 79717758, "exons": [{"end": 79659331, "start": 79659263}, {"end": 79710443, "start": 79710307}, {"end": 79717758, "start": 79717148}], "is_best_transcript": false, "name": "ENST00000379113", "start": 79659263, "translations": [{"cdna_coding_end": 602, "cdna_coding_start": 817, "domains": [{"name": "PF01415", "regions": [{"end": 54, "start": 28}]}, {"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 48, "start": 26}]}]}]}]}, {"aliases": ["SVEP1"], "chr": "9", "end": 113342160, "name": "ENSG00000165124", "start": 113127531, "strand": "-", "transcripts": [{"end": 113342160, "exons": [{"end": 113128840, "start": 113127531}, {"end": 113132296, "start": 113132203}, {"end": 113137743, "start": 113137648}, {"end": 113139646, "start": 113139551}, {"end": 113141797, "start": 113141627}, {"end": 113148354, "start": 113148178}, {"end": 113149738, "start": 113149565}, {"end": 113151867, "start": 113151804}, {"end": 113163289, "start": 113163134}, {"end": 113166832, "start": 113166607}, {"end": 113171231, "start": 113168440}, {"end": 113174015, "start": 113173343}, {"end": 113190038, "start": 113189871}, {"end": 113191614, "start": 113191423}, {"end": 113192284, "start": 113192200}, {"end": 113192730, "start": 113192554}, {"end": 113194314, "start": 113194195}, {"end": 113194915, "start": 113194742}, {"end": 113196786, "start": 113196616}, {"end": 113197644, "start": 113197521}, {"end": 113198784, "start": 113198660}, {"end": 113206000, "start": 113205825}, {"end": 113208318, "start": 113208117}, {"end": 113209337, "start": 113209180}, {"end": 113212540, "start": 113212339}, {"end": 113213682, "start": 113213569}, {"end": 113217983, "start": 113217870}, {"end": 113219632, "start": 113219536}, {"end": 113220842, "start": 113220751}, {"end": 113221393, "start": 113221232}, {"end": 113228306, "start": 113228145}, {"end": 113231381, "start": 113231220}, {"end": 113233877, "start": 113233644}, 
{"end": 113234603, "start": 113234439}, {"end": 113238595, "start": 113238484}, {"end": 113242036, "start": 113241915}, {"end": 113243716, "start": 113243522}, {"end": 113244772, "start": 113244641}, {"end": 113245973, "start": 113245866}, {"end": 113252059, "start": 113251930}, {"end": 113259213, "start": 113259095}, {"end": 113261518, "start": 113261321}, {"end": 113265497, "start": 113265318}, {"end": 113275385, "start": 113275206}, {"end": 113276386, "start": 113276228}, {"end": 113308571, "start": 113308395}, {"end": 113312384, "start": 113312129}, {"end": 113342160, "start": 113341293}], "is_best_transcript": true, "name": "ENST00000401783", "start": 113127531, "translations": [{"cdna_coding_end": 338, "cdna_coding_start": 11053, "domains": [{"name": "SM00032", "regions": [{"end": 433, "start": 378}, {"end": 493, "start": 438}, {"end": 559, "start": 498}, {"end": 787, "start": 727}, {"end": 1685, "start": 1631}, {"end": 1743, "start": 1690}, {"end": 1842, "start": 1789}, {"end": 1900, "start": 1847}, {"end": 1958, "start": 1905}, {"end": 2016, "start": 1963}, {"end": 2078, "start": 2021}, {"end": 2141, "start": 2083}, {"end": 2199, "start": 2146}, {"end": 2259, "start": 2204}, {"end": 2318, "start": 2264}, {"end": 2376, "start": 2323}, {"end": 2435, "start": 2381}, {"end": 2493, "start": 2440}, {"end": 2551, "start": 2498}, {"end": 2608, "start": 2556}, {"end": 2712, "start": 2654}, {"end": 2770, "start": 2717}, {"end": 2828, "start": 2775}, {"end": 2886, "start": 2833}, {"end": 2944, "start": 2891}, {"end": 3002, "start": 2949}, {"end": 3059, "start": 3007}, {"end": 3117, "start": 3064}, {"end": 3176, "start": 3122}, {"end": 3236, "start": 3181}, {"end": 3294, "start": 3241}, {"end": 3352, "start": 3299}, {"end": 3411, "start": 3357}, {"end": 3468, "start": 3416}]}, {"name": "PF02494", "regions": [{"end": 642, "start": 561}, {"end": 721, "start": 644}]}, {"name": "PR00895", "regions": [{"end": 1530, "start": 1512}, {"end": 1558, "start": 1539}, {"end": 1592, 
"start": 1559}]}, {"name": "SSF57535", "regions": [{"end": 433, "start": 374}, {"end": 493, "start": 434}, {"end": 560, "start": 494}, {"end": 790, "start": 727}, {"end": 1746, "start": 1626}, {"end": 1842, "start": 1785}, {"end": 1900, "start": 1843}, {"end": 1958, "start": 1901}, {"end": 2016, "start": 1959}, {"end": 2078, "start": 2017}, {"end": 2199, "start": 2081}, {"end": 2318, "start": 2202}, {"end": 2377, "start": 2321}, {"end": 2437, "start": 2379}, {"end": 2551, "start": 2438}, {"end": 2616, "start": 2552}, {"end": 2712, "start": 2643}, {"end": 2828, "start": 2715}, {"end": 2886, "start": 2829}, {"end": 2944, "start": 2887}, {"end": 3117, "start": 2945}, {"end": 3176, "start": 3118}, {"end": 3229, "start": 3177}, {"end": 3475, "start": 3239}]}, {"name": "SSF49899", "regions": [{"end": 1632, "start": 1421}]}, {"name": "SM00159", "regions": [{"end": 1627, "start": 1420}]}, {"name": "PF00354", "regions": [{"end": 1620, "start": 1442}]}, {"name": "PF07699", "regions": [{"end": 360, "start": 310}, {"end": 1052, "start": 1005}, {"end": 1106, "start": 1059}, {"end": 1160, "start": 1113}]}, {"name": "PS50311", "regions": [{"end": 1409, "start": 1197}, {"end": 3554, "start": 3468}]}, {"name": "PS50825", "regions": [{"end": 642, "start": 560}, {"end": 724, "start": 643}]}, {"name": "PF00092", "regions": [{"end": 252, "start": 84}]}, {"name": "SSF57196", "regions": [{"end": 1267, "start": 1189}, {"end": 1305, "start": 1268}, {"end": 1342, "start": 1306}, {"end": 1423, "start": 1344}, {"end": 1786, "start": 1735}, {"end": 3506, "start": 3463}, {"end": 3535, "start": 3507}, {"end": 3570, "start": 3537}]}, {"name": "PS50026", "regions": [{"end": 1229, "start": 1193}, {"end": 1267, "start": 1231}, {"end": 1305, "start": 1269}, {"end": 1343, "start": 1307}, {"end": 1381, "start": 1345}, {"end": 1419, "start": 1383}, {"end": 1784, "start": 1745}, {"end": 3532, "start": 3500}, {"end": 3564, "start": 3533}]}, {"name": "SM00181", "regions": [{"end": 1229, "start": 1196}, 
{"end": 1267, "start": 1234}, {"end": 1305, "start": 1272}, {"end": 1343, "start": 1310}, {"end": 1381, "start": 1348}, {"end": 1419, "start": 1386}, {"end": 1784, "start": 1748}, {"end": 3500, "start": 3471}, {"end": 3532, "start": 3503}, {"end": 3564, "start": 3535}]}, {"name": "SM00179", "regions": [{"end": 1229, "start": 1196}, {"end": 1267, "start": 1231}, {"end": 1305, "start": 1269}, {"end": 1343, "start": 1307}, {"end": 1381, "start": 1345}, {"end": 1419, "start": 1383}, {"end": 1784, "start": 1745}, {"end": 3532, "start": 3504}]}, {"name": "SSF57184", "regions": [{"end": 440, "start": 269}, {"end": 1144, "start": 988}]}, {"name": "PF07645", "regions": [{"end": 1783, "start": 1745}]}, {"name": "PS50923", "regions": [{"end": 435, "start": 376}, {"end": 495, "start": 436}, {"end": 561, "start": 496}, {"end": 789, "start": 725}, {"end": 1687, "start": 1629}, {"end": 1745, "start": 1688}, {"end": 1844, "start": 1787}, {"end": 1902, "start": 1845}, {"end": 1960, "start": 1903}, {"end": 2018, "start": 1961}, {"end": 2080, "start": 2019}, {"end": 2143, "start": 2081}, {"end": 2201, "start": 2144}, {"end": 2261, "start": 2202}, {"end": 2320, "start": 2262}, {"end": 2378, "start": 2321}, {"end": 2437, "start": 2379}, {"end": 2495, "start": 2438}, {"end": 2553, "start": 2496}, {"end": 2610, "start": 2554}, {"end": 2714, "start": 2663}, {"end": 2772, "start": 2715}, {"end": 2830, "start": 2773}, {"end": 2888, "start": 2831}, {"end": 2946, "start": 2889}, {"end": 3004, "start": 2947}, {"end": 3061, "start": 3005}, {"end": 3119, "start": 3062}, {"end": 3178, "start": 3120}, {"end": 3238, "start": 3179}, {"end": 3296, "start": 3239}, {"end": 3354, "start": 3297}, {"end": 3413, "start": 3355}, {"end": 3470, "start": 3414}]}, {"name": "SM00327", "regions": [{"end": 260, "start": 81}]}, {"name": "PF00008", "regions": [{"end": 1226, "start": 1197}, {"end": 1265, "start": 1235}, {"end": 1302, "start": 1273}, {"end": 1379, "start": 1349}, {"end": 1417, "start": 1387}]}, 
{"name": "PS50234", "regions": [{"end": 264, "start": 83}]}, {"name": "PF07974", "regions": [{"end": 1266, "start": 1235}, {"end": 3499, "start": 3475}, {"end": 3531, "start": 3507}, {"end": 3563, "start": 3536}]}, {"name": "SSF53300", "regions": [{"end": 262, "start": 79}]}, {"name": "PF00084", "regions": [{"end": 430, "start": 378}, {"end": 493, "start": 438}, {"end": 1685, "start": 1628}, {"end": 1743, "start": 1690}, {"end": 1842, "start": 1789}, {"end": 1900, "start": 1847}, {"end": 1958, "start": 1905}, {"end": 2016, "start": 1963}, {"end": 2078, "start": 2021}, {"end": 2136, "start": 2083}, {"end": 2199, "start": 2146}, {"end": 2259, "start": 2204}, {"end": 2318, "start": 2264}, {"end": 2376, "start": 2323}, {"end": 2435, "start": 2381}, {"end": 2493, "start": 2440}, {"end": 2551, "start": 2498}, {"end": 2608, "start": 2556}, {"end": 2712, "start": 2667}, {"end": 2770, "start": 2717}, {"end": 2828, "start": 2775}, {"end": 2886, "start": 2833}, {"end": 2944, "start": 2891}, {"end": 3002, "start": 2949}, {"end": 3059, "start": 3007}, {"end": 3117, "start": 3084}, {"end": 3172, "start": 3122}, {"end": 3236, "start": 3181}, {"end": 3290, "start": 3241}, {"end": 3352, "start": 3299}, {"end": 3411, "start": 3357}, {"end": 3468, "start": 3416}]}]}]}, {"end": 113190038, "exons": [{"end": 113128840, "start": 113127536}, {"end": 113132296, "start": 113132203}, {"end": 113137743, "start": 113137648}, {"end": 113139646, "start": 113139551}, {"end": 113141797, "start": 113141627}, {"end": 113148354, "start": 113148178}, {"end": 113149738, "start": 113149565}, {"end": 113151867, "start": 113151804}, {"end": 113163289, "start": 113163134}, {"end": 113166832, "start": 113166607}, {"end": 113171231, "start": 113168440}, {"end": 113174015, "start": 113173343}, {"end": 113190038, "start": 113189871}], "is_best_transcript": false, "name": "ENST00000297826", "start": 113127536, "translations": [{"cdna_coding_end": 416, "cdna_coding_start": 4909, "domains": [{"name": "PF00084", 
"regions": [{"end": 62, "start": 9}, {"end": 125, "start": 72}, {"end": 185, "start": 130}, {"end": 244, "start": 190}, {"end": 302, "start": 249}, {"end": 361, "start": 307}, {"end": 419, "start": 366}, {"end": 477, "start": 424}, {"end": 534, "start": 482}, {"end": 638, "start": 593}, {"end": 696, "start": 643}, {"end": 754, "start": 701}, {"end": 812, "start": 759}, {"end": 870, "start": 817}, {"end": 928, "start": 875}, {"end": 985, "start": 933}, {"end": 1043, "start": 1010}, {"end": 1098, "start": 1048}, {"end": 1162, "start": 1107}, {"end": 1216, "start": 1167}, {"end": 1278, "start": 1225}, {"end": 1337, "start": 1283}, {"end": 1394, "start": 1342}]}, {"name": "PF07974", "regions": [{"end": 1425, "start": 1401}, {"end": 1457, "start": 1433}, {"end": 1489, "start": 1462}]}, {"name": "PF00008", "regions": [{"end": 1456, "start": 1427}]}, {"name": "PS50923", "regions": [{"end": 69, "start": 7}, {"end": 127, "start": 70}, {"end": 187, "start": 128}, {"end": 246, "start": 188}, {"end": 304, "start": 247}, {"end": 363, "start": 305}, {"end": 421, "start": 364}, {"end": 479, "start": 422}, {"end": 536, "start": 480}, {"end": 640, "start": 589}, {"end": 698, "start": 641}, {"end": 756, "start": 699}, {"end": 814, "start": 757}, {"end": 872, "start": 815}, {"end": 930, "start": 873}, {"end": 987, "start": 931}, {"end": 1045, "start": 988}, {"end": 1104, "start": 1046}, {"end": 1164, "start": 1105}, {"end": 1222, "start": 1165}, {"end": 1280, "start": 1223}, {"end": 1339, "start": 1281}, {"end": 1396, "start": 1340}]}, {"name": "SM00181", "regions": [{"end": 1426, "start": 1397}, {"end": 1458, "start": 1429}, {"end": 1490, "start": 1461}]}, {"name": "SSF57196", "regions": [{"end": 1432, "start": 1389}, {"end": 1461, "start": 1433}, {"end": 1496, "start": 1463}]}, {"name": "PS50026", "regions": [{"end": 1458, "start": 1426}, {"end": 1490, "start": 1459}]}, {"name": "PS50311", "regions": [{"end": 1480, "start": 1394}]}, {"name": "SSF57535", "regions": [{"end": 125, 
"start": 7}, {"end": 244, "start": 128}, {"end": 303, "start": 247}, {"end": 363, "start": 305}, {"end": 477, "start": 364}, {"end": 542, "start": 478}, {"end": 638, "start": 569}, {"end": 754, "start": 641}, {"end": 812, "start": 755}, {"end": 870, "start": 813}, {"end": 1043, "start": 871}, {"end": 1102, "start": 1044}, {"end": 1155, "start": 1103}, {"end": 1401, "start": 1165}]}, {"name": "SM00032", "regions": [{"end": 67, "start": 9}, {"end": 125, "start": 72}, {"end": 185, "start": 130}, {"end": 244, "start": 190}, {"end": 302, "start": 249}, {"end": 361, "start": 307}, {"end": 419, "start": 366}, {"end": 477, "start": 424}, {"end": 534, "start": 482}, {"end": 638, "start": 580}, {"end": 696, "start": 643}, {"end": 754, "start": 701}, {"end": 812, "start": 759}, {"end": 870, "start": 817}, {"end": 928, "start": 875}, {"end": 985, "start": 933}, {"end": 1043, "start": 990}, {"end": 1102, "start": 1048}, {"end": 1162, "start": 1107}, {"end": 1220, "start": 1167}, {"end": 1278, "start": 1225}, {"end": 1337, "start": 1283}, {"end": 1394, "start": 1342}]}]}]}, {"end": 113342018, "exons": [{"end": 113128840, "start": 113127536}, {"end": 113132296, "start": 113132203}, {"end": 113137743, "start": 113137648}, {"end": 113139646, "start": 113139551}, {"end": 113141797, "start": 113141627}, {"end": 113148354, "start": 113148178}, {"end": 113149738, "start": 113149565}, {"end": 113151867, "start": 113151804}, {"end": 113163289, "start": 113163134}, {"end": 113166832, "start": 113166607}, {"end": 113171231, "start": 113168440}, {"end": 113174015, "start": 113173343}, {"end": 113190038, "start": 113189871}, {"end": 113191614, "start": 113191423}, {"end": 113192284, "start": 113192200}, {"end": 113192730, "start": 113192554}, {"end": 113194314, "start": 113194195}, {"end": 113194915, "start": 113194742}, {"end": 113196786, "start": 113196616}, {"end": 113197644, "start": 113197521}, {"end": 113198784, "start": 113198660}, {"end": 113206000, "start": 113205825}, {"end": 
113208318, "start": 113208117}, {"end": 113209337, "start": 113209180}, {"end": 113212540, "start": 113212339}, {"end": 113213682, "start": 113213569}, {"end": 113217983, "start": 113217870}, {"end": 113219632, "start": 113219536}, {"end": 113220842, "start": 113220751}, {"end": 113221393, "start": 113221232}, {"end": 113228306, "start": 113228145}, {"end": 113231381, "start": 113231220}, {"end": 113233877, "start": 113233644}, {"end": 113234603, "start": 113234439}, {"end": 113238595, "start": 113238484}, {"end": 113242036, "start": 113241915}, {"end": 113243716, "start": 113243522}, {"end": 113244772, "start": 113244641}, {"end": 113245973, "start": 113245866}, {"end": 113252059, "start": 113251930}, {"end": 113259213, "start": 113259095}, {"end": 113261518, "start": 113261321}, {"end": 113265497, "start": 113265318}, {"end": 113275385, "start": 113275206}, {"end": 113276386, "start": 113276228}, {"end": 113308571, "start": 113308395}, {"end": 113312384, "start": 113312129}, {"end": 113342018, "start": 113341293}], "is_best_transcript": false, "name": "ENST00000374469", "start": 113127536, "translations": [{"cdna_coding_end": 265, "cdna_coding_start": 10911, "domains": [{"name": "SSF57535", "regions": [{"end": 410, "start": 351}, {"end": 470, "start": 411}, {"end": 537, "start": 471}, {"end": 767, "start": 704}, {"end": 1723, "start": 1603}, {"end": 1819, "start": 1762}, {"end": 1877, "start": 1820}, {"end": 1935, "start": 1878}, {"end": 1993, "start": 1936}, {"end": 2055, "start": 1994}, {"end": 2176, "start": 2058}, {"end": 2295, "start": 2179}, {"end": 2354, "start": 2298}, {"end": 2414, "start": 2356}, {"end": 2528, "start": 2415}, {"end": 2593, "start": 2529}, {"end": 2689, "start": 2620}, {"end": 2805, "start": 2692}, {"end": 2863, "start": 2806}, {"end": 2921, "start": 2864}, {"end": 3094, "start": 2922}, {"end": 3153, "start": 3095}, {"end": 3206, "start": 3154}, {"end": 3452, "start": 3216}]}, {"name": "SSF49899", "regions": [{"end": 1609, "start": 
1398}]}, {"name": "SM00159", "regions": [{"end": 1604, "start": 1397}]}, {"name": "PF00354", "regions": [{"end": 1597, "start": 1419}]}, {"name": "PR00895", "regions": [{"end": 1507, "start": 1489}, {"end": 1535, "start": 1516}, {"end": 1569, "start": 1536}]}, {"name": "PF02494", "regions": [{"end": 619, "start": 538}, {"end": 698, "start": 621}]}, {"name": "SM00032", "regions": [{"end": 410, "start": 355}, {"end": 470, "start": 415}, {"end": 536, "start": 475}, {"end": 764, "start": 704}, {"end": 1662, "start": 1608}, {"end": 1720, "start": 1667}, {"end": 1819, "start": 1766}, {"end": 1877, "start": 1824}, {"end": 1935, "start": 1882}, {"end": 1993, "start": 1940}, {"end": 2055, "start": 1998}, {"end": 2118, "start": 2060}, {"end": 2176, "start": 2123}, {"end": 2236, "start": 2181}, {"end": 2295, "start": 2241}, {"end": 2353, "start": 2300}, {"end": 2412, "start": 2358}, {"end": 2470, "start": 2417}, {"end": 2528, "start": 2475}, {"end": 2585, "start": 2533}, {"end": 2689, "start": 2631}, {"end": 2747, "start": 2694}, {"end": 2805, "start": 2752}, {"end": 2863, "start": 2810}, {"end": 2921, "start": 2868}, {"end": 2979, "start": 2926}, {"end": 3036, "start": 2984}, {"end": 3094, "start": 3041}, {"end": 3153, "start": 3099}, {"end": 3213, "start": 3158}, {"end": 3271, "start": 3218}, {"end": 3329, "start": 3276}, {"end": 3388, "start": 3334}, {"end": 3445, "start": 3393}]}, {"name": "SM00179", "regions": [{"end": 1206, "start": 1173}, {"end": 1244, "start": 1208}, {"end": 1282, "start": 1246}, {"end": 1320, "start": 1284}, {"end": 1358, "start": 1322}, {"end": 1396, "start": 1360}, {"end": 1761, "start": 1722}, {"end": 3509, "start": 3481}]}, {"name": "SSF57184", "regions": [{"end": 417, "start": 246}, {"end": 1121, "start": 965}]}, {"name": "SSF57196", "regions": [{"end": 1244, "start": 1166}, {"end": 1282, "start": 1245}, {"end": 1319, "start": 1283}, {"end": 1400, "start": 1321}, {"end": 1763, "start": 1712}, {"end": 3483, "start": 3440}, {"end": 3512, "start": 
3484}, {"end": 3547, "start": 3514}]}, {"name": "PS50026", "regions": [{"end": 1206, "start": 1170}, {"end": 1244, "start": 1208}, {"end": 1282, "start": 1246}, {"end": 1320, "start": 1284}, {"end": 1358, "start": 1322}, {"end": 1396, "start": 1360}, {"end": 1761, "start": 1722}, {"end": 3509, "start": 3477}, {"end": 3541, "start": 3510}]}, {"name": "SM00181", "regions": [{"end": 1206, "start": 1173}, {"end": 1244, "start": 1211}, {"end": 1282, "start": 1249}, {"end": 1320, "start": 1287}, {"end": 1358, "start": 1325}, {"end": 1396, "start": 1363}, {"end": 1761, "start": 1725}, {"end": 3477, "start": 3448}, {"end": 3509, "start": 3480}, {"end": 3541, "start": 3512}]}, {"name": "PF00092", "regions": [{"end": 229, "start": 61}]}, {"name": "PS50825", "regions": [{"end": 619, "start": 537}, {"end": 701, "start": 620}]}, {"name": "PS50311", "regions": [{"end": 1386, "start": 1174}, {"end": 3531, "start": 3445}]}, {"name": "PF07699", "regions": [{"end": 337, "start": 287}, {"end": 1029, "start": 982}, {"end": 1083, "start": 1036}, {"end": 1137, "start": 1090}]}, {"name": "PF00008", "regions": [{"end": 1203, "start": 1174}, {"end": 1242, "start": 1212}, {"end": 1279, "start": 1250}, {"end": 1356, "start": 1326}, {"end": 1394, "start": 1364}]}, {"name": "SM00327", "regions": [{"end": 237, "start": 58}]}, {"name": "PS50923", "regions": [{"end": 412, "start": 353}, {"end": 472, "start": 413}, {"end": 538, "start": 473}, {"end": 766, "start": 702}, {"end": 1664, "start": 1606}, {"end": 1722, "start": 1665}, {"end": 1821, "start": 1764}, {"end": 1879, "start": 1822}, {"end": 1937, "start": 1880}, {"end": 1995, "start": 1938}, {"end": 2057, "start": 1996}, {"end": 2120, "start": 2058}, {"end": 2178, "start": 2121}, {"end": 2238, "start": 2179}, {"end": 2297, "start": 2239}, {"end": 2355, "start": 2298}, {"end": 2414, "start": 2356}, {"end": 2472, "start": 2415}, {"end": 2530, "start": 2473}, {"end": 2587, "start": 2531}, {"end": 2691, "start": 2640}, {"end": 2749, "start": 
2692}, {"end": 2807, "start": 2750}, {"end": 2865, "start": 2808}, {"end": 2923, "start": 2866}, {"end": 2981, "start": 2924}, {"end": 3038, "start": 2982}, {"end": 3096, "start": 3039}, {"end": 3155, "start": 3097}, {"end": 3215, "start": 3156}, {"end": 3273, "start": 3216}, {"end": 3331, "start": 3274}, {"end": 3390, "start": 3332}, {"end": 3447, "start": 3391}]}, {"name": "PF07645", "regions": [{"end": 1760, "start": 1722}]}, {"name": "SSF53300", "regions": [{"end": 239, "start": 56}]}, {"name": "PF00084", "regions": [{"end": 407, "start": 355}, {"end": 470, "start": 415}, {"end": 1662, "start": 1605}, {"end": 1720, "start": 1667}, {"end": 1819, "start": 1766}, {"end": 1877, "start": 1824}, {"end": 1935, "start": 1882}, {"end": 1993, "start": 1940}, {"end": 2055, "start": 1998}, {"end": 2113, "start": 2060}, {"end": 2176, "start": 2123}, {"end": 2236, "start": 2181}, {"end": 2295, "start": 2241}, {"end": 2353, "start": 2300}, {"end": 2412, "start": 2358}, {"end": 2470, "start": 2417}, {"end": 2528, "start": 2475}, {"end": 2585, "start": 2533}, {"end": 2689, "start": 2644}, {"end": 2747, "start": 2694}, {"end": 2805, "start": 2752}, {"end": 2863, "start": 2810}, {"end": 2921, "start": 2868}, {"end": 2979, "start": 2926}, {"end": 3036, "start": 2984}, {"end": 3094, "start": 3061}, {"end": 3149, "start": 3099}, {"end": 3213, "start": 3158}, {"end": 3267, "start": 3218}, {"end": 3329, "start": 3276}, {"end": 3388, "start": 3334}, {"end": 3445, "start": 3393}]}, {"name": "PF07974", "regions": [{"end": 1243, "start": 1212}, {"end": 3476, "start": 3452}, {"end": 3508, "start": 3484}, {"end": 3540, "start": 3513}]}, {"name": "PS50234", "regions": [{"end": 241, "start": 60}]}]}]}, {"end": 113341823, "exons": [{"end": 113206000, "start": 113204759}, {"end": 113208318, "start": 113208117}, {"end": 113209337, "start": 113209180}, {"end": 113212540, "start": 113212339}, {"end": 113213682, "start": 113213569}, {"end": 113217983, "start": 113217870}, {"end": 113219632, 
"start": 113219536}, {"end": 113220399, "start": 113220395}, {"end": 113220842, "start": 113220756}, {"end": 113221393, "start": 113221232}, {"end": 113228306, "start": 113228145}, {"end": 113231381, "start": 113231220}, {"end": 113233877, "start": 113233644}, {"end": 113234603, "start": 113234439}, {"end": 113238595, "start": 113238484}, {"end": 113242036, "start": 113241915}, {"end": 113243716, "start": 113243522}, {"end": 113244772, "start": 113244641}, {"end": 113245973, "start": 113245866}, {"end": 113252059, "start": 113251930}, {"end": 113259213, "start": 113259095}, {"end": 113261518, "start": 113261321}, {"end": 113265497, "start": 113265318}, {"end": 113275385, "start": 113275206}, {"end": 113276386, "start": 113276228}, {"end": 113308571, "start": 113308395}, {"end": 113312384, "start": 113312129}, {"end": 113341823, "start": 113341293}], "is_best_transcript": false, "name": "ENST00000302728", "start": 113204759, "translations": [{"cdna_coding_end": 1, "cdna_coding_start": 4650, "domains": [{"name": "PS50825", "regions": [{"end": 642, "start": 560}, {"end": 724, "start": 643}]}, {"name": "PF07699", "regions": [{"end": 360, "start": 310}, {"end": 1052, "start": 1005}, {"end": 1106, "start": 1059}, {"end": 1160, "start": 1113}]}, {"name": "PS50311", "regions": [{"end": 1409, "start": 1197}]}, {"name": "SM00181", "regions": [{"end": 1229, "start": 1196}, {"end": 1267, "start": 1234}, {"end": 1305, "start": 1272}, {"end": 1343, "start": 1310}, {"end": 1381, "start": 1348}, {"end": 1419, "start": 1386}]}, {"name": "SSF57196", "regions": [{"end": 1267, "start": 1189}, {"end": 1305, "start": 1268}, {"end": 1342, "start": 1306}, {"end": 1423, "start": 1344}]}, {"name": "PS50026", "regions": [{"end": 1229, "start": 1193}, {"end": 1267, "start": 1231}, {"end": 1305, "start": 1269}, {"end": 1343, "start": 1307}, {"end": 1381, "start": 1345}, {"end": 1419, "start": 1383}]}, {"name": "SSF57184", "regions": [{"end": 440, "start": 269}, {"end": 1144, "start": 988}]}, 
{"name": "SM00179", "regions": [{"end": 1229, "start": 1196}, {"end": 1267, "start": 1231}, {"end": 1305, "start": 1269}, {"end": 1343, "start": 1307}, {"end": 1381, "start": 1345}, {"end": 1419, "start": 1383}]}, {"name": "PF00092", "regions": [{"end": 252, "start": 84}]}, {"name": "SM00032", "regions": [{"end": 433, "start": 378}, {"end": 493, "start": 438}, {"end": 559, "start": 498}, {"end": 787, "start": 727}]}, {"name": "PF02494", "regions": [{"end": 642, "start": 561}, {"end": 721, "start": 644}]}, {"name": "PR00010", "regions": [{"end": 1318, "start": 1307}, {"end": 1364, "start": 1357}, {"end": 1413, "start": 1403}, {"end": 1420, "start": 1414}]}, {"name": "PF00354", "regions": [{"end": 1532, "start": 1442}]}, {"name": "SSF57535", "regions": [{"end": 433, "start": 374}, {"end": 493, "start": 434}, {"end": 560, "start": 494}, {"end": 790, "start": 727}]}, {"name": "SSF49899", "regions": [{"end": 1547, "start": 1421}]}, {"name": "PS50234", "regions": [{"end": 264, "start": 83}]}, {"name": "SSF53300", "regions": [{"end": 262, "start": 79}]}, {"name": "PF00084", "regions": [{"end": 430, "start": 378}, {"end": 493, "start": 438}]}, {"name": "PS50923", "regions": [{"end": 435, "start": 376}, {"end": 495, "start": 436}, {"end": 561, "start": 496}, {"end": 789, "start": 725}]}, {"name": "PF07645", "regions": [{"end": 1262, "start": 1231}, {"end": 1338, "start": 1308}]}, {"name": "PF00008", "regions": [{"end": 1226, "start": 1197}, {"end": 1265, "start": 1235}, {"end": 1302, "start": 1273}, {"end": 1337, "start": 1311}, {"end": 1379, "start": 1349}, {"end": 1417, "start": 1387}]}, {"name": "SM00327", "regions": [{"end": 260, "start": 81}]}]}]}, {"end": 113342160, "exons": [{"end": 113238595, "start": 113238163}, {"end": 113242036, "start": 113241915}, {"end": 113243716, "start": 113243522}, {"end": 113244772, "start": 113244641}, {"end": 113245973, "start": 113245866}, {"end": 113252059, "start": 113251930}, {"end": 113259213, "start": 113259095}, {"end": 
113261518, "start": 113261321}, {"end": 113265497, "start": 113265318}, {"end": 113275385, "start": 113275206}, {"end": 113276386, "start": 113276228}, {"end": 113308571, "start": 113308395}, {"end": 113312384, "start": 113312129}, {"end": 113342160, "start": 113341293}], "is_best_transcript": false, "name": "ENST00000374461", "start": 113238163, "translations": [{"cdna_coding_end": 407, "cdna_coding_start": 2944, "domains": [{"name": "PF02494", "regions": [{"end": 619, "start": 538}, {"end": 698, "start": 621}]}, {"name": "SM00032", "regions": [{"end": 410, "start": 355}, {"end": 470, "start": 415}, {"end": 536, "start": 475}, {"end": 764, "start": 704}]}, {"name": "SSF57535", "regions": [{"end": 410, "start": 351}, {"end": 470, "start": 411}, {"end": 537, "start": 471}, {"end": 767, "start": 704}]}, {"name": "PF07699", "regions": [{"end": 337, "start": 287}]}, {"name": "PS50825", "regions": [{"end": 619, "start": 537}, {"end": 701, "start": 620}]}, {"name": "PF00092", "regions": [{"end": 229, "start": 61}]}, {"name": "SSF57184", "regions": [{"end": 417, "start": 246}]}, {"name": "PS50923", "regions": [{"end": 412, "start": 353}, {"end": 472, "start": 413}, {"end": 538, "start": 473}, {"end": 766, "start": 702}]}, {"name": "SM00327", "regions": [{"end": 237, "start": 58}]}, {"name": "PS50234", "regions": [{"end": 241, "start": 60}]}, {"name": "SSF53300", "regions": [{"end": 239, "start": 56}]}, {"name": "PF00084", "regions": [{"end": 407, "start": 355}, {"end": 470, "start": 415}]}]}]}]}, {"aliases": ["ARID1B"], "chr": "6", "end": 157530401, "name": "ENSG00000049618", "start": 157099063, "strand": "+", "transcripts": [{"end": 157529495, "exons": [{"end": 157100605, "start": 157099063}, {"end": 157150555, "start": 157150361}, {"end": 157192786, "start": 157192748}, {"end": 157222659, "start": 157222510}, {"end": 157256710, "start": 157256600}, {"end": 157406039, "start": 157405796}, {"end": 157431695, "start": 157431606}, {"end": 157454341, "start": 157454162}, 
{"end": 157470085, "start": 157469758}, {"end": 157488319, "start": 157488174}, {"end": 157495251, "start": 157495142}, {"end": 157502312, "start": 157502103}, {"end": 157505569, "start": 157505365}, {"end": 157510914, "start": 157510776}, {"end": 157511344, "start": 157511172}, {"end": 157517449, "start": 157517299}, {"end": 157520041, "start": 157519945}, {"end": 157522622, "start": 157521839}, {"end": 157525130, "start": 157525000}, {"end": 157529495, "start": 157527301}], "is_best_transcript": true, "name": "ENST00000346085", "start": 157099063, "translations": [{"cdna_coding_end": 6751, "cdna_coding_start": 2, "domains": [{"name": "PF12031", "regions": [{"end": 2195, "start": 1939}]}, {"name": "PS50324", "regions": [{"end": 57, "start": 35}, {"end": 784, "start": 697}]}, {"name": "PF01388", "regions": [{"end": 1153, "start": 1065}]}, {"name": "PS50099", "regions": [{"end": 820, "start": 715}, {"end": 1610, "start": 1472}]}, {"name": "SSF48371", "regions": [{"end": 2220, "start": 2075}]}, {"name": "PS50316", "regions": [{"end": 104, "start": 81}]}, {"name": "PS50322", "regions": [{"end": 131, "start": 107}, {"end": 646, "start": 574}]}, {"name": "PS51011", "regions": [{"end": 1157, "start": 1066}]}, {"name": "PS50310", "regions": [{"end": 47, "start": 2}, {"end": 493, "start": 329}]}, {"name": "PS50315", "regions": [{"end": 401, "start": 141}]}, {"name": "SSF46774", "regions": [{"end": 1168, "start": 1049}]}, {"name": "SM00501", "regions": [{"end": 1158, "start": 1067}]}]}]}]}]} \ No newline at end of file diff --git a/tests/tools/test_convert_annotations_format.py b/tests/tools/test_convert_annotations_format.py index 0f837b30..a5530dd7 100644 --- a/tests/tools/test_convert_annotations_format.py +++ b/tests/tools/test_convert_annotations_format.py @@ -1,22 +1,61 @@ +import json import os -from tools.convert_annotations_format import convert_gff2_to_mavis, convert_gff3_to_mavis +import pytest +from tools.convert_annotations_format import ( + 
convert_gff2_to_mavis, + convert_gff3_to_mavis, + convert_mavis_json_2to3, +) -def test_load_gff3(): - input = os.path.join(os.path.dirname(__file__), 'data', 'Homo_sapiens.GRCh38.105.chr.kras.gtf') - data = convert_gff2_to_mavis(input, False) - assert len(data['genes']) == 2 - assert sum([len(g['transcripts']) for g in data['genes']]) == 15 - exons = 0 - for gene in data['genes']: - for transcript in gene['transcripts']: - exons += len(transcript['exons']) - assert exons == 62 +CONVERTERS = { + 'gff3': convert_gff3_to_mavis, + 'gtf': convert_gff2_to_mavis, + 'v2-json': convert_mavis_json_2to3, +} -def test_load_gtf(): - input = os.path.join(os.path.dirname(__file__), 'data', 'Homo_sapiens.GRCh38.105.kras.gff3') - data = convert_gff3_to_mavis(input, False) - assert len(data['genes']) == 4 - assert sum([len(g['transcripts']) for g in data['genes']]) == 15 +def sort_elements(data): + """ + Sort lists of exons, domains, genes, etc by position and name to facilitate comparison + """ + if not isinstance(data, dict): + if isinstance(data, list): + items = [sort_elements(e) for e in data] + + if all(isinstance(elem, dict) for elem in data): + return sorted( + items, key=lambda elem: (elem.get('start'), elem.get('end'), elem.get('name')) + ) + return items + else: + return data + + for key, value in data.items(): + data[key] = sort_elements(value) + return data + + +@pytest.mark.parametrize( + 'filename,expected_file,input_type', + [ + ['K02718.1.gff3', 'K02718.1.gff3.json', 'gff3'], + ['K02718.1.gtf', 'K02718.1.gtf.json', 'gtf'], + ['Homo_sapiens.GRCh38.kras.gff3', 'Homo_sapiens.GRCh38.kras.gff3.json', 'gff3'], + ['Homo_sapiens.GRCh38.kras.gtf', 'Homo_sapiens.GRCh38.kras.gtf.json', 'gtf'], + ['example_genes.v2.json', 'example_genes.v3.json', 'v2-json'], + ], +) +def test_gff_examples(filename, expected_file, input_type): + data_dir = os.path.join(os.path.dirname(__file__), 'data') + input_file = os.path.join(data_dir, filename) + with open(os.path.join(data_dir, 
expected_file), 'r') as fh: + expected = json.load(fh) + + # order doesn't matter + data = sort_elements(CONVERTERS[input_type](input_file)) + expected = sort_elements(expected) + + assert len(data['genes']) == len(expected['genes']) + assert data == expected From 6368996d9e093b781784be3db684ae2c970cf58f Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Sat, 5 Feb 2022 20:56:14 -0800 Subject: [PATCH 111/137] Support polycistronic transcripts - Nest translations under transcripts in annotations json instead of expecting one cds per transcript - To support HPV genes resolves: #302 --- src/mavis/annotate/annotations_schema.json | 159 ++++++++++++++------- src/mavis/annotate/file_io.py | 117 +++++++++------ 2 files changed, 179 insertions(+), 97 deletions(-) diff --git a/src/mavis/annotate/annotations_schema.json b/src/mavis/annotate/annotations_schema.json index 83f1b501..85adb748 100644 --- a/src/mavis/annotate/annotations_schema.json +++ b/src/mavis/annotate/annotations_schema.json @@ -62,69 +62,32 @@ }, "type": "array" }, - "cdna_coding_end": { - "default": null, - "minimum": 1, - "type": [ - "integer", - "null" - ] - }, - "cdna_coding_start": { - "default": null, - "minimum": 1, - "type": [ - "integer", - "null" - ] - }, - "domains": { - "default": [ - ], - "items": { - "additionalProperties": true, - "properties": { - "name": { - "minLength": 1, - "type": "string" - }, - "regions": { - "minItems": 1, - "properties": { - "end": { - "minimum": 1, - "type": "integer" - }, - "start": { - "minimum": 1, - "type": "integer" - } - }, - "type": "array" - } - }, - "required": [ - "name", - "regions" - ], - "type": "object" - }, - "type": "array" - }, "end": { "minimum": 1, "type": "integer" }, "exons": { - "defualt": [ + "default": [ ], "items": { "additionalProperties": true, "properties": { + "aliases": { + "default": [ + ], + "items": { + "minLength": 1, + "type": "string" + }, + "type": "array" + }, "end": { "minimum": 1, "type": "integer" }, + "name": { + "type": 
"string" + }, "start": { "minimum": 1, "type": "integer" @@ -149,6 +112,102 @@ "start": { "minimum": 1, "type": "integer" + }, + "translations": { + "default": [ + ], + "items": { + "additionalProperties": true, + "anyOf": [ + { + "required": [ + "start", + "end" + ] + }, + { + "required": [ + "cdna_coding_end", + "cdna_coding_start" + ] + } + ], + "properties": { + "aliases": { + "default": [ + ], + "items": { + "minLength": 1, + "type": "string" + }, + "type": "array" + }, + "cdna_coding_end": { + "description": "coding end position relative to its parent transcript sequence", + "minimum": 1, + "type": "integer" + }, + "cdna_coding_start": { + "description": "coding start position relative to its parent transcript sequence", + "minimum": 1, + "type": "integer" + }, + "domains": { + "default": [ + ], + "items": { + "additionalProperties": true, + "properties": { + "desc": { + "type": "string" + }, + "name": { + "minLength": 1, + "type": "string" + }, + "regions": { + "minItems": 1, + "properties": { + "end": { + "description": "end of the protein domain region in AA coordinates", + "minimum": 1, + "type": "integer" + }, + "start": { + "description": "start of the protein domain region in AA coordinates", + "minimum": 1, + "type": "integer" + } + }, + "type": "array" + } + }, + "required": [ + "name", + "regions" + ], + "type": "object" + }, + "type": "array" + }, + "end": { + "description": "coding end position in genomic coordinates", + "minimum": 1, + "type": "integer" + }, + "name": { + "minLength": 1, + "type": "string" + }, + "start": { + "description": "coding start position in genomic coordinates", + "minimum": 1, + "type": "integer" + } + }, + "type": "object" + }, + "type": "array" } }, "required": [ diff --git a/src/mavis/annotate/file_io.py b/src/mavis/annotate/file_io.py index 4b6ba264..a96200c9 100644 --- a/src/mavis/annotate/file_io.py +++ b/src/mavis/annotate/file_io.py @@ -10,7 +10,7 @@ from Bio import SeqIO from snakemake.utils import validate 
as snakemake_validate -from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, translate +from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, STRAND, translate from ..interval import Interval from ..types import ReferenceGenome from ..util import logger @@ -127,10 +127,13 @@ def parse_annotations_json( has_best = False for transcript in gene_dict['transcripts']: - transcript.setdefault('exons', []) - exons = [Exon(strand=gene.strand, **ex) for ex in transcript['exons']] + exons = [] + for ex in transcript.get('exons', []): + exons.append( + Exon(strand=gene.strand, start=ex['start'], end=ex['end'], name=ex.get('name')) + ) if not exons: - exons = [(transcript['start'], transcript['end'])] + exons = [Exon(transcript['start'], transcript['end'], strand=gene.strand)] pre_transcript = PreTranscript( name=transcript['name'], gene=gene, @@ -148,55 +151,75 @@ def parse_annotations_json( spl_tx = Transcript(pre_transcript, spl_patt) pre_transcript.spliced_transcripts.append(spl_tx) - if ( - transcript.get('cdna_coding_end', None) is None - or transcript.get('cdna_coding_start', None) is None - ): - continue - tx_length = transcript['cdna_coding_end'] - transcript['cdna_coding_start'] + 1 - # check that the translation makes sense before including it - if tx_length % CODON_SIZE != 0: - logger.warning( - 'Ignoring translation. 
The translated region is not a multiple of three' - ) - continue - tx_length = tx_length // CODON_SIZE - domains = [] - for dom in transcript['domains']: + for translation in transcript.get('translations', []): try: - regions = [Interval(r['start'], r['end']) for r in dom['regions']] - regions = Interval.min_nonoverlapping(*regions) - for region in regions: - if region.start < 1 or region.end > tx_length: - raise AssertionError( - 'region cannot be outside the translated length' - ) - domains.append( - Domain( - name=dom['name'], - data={'desc': dom.get('desc', None)}, - regions=regions, + if 'cdna_coding_end' not in translation: + translation['cdna_coding_end'] = spl_tx.convert_genomic_to_cdna( + translation['end'] + ) + if 'cdna_coding_start' not in translation: + translation['cdna_coding_start'] = spl_tx.convert_genomic_to_cdna( + translation['start'] ) + except IndexError as err: + raise IndexError( + f'Invalid specification of CDS ({translation["name"]}: {translation["start"]}-{translation["end"]}) ' + f'region on transcript ({transcript["name"]}: {transcript["start"]}-{transcript["end"]}): {err}' ) - except AssertionError as err: - logger.warning(repr(err)) - translation = Translation( - transcript['cdna_coding_start'], - transcript['cdna_coding_end'], - transcript=spl_tx, - domains=domains, - ) - if reference_genome and gene.chr in reference_genome: - # get the sequence near here to see why these are wrong? 
- seq = pre_transcript.get_cdna_seq(spl_tx.splicing_pattern, reference_genome) - met = seq[translation.start - 1 : translation.start + 2] - stop = seq[translation.end - CODON_SIZE : translation.end] - if translate(met) != START_AA or translate(stop) != STOP_AA: + + if gene.strand == STRAND.NEG: + translation['cdna_coding_start'], translation['cdna_coding_end'] = ( + translation['cdna_coding_end'], + translation['cdna_coding_start'], + ) + + tx_length = ( + translation['cdna_coding_end'] - translation['cdna_coding_start'] + 1 + ) + # check that the translation makes sense before including it + if tx_length % CODON_SIZE != 0: logger.warning( - 'Sequence error. The sequence computed from the reference does look like a valid translation' + f'Ignoring translation ({translation.get("name")}). The translated region is not a multiple of three (length={tx_length})' ) continue - spl_tx.translations.append(translation) + tx_length = tx_length // CODON_SIZE + domains = [] + for dom in translation.get('domains', []): + try: + regions = [Interval(r['start'], r['end']) for r in dom['regions']] + regions = Interval.min_nonoverlapping(*regions) + for region in regions: + if region.start < 1 or region.end > tx_length: + raise AssertionError( + 'region cannot be outside the translated length' + ) + domains.append( + Domain( + name=dom['name'], + data={'desc': dom.get('desc', None)}, + regions=regions, + ) + ) + except AssertionError as err: + logger.warning(repr(err)) + translation = Translation( + translation['cdna_coding_start'], + translation['cdna_coding_end'], + transcript=spl_tx, + domains=domains, + name=translation.get('name'), + ) + if reference_genome and gene.chr in reference_genome: + # get the sequence near here to see why these are wrong? 
+ seq = pre_transcript.get_cdna_seq(spl_tx.splicing_pattern, reference_genome) + met = seq[translation.start - 1 : translation.start + 2] + stop = seq[translation.end - CODON_SIZE : translation.end] + if translate(met) != START_AA or translate(stop) != STOP_AA: + logger.warning( + 'Sequence error. The sequence computed from the reference does look like a valid translation' + ) + continue + spl_tx.translations.append(translation) if not best_transcripts_only or has_best: genes_by_chr.setdefault(gene.chr, []).append(gene) return genes_by_chr From 81abded25f3c9e068d650b5805138fef40d2e073 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Sat, 5 Feb 2022 20:58:31 -0800 Subject: [PATCH 112/137] remove debugging code --- src/tools/convert_annotations_format.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/tools/convert_annotations_format.py b/src/tools/convert_annotations_format.py index 28ca9dac..a01176f0 100644 --- a/src/tools/convert_annotations_format.py +++ b/src/tools/convert_annotations_format.py @@ -2,7 +2,6 @@ import json import logging import re -import traceback from typing import Dict, Tuple import pandas as pd @@ -315,13 +314,6 @@ def split_col_into_rows(df, col, delimiter=',', new_col=None): return new_df.merge(s, left_index=True, right_index=True) -def print_marker(df, links_df=None): - stack = traceback.extract_stack(limit=2)[0] - print(f'{stack.filename}:{stack.lineno} {stack.name}') - print(df.shape, links_df.shape if links_df is not None else '') - print(df.groupby(['type']).agg({'feature_id': 'count', 'feature_id': 'unique'}).reset_index()) - - def fix_dangling_parent_reference(nodes_df, links_df): """ Insert a pseudo element for any parents referenced by an element that do not already have their own line/definition From 8ad2848f1872f002431911f61e99b992b85298b8 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Sat, 5 Feb 2022 20:58:54 -0800 Subject: [PATCH 113/137] Improve type annotations and error messages --- 
src/mavis/annotate/file_io.py | 2 +- src/mavis/annotate/genomic.py | 2 +- src/mavis/annotate/protein.py | 16 ++++++++-------- src/mavis/annotate/splicing.py | 23 ++--------------------- 4 files changed, 12 insertions(+), 31 deletions(-) diff --git a/src/mavis/annotate/file_io.py b/src/mavis/annotate/file_io.py index a96200c9..ec0c5f48 100644 --- a/src/mavis/annotate/file_io.py +++ b/src/mavis/annotate/file_io.py @@ -346,7 +346,7 @@ def __init__( Args: *filepaths (str): list of paths to load file_type (str): Type of file to load - eager_load (bool=False): load the files immeadiately + eager_load (bool=False): load the files immediately assert_exists (bool=False): check that all files exist **opt: key word arguments to be passed to the load function and used as part of the file cache key diff --git a/src/mavis/annotate/genomic.py b/src/mavis/annotate/genomic.py index 22f6831d..5f2544c1 100644 --- a/src/mavis/annotate/genomic.py +++ b/src/mavis/annotate/genomic.py @@ -599,7 +599,7 @@ class Transcript(BioInterval): def __init__( self, pre_transcript: PreTranscript, - splicing_patt: List[int], + splicing_patt: List[SpliceSite], seq: Optional[str] = None, translations: Optional[List[Translation]] = None, ): diff --git a/src/mavis/annotate/protein.py b/src/mavis/annotate/protein.py index 50d31f33..2468953f 100644 --- a/src/mavis/annotate/protein.py +++ b/src/mavis/annotate/protein.py @@ -256,18 +256,18 @@ def __init__( end: int, transcript: Optional['Transcript'] = None, domains: Optional[List[Domain]] = None, - seq=None, - name=None, + seq: Optional[str] = None, + name: Optional[str] = None, ): """ describes the splicing pattern and cds start and end with reference to a particular transcript Args: - start (int): start of the coding sequence (cds) relative to the start of the first exon in the transcript - end (int): end of the coding sequence (cds) relative to the start of the first exon in the transcript - transcript (Transcript): the transcript this is a Translation of 
- domains (List[Domain]): a list of the domains on this translation - sequence (str): the cds sequence + start: start of the coding sequence (cds) relative to the start of the first exon in the transcript + end: end of the coding sequence (cds) relative to the start of the first exon in the transcript + transcript: the transcript this is a Translation of + domains: a list of the domains on this translation + sequence: the cds sequence """ domains = [] if domains is None else domains BioInterval.__init__( @@ -279,7 +279,7 @@ def __init__( raise AttributeError('start must be a positive integer', start) if transcript and end > len(transcript): raise AttributeError( - 'translation cannot be outside of related transcript range', end, len(transcript) + f'translation ({self.name}) cannot be outside of related transcript range ({end} > {len(transcript)})' ) for domain in domains: diff --git a/src/mavis/annotate/splicing.py b/src/mavis/annotate/splicing.py index 9de8ce24..08fcd0d5 100644 --- a/src/mavis/annotate/splicing.py +++ b/src/mavis/annotate/splicing.py @@ -16,9 +16,9 @@ def __str__(self): temp = [] for site in self: temp.append( - '{}{}{}'.format( - 'D' if site.type == SPLICE_SITE_TYPE.DONOR else 'A', + '{}:{}{}'.format( site.pos, + 'D' if site.type == SPLICE_SITE_TYPE.DONOR else 'A', '' if site.intact else '*', ) ) @@ -161,25 +161,6 @@ def __init__( def __or__(self, other): return Interval.__or__(self, other) - def __repr__(self): - cls = self.__class__.__name__ - refname = self.reference_object - try: - refname = self.reference_object.name - except AttributeError: - pass - seq = '' if not self.seq else ', seq=' + self.seq - return '{}(type={}, {}:{}({}-{}){}, strand={})'.format( - cls, - SPLICE_SITE_TYPE.reverse(self.type), - refname, - self.pos, - self.start, - self.end, - seq, - self.get_strand(), - ) - def predict_splice_sites(input_sequence: str, is_reverse: bool = False) -> List[SpliceSite]: """ From 7e41fe38e07eadd42ef94e11cbe19b4124683880 Mon Sep 17 
00:00:00 2001 From: Caralyn Reisle Date: Sat, 5 Feb 2022 21:06:10 -0800 Subject: [PATCH 114/137] Update docs --- docs/migrating.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/migrating.md b/docs/migrating.md index db9c76b2..29231381 100644 --- a/docs/migrating.md +++ b/docs/migrating.md @@ -22,11 +22,20 @@ MAVIS is now integrated with snakemake instead of handling its own scheduling ## Reference Annotation Files -MAVIS no longer supports the previously deprecated tab-delimited format of the annotations file. If you are still using these files in your project we have provided a script to automatically convert them to the newer format in the tools directory +MAVIS no longer supports the previously deprecated tab-delimited format of the annotations file. If you are still using these files in your project we have provided a script to automatically convert them to the newer format in the tools directory. ```bash python src/tools/convert_annotations_format.py \ /path/to/tab/file.tab \ - --input_type v2 \ + --input_type v2-tab \ + /path/to/new/json/file.json +``` + +In v3 the JSON files are slightly different to support multiple translations per transcript. 
Your old v2 JSON files can be automatically converted to the new format with the same script + +```bash +python src/tools/convert_annotations_format.py \ + /path/to/json/file.json \ + --input_type v2-json \ /path/to/new/json/file.json ``` From bc4c0f8f72b1e2278f17b4a9225f3c846a728979 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Sat, 5 Feb 2022 21:06:35 -0800 Subject: [PATCH 115/137] Fix merge conflict --- src/mavis/annotate/file_io.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/mavis/annotate/file_io.py b/src/mavis/annotate/file_io.py index 3e889870..bcd0db0f 100644 --- a/src/mavis/annotate/file_io.py +++ b/src/mavis/annotate/file_io.py @@ -344,17 +344,10 @@ def __init__( ): """ Args: -<<<<<<< HEAD - *filepaths (str): list of paths to load - file_type (str): Type of file to load - eager_load (bool=False): load the files immediately - assert_exists (bool=False): check that all files exist -======= *filepaths: list of paths to load file_type: Type of file to load - eager_load: load the files immeadiately + eager_load: load the files immediately assert_exists: check that all files exist ->>>>>>> develop_v3 **opt: key word arguments to be passed to the load function and used as part of the file cache key Raises From 5e36b2d96d2b369ebd9c2188762bdbf37538b042 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 8 Feb 2022 20:22:57 -0800 Subject: [PATCH 116/137] Add tests for tab conversion --- .github/workflows/build.yml | 1 + setup.cfg | 5 +- .../data/ensembl69_hg19_annotations.kras.tab | 9 + .../ensembl69_hg19_annotations.kras.tab.json | 466 + tests/tools/data/example_genes.v3.json | 7855 ++++++++++++++++- .../tools/test_convert_annotations_format.py | 7 + 6 files changed, 8339 insertions(+), 4 deletions(-) create mode 100644 tests/tools/data/ensembl69_hg19_annotations.kras.tab create mode 100644 tests/tools/data/ensembl69_hg19_annotations.kras.tab.json diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 
b5809403..83ccec48 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -51,6 +51,7 @@ jobs: pytest tests -v \ --junitxml=junit/test-results-${{ matrix.python-version }}.xml \ --cov mavis \ + --cov tools.convert_annotations_format \ --cov-report term-missing \ --cov-report xml \ --durations=10 \ diff --git a/setup.cfg b/setup.cfg index d0a4934c..371d7e35 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,9 +23,7 @@ ignore = E501 statistics = True [flake8] -ignore = E501 - W503 - E203 +ignore = E501,W503,E203 [options] packages = find: @@ -79,6 +77,7 @@ dev = pycodestyle>=2.3.1 pytest pytest-cov + pytest-xdist mkdocs==1.1.2 markdown-refdocs mkdocs-material==5.4.0 diff --git a/tests/tools/data/ensembl69_hg19_annotations.kras.tab b/tests/tools/data/ensembl69_hg19_annotations.kras.tab new file mode 100644 index 00000000..32a4113f --- /dev/null +++ b/tests/tools/data/ensembl69_hg19_annotations.kras.tab @@ -0,0 +1,9 @@ +## input file used to map hugo gene names: compiled_gene_drug_pathway.v1_2_5.tsv +## input file for picking best transcript: ens69_best_transcript.txt +## Ensembl Api version 69 +## generated at: Thu Aug 4 16:38:01 2016 +#ensembl_gene_id hugo_names chr strand gene_start gene_end best_ensembl_transcript_id ensembl_transcript_id refseq_equivalents transcript_genomic_start transcript_genomic_end cdna_coding_start cdna_coding_end genomic_exon_ranges AA_domain_ranges +ENSG00000133703 KRAS 12 -1 25357723 25403870 ENST00000311936 ENST00000311936 NP_004976.2;NM_004985.3 25357723 25403865 193 759 25403685-25403865;25398208-25398329;25380168-25380346;25378548-25378707;25357723-25362845 PR00449:4-25,27-43,44-66,107-120,141-163;PF00025:3-162;SM00173:1-166;PF00009:45-163;PF08477:5-119;PS50318:165-184;SSF52540:3-184;TIGR00231:1-159;SM00175:4-166;PF00071:5-164;SM00174:6-166 +ENSG00000133703 KRAS 12 -1 25357723 25403870 ENST00000311936 ENST00000557334 25362102 25403870 198 425 25403685-25403870;25398208-25398329;25362102-25362845 
PR00449:4-25,27-43;PS50318:52-71;SM00173:1-53;PF00071:5-44;SSF52540:3-37 +ENSG00000133703 KRAS 12 -1 25357723 25403870 ENST00000311936 ENST00000256078 NP_203524.1;NM_033360.2 25362365 25403737 65 634 25403685-25403737;25398208-25398329;25380168-25380346;25378548-25378707;25368371-25368494;25362365-25362845 SM00175:4-166;PF00071:5-164;SSF52540:3-185;SM00176:9-189;TIGR00231:1-159;SM00174:6-166;PR00449:4-25,27-43,44-66,107-120,141-163;PF00025:3-161;PF08477:5-119;PF00009:45-162;SM00173:1-166 +ENSG00000133703 KRAS 12 -1 25357723 25403870 ENST00000311936 ENST00000556131 25386753 25403863 178 309 25403698-25403863;25398208-25398329;25386753-25388160 PR00449:4-25,27-43;PF00071:5-37;SSF52540:3-38 diff --git a/tests/tools/data/ensembl69_hg19_annotations.kras.tab.json b/tests/tools/data/ensembl69_hg19_annotations.kras.tab.json new file mode 100644 index 00000000..eeaab2bb --- /dev/null +++ b/tests/tools/data/ensembl69_hg19_annotations.kras.tab.json @@ -0,0 +1,466 @@ +{ + "genes": [ + { + "aliases": [ + "KRAS" + ], + "chr": "12", + "end": 25403870, + "name": "ENSG00000133703", + "start": 25357723, + "strand": "-", + "transcripts": [ + { + "aliases": [ + ], + "end": 25403865, + "exons": [ + { + "end": 25403865, + "start": 25403685 + }, + { + "end": 25398329, + "start": 25398208 + }, + { + "end": 25380346, + "start": 25380168 + }, + { + "end": 25378707, + "start": 25378548 + }, + { + "end": 25362845, + "start": 25357723 + } + ], + "is_best_transcript": true, + "name": "ENST00000311936", + "start": 25357723, + "translations": [ + { + "cdna_coding_end": 759, + "cdna_coding_start": 193, + "domains": [ + { + "name": "PR00449", + "regions": [ + { + "end": 25, + "start": 4 + }, + { + "end": 43, + "start": 27 + }, + { + "end": 66, + "start": 44 + }, + { + "end": 120, + "start": 107 + }, + { + "end": 163, + "start": 141 + } + ] + }, + { + "name": "PF00025", + "regions": [ + { + "end": 162, + "start": 3 + } + ] + }, + { + "name": "SM00173", + "regions": [ + { + "end": 166, + "start": 1 + 
} + ] + }, + { + "name": "PF00009", + "regions": [ + { + "end": 163, + "start": 45 + } + ] + }, + { + "name": "PF08477", + "regions": [ + { + "end": 119, + "start": 5 + } + ] + }, + { + "name": "PS50318", + "regions": [ + { + "end": 184, + "start": 165 + } + ] + }, + { + "name": "SSF52540", + "regions": [ + { + "end": 184, + "start": 3 + } + ] + }, + { + "name": "TIGR00231", + "regions": [ + { + "end": 159, + "start": 1 + } + ] + }, + { + "name": "SM00175", + "regions": [ + { + "end": 166, + "start": 4 + } + ] + }, + { + "name": "PF00071", + "regions": [ + { + "end": 164, + "start": 5 + } + ] + }, + { + "name": "SM00174", + "regions": [ + { + "end": 166, + "start": 6 + } + ] + } + ] + } + ] + }, + { + "aliases": [ + ], + "end": 25403870, + "exons": [ + { + "end": 25403870, + "start": 25403685 + }, + { + "end": 25398329, + "start": 25398208 + }, + { + "end": 25362845, + "start": 25362102 + } + ], + "is_best_transcript": false, + "name": "ENST00000557334", + "start": 25362102, + "translations": [ + { + "cdna_coding_end": 425, + "cdna_coding_start": 198, + "domains": [ + { + "name": "PR00449", + "regions": [ + { + "end": 25, + "start": 4 + }, + { + "end": 43, + "start": 27 + } + ] + }, + { + "name": "PS50318", + "regions": [ + { + "end": 71, + "start": 52 + } + ] + }, + { + "name": "SM00173", + "regions": [ + { + "end": 53, + "start": 1 + } + ] + }, + { + "name": "PF00071", + "regions": [ + { + "end": 44, + "start": 5 + } + ] + }, + { + "name": "SSF52540", + "regions": [ + { + "end": 37, + "start": 3 + } + ] + } + ] + } + ] + }, + { + "aliases": [ + ], + "end": 25403737, + "exons": [ + { + "end": 25403737, + "start": 25403685 + }, + { + "end": 25398329, + "start": 25398208 + }, + { + "end": 25380346, + "start": 25380168 + }, + { + "end": 25378707, + "start": 25378548 + }, + { + "end": 25368494, + "start": 25368371 + }, + { + "end": 25362845, + "start": 25362365 + } + ], + "is_best_transcript": false, + "name": "ENST00000256078", + "start": 25362365, + "translations": 
[ + { + "cdna_coding_end": 634, + "cdna_coding_start": 65, + "domains": [ + { + "name": "SM00175", + "regions": [ + { + "end": 166, + "start": 4 + } + ] + }, + { + "name": "PF00071", + "regions": [ + { + "end": 164, + "start": 5 + } + ] + }, + { + "name": "SSF52540", + "regions": [ + { + "end": 185, + "start": 3 + } + ] + }, + { + "name": "SM00176", + "regions": [ + { + "end": 189, + "start": 9 + } + ] + }, + { + "name": "TIGR00231", + "regions": [ + { + "end": 159, + "start": 1 + } + ] + }, + { + "name": "SM00174", + "regions": [ + { + "end": 166, + "start": 6 + } + ] + }, + { + "name": "PR00449", + "regions": [ + { + "end": 25, + "start": 4 + }, + { + "end": 43, + "start": 27 + }, + { + "end": 66, + "start": 44 + }, + { + "end": 120, + "start": 107 + }, + { + "end": 163, + "start": 141 + } + ] + }, + { + "name": "PF00025", + "regions": [ + { + "end": 161, + "start": 3 + } + ] + }, + { + "name": "PF08477", + "regions": [ + { + "end": 119, + "start": 5 + } + ] + }, + { + "name": "PF00009", + "regions": [ + { + "end": 162, + "start": 45 + } + ] + }, + { + "name": "SM00173", + "regions": [ + { + "end": 166, + "start": 1 + } + ] + } + ] + } + ] + }, + { + "aliases": [ + ], + "end": 25403863, + "exons": [ + { + "end": 25403863, + "start": 25403698 + }, + { + "end": 25398329, + "start": 25398208 + }, + { + "end": 25388160, + "start": 25386753 + } + ], + "is_best_transcript": false, + "name": "ENST00000556131", + "start": 25386753, + "translations": [ + { + "cdna_coding_end": 309, + "cdna_coding_start": 178, + "domains": [ + { + "name": "PR00449", + "regions": [ + { + "end": 25, + "start": 4 + }, + { + "end": 43, + "start": 27 + } + ] + }, + { + "name": "PF00071", + "regions": [ + { + "end": 37, + "start": 5 + } + ] + }, + { + "name": "SSF52540", + "regions": [ + { + "end": 38, + "start": 3 + } + ] + } + ] + } + ] + } + ] + } + ] +} diff --git a/tests/tools/data/example_genes.v3.json b/tests/tools/data/example_genes.v3.json index 6a590488..7f77a887 100644 --- 
a/tests/tools/data/example_genes.v3.json +++ b/tests/tools/data/example_genes.v3.json @@ -1 +1,7854 @@ -{"genes": [{"aliases": ["EGFR"], "chr": "7", "end": 55324313, "name": "ENSG00000146648", "start": 55086714, "strand": "+", "transcripts": [{"end": 55270769, "exons": [{"end": 55087058, "start": 55086714}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55238906, "start": 55238868}, {"end": 55240817, "start": 55240676}, {"end": 55241736, "start": 55241614}, {"end": 55242513, "start": 55242415}, {"end": 55249171, "start": 55248986}, {"end": 55259567, "start": 55259412}, {"end": 55260534, "start": 55260459}, {"end": 55266556, "start": 55266410}, {"end": 55268106, "start": 55268009}, {"end": 55269048, "start": 55268881}, {"end": 55269475, "start": 55269428}, {"end": 55270769, "start": 55270210}], "is_best_transcript": false, "name": "ENST00000455089", "start": 55086714, "translations": [{"cdna_coding_end": 3533, "cdna_coding_start": 258, "domains": [{"name": "PIRSF000619", "regions": [{"end": 1090, "start": 1}]}, {"name": "PF07714", "regions": [{"end": 920, "start": 669}]}, {"name": "SSF52058", "regions": [{"end": 191, "start": 28}, {"end": 475, "start": 283}]}, {"name": "PF00757", "regions": [{"end": 293, "start": 141}]}, {"name": "PS50011", "regions": [{"end": 934, "start": 667}]}, {"name": "PS50311", "regions": [{"end": 219, "start": 145}]}, {"name": "SSF57184", "regions": [{"end": 290, "start": 142}, {"end": 593, "start": 460}]}, {"name": "PR00109", "regions": [{"end": 758, "start": 745}, {"end": 800, "start": 782}, 
{"end": 841, "start": 831}, {"end": 872, "start": 850}, {"end": 916, "start": 894}]}, {"name": "SSF56112", "regions": [{"end": 975, "start": 651}]}, {"name": "PF01030", "regions": [{"end": 141, "start": 57}, {"end": 435, "start": 316}]}, {"name": "SM00220", "regions": [{"end": 924, "start": 667}]}, {"name": "SM00261", "regions": [{"end": 225, "start": 183}, {"end": 502, "start": 451}, {"end": 556, "start": 507}]}, {"name": "SM00219", "regions": [{"end": 923, "start": 667}]}, {"name": "PF00069", "regions": [{"end": 919, "start": 667}]}]}]}, {"end": 55236328, "exons": [{"end": 55087058, "start": 55086725}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55236328, "start": 55236216}], "is_best_transcript": false, "name": "ENST00000342916", "start": 55086725, "translations": [{"cdna_coding_end": 2133, "cdna_coding_start": 247, "domains": [{"name": "PS50311", "regions": [{"end": 264, "start": 187}]}, {"name": "SSF57184", "regions": [{"end": 339, "start": 182}, {"end": 624, "start": 505}]}, {"name": "SSF52058", "regions": [{"end": 211, "start": 29}, {"end": 520, "start": 328}]}, {"name": "PF00757", "regions": [{"end": 338, "start": 185}]}, {"name": "SM00261", "regions": [{"end": 270, "start": 228}, {"end": 547, "start": 496}, {"end": 601, "start": 552}]}, {"name": "PF01030", "regions": [{"end": 167, "start": 57}, {"end": 480, "start": 361}]}]}]}, {"end": 55238738, "exons": [{"end": 55087058, "start": 55086726}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, 
{"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55238738, "start": 55238000}], "is_best_transcript": false, "name": "ENST00000344576", "start": 55086726, "translations": [{"cdna_coding_end": 2363, "cdna_coding_start": 246, "domains": [{"name": "SSF57184", "regions": [{"end": 339, "start": 182}, {"end": 624, "start": 505}]}, {"name": "PS50311", "regions": [{"end": 264, "start": 187}]}, {"name": "PF00757", "regions": [{"end": 338, "start": 185}]}, {"name": "SSF52058", "regions": [{"end": 211, "start": 29}, {"end": 520, "start": 328}]}, {"name": "SM00261", "regions": [{"end": 270, "start": 228}, {"end": 547, "start": 496}, {"end": 601, "start": 552}]}, {"name": "PF01030", "regions": [{"end": 167, "start": 57}, {"end": 480, "start": 361}]}]}]}, {"end": 55224644, "exons": [{"end": 55087058, "start": 55086727}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224644, "start": 55224452}], "is_best_transcript": false, "name": "ENST00000420316", "start": 55086727, "translations": [{"cdna_coding_end": 1462, "cdna_coding_start": 245, "domains": [{"name": "SSF57184", "regions": [{"end": 339, "start": 182}]}, {"name": "PS50311", "regions": [{"end": 264, "start": 187}]}, {"name": "PF00757", "regions": [{"end": 338, "start": 185}]}, {"name": "SSF52058", "regions": [{"end": 211, "start": 29}, {"end": 403, 
"start": 328}]}, {"name": "SM00261", "regions": [{"end": 270, "start": 228}]}, {"name": "PF01030", "regions": [{"end": 167, "start": 57}]}]}]}, {"end": 55279321, "exons": [{"end": 55087058, "start": 55086794}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55238906, "start": 55238868}, {"end": 55240817, "start": 55240676}, {"end": 55241736, "start": 55241614}, {"end": 55242513, "start": 55242415}, {"end": 55249171, "start": 55248986}, {"end": 55259567, "start": 55259412}, {"end": 55260534, "start": 55260459}, {"end": 55266556, "start": 55266410}, {"end": 55268106, "start": 55268009}, {"end": 55269048, "start": 55268881}, {"end": 55269475, "start": 55269428}, {"end": 55270318, "start": 55270210}, {"end": 55279321, "start": 55272949}], "is_best_transcript": true, "name": "ENST00000275493", "start": 55086794, "translations": [{"cdna_coding_end": 3810, "cdna_coding_start": 178, "domains": [{"name": "SM00220", "regions": [{"end": 969, "start": 712}]}, {"name": "PF01030", "regions": [{"end": 167, "start": 57}, {"end": 480, "start": 361}]}, {"name": "SSF56112", "regions": [{"end": 1020, "start": 696}]}, {"name": "PF00069", "regions": [{"end": 964, "start": 712}]}, {"name": "SM00219", "regions": [{"end": 968, "start": 712}]}, {"name": "SM00261", "regions": [{"end": 270, "start": 228}, {"end": 547, "start": 496}, {"end": 601, "start": 552}]}, {"name": "PF00757", "regions": [{"end": 338, "start": 185}]}, {"name": "SSF52058", "regions": [{"end": 211, "start": 29}, {"end": 520, "start": 
328}]}, {"name": "PF07714", "regions": [{"end": 965, "start": 714}]}, {"name": "PIRSF000619", "regions": [{"end": 1210, "start": 1}]}, {"name": "PR00109", "regions": [{"end": 803, "start": 790}, {"end": 845, "start": 827}, {"end": 886, "start": 876}, {"end": 917, "start": 895}, {"end": 961, "start": 939}]}, {"name": "SSF57184", "regions": [{"end": 339, "start": 182}, {"end": 638, "start": 505}]}, {"name": "PS50311", "regions": [{"end": 264, "start": 187}]}, {"name": "PS50011", "regions": [{"end": 979, "start": 712}]}]}]}, {"end": 55324313, "exons": [{"end": 55087058, "start": 55086811}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55238906, "start": 55238868}, {"end": 55240621, "start": 55240539}, {"end": 55324313, "start": 55323947}], "is_best_transcript": false, "name": "ENST00000442591", "start": 55086811, "translations": [{"cdna_coding_end": 2134, "cdna_coding_start": 161, "domains": [{"name": "PF01030", "regions": [{"end": 167, "start": 57}, {"end": 480, "start": 361}]}, {"name": "SM00261", "regions": [{"end": 270, "start": 228}, {"end": 547, "start": 496}, {"end": 601, "start": 552}, {"end": 653, "start": 614}]}, {"name": "SSF52058", "regions": [{"end": 211, "start": 29}, {"end": 520, "start": 328}]}, {"name": "PF00757", "regions": [{"end": 338, "start": 185}]}, {"name": "PS50311", "regions": [{"end": 264, "start": 187}]}, {"name": "SSF57184", "regions": [{"end": 339, "start": 182}, {"end": 638, "start": 505}]}]}]}, {"end": 55214417, "exons": [{"end": 55177651, "start": 
55177416}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214417, "start": 55214299}], "is_best_transcript": false, "name": "ENST00000450046", "start": 55177416, "translations": [{"cdna_coding_end": 691, "cdna_coding_start": 308, "domains": [{"name": "SSF52058", "regions": [{"end": 127, "start": 1}]}, {"name": "PF01030", "regions": [{"end": 114, "start": 4}]}]}]}, {"end": 55273591, "exons": [{"end": 55177651, "start": 55177540}, {"end": 55210130, "start": 55209979}, {"end": 55211181, "start": 55210998}, {"end": 55214433, "start": 55214299}, {"end": 55219055, "start": 55218987}, {"end": 55220357, "start": 55220239}, {"end": 55221845, "start": 55221704}, {"end": 55223639, "start": 55223523}, {"end": 55224352, "start": 55224226}, {"end": 55224525, "start": 55224452}, {"end": 55225446, "start": 55225356}, {"end": 55228031, "start": 55227832}, {"end": 55229324, "start": 55229192}, {"end": 55231516, "start": 55231426}, {"end": 55233130, "start": 55232973}, {"end": 55238906, "start": 55238868}, {"end": 55240817, "start": 55240676}, {"end": 55241736, "start": 55241614}, {"end": 55242513, "start": 55242415}, {"end": 55249171, "start": 55248986}, {"end": 55259567, "start": 55259412}, {"end": 55260534, "start": 55260459}, {"end": 55266556, "start": 55266410}, {"end": 55268106, "start": 55268009}, {"end": 55269048, "start": 55268881}, {"end": 55269475, "start": 55269428}, {"end": 55270318, "start": 55270210}, {"end": 55273591, "start": 55272949}], "is_best_transcript": false, "name": "ENST00000454757", "start": 55177540, "translations": [{"cdna_coding_end": 3657, "cdna_coding_start": 184, "domains": [{"name": "SM00261", "regions": [{"end": 217, "start": 175}, {"end": 494, "start": 443}, {"end": 548, "start": 499}]}, {"name": "PF00069", "regions": [{"end": 911, "start": 659}]}, {"name": "SM00219", "regions": [{"end": 915, "start": 659}]}, {"name": "SSF56112", "regions": [{"end": 967, "start": 643}]}, {"name": "SM00220", "regions": [{"end": 
916, "start": 659}]}, {"name": "PF01030", "regions": [{"end": 114, "start": 4}, {"end": 427, "start": 308}]}, {"name": "PS50311", "regions": [{"end": 211, "start": 134}]}, {"name": "PS50011", "regions": [{"end": 926, "start": 659}]}, {"name": "PR00109", "regions": [{"end": 750, "start": 737}, {"end": 792, "start": 774}, {"end": 833, "start": 823}, {"end": 864, "start": 842}, {"end": 908, "start": 886}]}, {"name": "SSF57184", "regions": [{"end": 286, "start": 129}, {"end": 585, "start": 452}]}, {"name": "PIRSF000619", "regions": [{"end": 1157, "start": 1}]}, {"name": "PF07714", "regions": [{"end": 912, "start": 661}]}, {"name": "SSF52058", "regions": [{"end": 158, "start": 1}, {"end": 467, "start": 275}]}, {"name": "PF00757", "regions": [{"end": 285, "start": 132}]}]}]}]}, {"aliases": ["DSTYK"], "chr": "1", "end": 205180727, "name": "ENSG00000133059", "start": 205111632, "strand": "-", "transcripts": [{"end": 205180727, "exons": [{"end": 205116873, "start": 205111632}, {"end": 205117467, "start": 205117333}, {"end": 205119898, "start": 205119808}, {"end": 205133083, "start": 205133055}, {"end": 205138960, "start": 205138291}, {"end": 205156934, "start": 205156546}, {"end": 205180727, "start": 205180399}], "is_best_transcript": false, "name": "ENST00000367160", "start": 205111632, "translations": [{"cdna_coding_end": 65, "cdna_coding_start": 1831, "domains": [{"name": "SM00220", "regions": [{"end": 565, "start": 337}]}, {"name": "SSF56112", "regions": [{"end": 585, "start": 452}]}, {"name": "PF00069", "regions": [{"end": 556, "start": 451}]}, {"name": "PF07714", "regions": [{"end": 558, "start": 471}]}, {"name": "PS50011", "regions": [{"end": 565, "start": 312}]}]}]}, {"end": 205180694, "exons": [{"end": 205116873, "start": 205111633}, {"end": 205119922, "start": 205119808}, {"end": 205126514, "start": 205126401}, {"end": 205128807, "start": 205128675}, {"end": 205129398, "start": 205129242}, {"end": 205130515, "start": 205130386}, {"end": 205131340, "start": 
205131164}, {"end": 205132134, "start": 205132051}, {"end": 205133083, "start": 205132851}, {"end": 205138960, "start": 205138291}, {"end": 205156934, "start": 205156546}, {"end": 205180694, "start": 205180399}], "is_best_transcript": false, "name": "ENST00000367161", "start": 205111633, "translations": [{"cdna_coding_end": 32, "cdna_coding_start": 2686, "domains": [{"name": "PF07714", "regions": [{"end": 820, "start": 654}]}, {"name": "PS50011", "regions": [{"end": 884, "start": 652}]}, {"name": "SSF56112", "regions": [{"end": 853, "start": 627}]}, {"name": "SM00220", "regions": [{"end": 861, "start": 652}]}, {"name": "PF00069", "regions": [{"end": 824, "start": 654}]}, {"name": "SM00219", "regions": [{"end": 861, "start": 652}]}]}]}, {"end": 205180694, "exons": [{"end": 205116873, "start": 205111633}, {"end": 205117467, "start": 205117333}, {"end": 205119922, "start": 205119808}, {"end": 205126514, "start": 205126401}, {"end": 205128807, "start": 205128675}, {"end": 205129398, "start": 205129242}, {"end": 205130515, "start": 205130386}, {"end": 205131340, "start": 205131164}, {"end": 205132134, "start": 205132051}, {"end": 205133083, "start": 205132851}, {"end": 205138960, "start": 205138291}, {"end": 205156934, "start": 205156546}, {"end": 205180694, "start": 205180399}], "is_best_transcript": true, "name": "ENST00000367162", "start": 205111633, "translations": [{"cdna_coding_end": 32, "cdna_coding_start": 2821, "domains": [{"name": "PF07714", "regions": [{"end": 899, "start": 654}]}, {"name": "PS50011", "regions": [{"end": 906, "start": 652}]}, {"name": "SSF56112", "regions": [{"end": 897, "start": 638}]}, {"name": "SM00220", "regions": [{"end": 906, "start": 652}]}, {"name": "SM00219", "regions": [{"end": 906, "start": 652}]}, {"name": "PF00069", "regions": [{"end": 897, "start": 654}]}]}]}]}, {"aliases": ["NDUFA12"], "chr": "12", "end": 95397546, "name": "ENSG00000184752", "start": 95290831, "strand": "-", "transcripts": [{"end": 95397436, "exons": [{"end": 
95291086, "start": 95290831}, {"end": 95318582, "start": 95318422}, {"end": 95322039, "start": 95321793}, {"end": 95396597, "start": 95396515}, {"end": 95397436, "start": 95397371}], "is_best_transcript": false, "name": "ENST00000552205", "start": 95290831}, {"end": 95397476, "exons": [{"end": 95365261, "start": 95365108}, {"end": 95396597, "start": 95396582}, {"end": 95397476, "start": 95397371}], "is_best_transcript": false, "name": "ENST00000547157", "start": 95365108, "translations": [{"cdna_coding_end": 21, "cdna_coding_start": 188}]}, {"end": 95397384, "exons": [{"end": 95365396, "start": 95365109}, {"end": 95388033, "start": 95387946}, {"end": 95390752, "start": 95390680}, {"end": 95396597, "start": 95396515}, {"end": 95397384, "start": 95397371}], "is_best_transcript": false, "name": "ENST00000551991", "start": 95365109, "translations": [{"cdna_coding_end": 1, "cdna_coding_start": 144, "domains": [{"name": "PF05071", "regions": [{"end": 33, "start": 12}]}]}]}, {"end": 95397546, "exons": [{"end": 95365396, "start": 95365109}, {"end": 95388033, "start": 95387946}, {"end": 95396597, "start": 95396515}, {"end": 95397546, "start": 95397371}], "is_best_transcript": true, "name": "ENST00000327772", "start": 95365109, "translations": [{"cdna_coding_end": 91, "cdna_coding_start": 528, "domains": [{"name": "PF05071", "regions": [{"end": 137, "start": 36}]}]}]}, {"end": 95397489, "exons": [{"end": 95365396, "start": 95365112}, {"end": 95396597, "start": 95396515}, {"end": 95397489, "start": 95397371}], "is_best_transcript": false, "name": "ENST00000547986", "start": 95365112, "translations": [{"cdna_coding_end": 34, "cdna_coding_start": 225, "domains": [{"name": "PF05071", "regions": [{"end": 53, "start": 36}]}]}]}, {"end": 95397524, "exons": [{"end": 95365396, "start": 95365254}, {"end": 95366265, "start": 95366171}, {"end": 95388033, "start": 95387946}, {"end": 95396597, "start": 95396515}, {"end": 95397524, "start": 95397371}], "is_best_transcript": false, "name": 
"ENST00000546788", "start": 95365254, "translations": [{"cdna_coding_end": 69, "cdna_coding_start": 368, "domains": [{"name": "PF05071", "regions": [{"end": 87, "start": 36}]}]}]}]}, {"aliases": ["FRMD6"], "chr": "14", "end": 52197445, "name": "ENSG00000139926", "start": 51955818, "strand": "+", "transcripts": [{"end": 52197177, "exons": [{"end": 51956138, "start": 51955855}, {"end": 52037128, "start": 52037066}, {"end": 52156653, "start": 52156409}, {"end": 52164950, "start": 52164860}, {"end": 52167853, "start": 52167774}, {"end": 52169306, "start": 52169230}, {"end": 52171653, "start": 52171467}, {"end": 52174951, "start": 52174796}, {"end": 52178314, "start": 52178249}, {"end": 52179269, "start": 52179201}, {"end": 52182217, "start": 52182043}, {"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188667}, {"end": 52192588, "start": 52192497}, {"end": 52197177, "start": 52194463}], "is_best_transcript": false, "name": "ENST00000356218", "start": 51955855, "translations": [{"cdna_coding_end": 2338, "cdna_coding_start": 494, "domains": [{"name": "PF09379", "regions": [{"end": 109, "start": 20}]}, {"name": "PF09380", "regions": [{"end": 322, "start": 237}]}, {"name": "SSF50729", "regions": [{"end": 375, "start": 219}]}, {"name": "SM00295", "regions": [{"end": 226, "start": 12}]}, {"name": "PS50057", "regions": [{"end": 320, "start": 16}]}, {"name": "PF00373", "regions": [{"end": 226, "start": 115}]}, {"name": "SSF47031", "regions": [{"end": 218, "start": 110}]}, {"name": "SSF54236", "regions": [{"end": 110, "start": 14}]}]}]}, {"end": 52197445, "exons": [{"end": 52118714, "start": 52118576}, {"end": 52156653, "start": 52156409}, {"end": 52164950, "start": 52164860}, {"end": 52167853, "start": 52167774}, {"end": 52169306, "start": 52169230}, {"end": 52171653, "start": 52171467}, {"end": 52174951, "start": 52174796}, {"end": 52178314, "start": 52178249}, {"end": 52179269, "start": 52179201}, {"end": 52182217, "start": 52182043}, {"end": 52187108, 
"start": 52186773}, {"end": 52188798, "start": 52188667}, {"end": 52192588, "start": 52192497}, {"end": 52197445, "start": 52194463}], "is_best_transcript": true, "name": "ENST00000395718", "start": 52118576, "translations": [{"cdna_coding_end": 2130, "cdna_coding_start": 286, "domains": [{"name": "PF00373", "regions": [{"end": 226, "start": 115}]}, {"name": "SSF47031", "regions": [{"end": 218, "start": 110}]}, {"name": "SSF54236", "regions": [{"end": 110, "start": 14}]}, {"name": "PS50057", "regions": [{"end": 320, "start": 16}]}, {"name": "SM00295", "regions": [{"end": 226, "start": 12}]}, {"name": "SSF50729", "regions": [{"end": 375, "start": 219}]}, {"name": "PF09380", "regions": [{"end": 322, "start": 237}]}, {"name": "PF09379", "regions": [{"end": 109, "start": 20}]}]}]}, {"end": 52195654, "exons": [{"end": 52118714, "start": 52118665}, {"end": 52156653, "start": 52156409}, {"end": 52164950, "start": 52164860}, {"end": 52167877, "start": 52167774}, {"end": 52169306, "start": 52169230}, {"end": 52171653, "start": 52171467}, {"end": 52174951, "start": 52174796}, {"end": 52178314, "start": 52178249}, {"end": 52179269, "start": 52179201}, {"end": 52182217, "start": 52182043}, {"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188667}, {"end": 52192588, "start": 52192497}, {"end": 52195654, "start": 52194463}], "is_best_transcript": false, "name": "ENST00000344768", "start": 52118665, "translations": [{"cdna_coding_end": 2065, "cdna_coding_start": 197, "domains": [{"name": "PF09380", "regions": [{"end": 330, "start": 245}]}, {"name": "PF09379", "regions": [{"end": 117, "start": 20}]}, {"name": "SSF47031", "regions": [{"end": 226, "start": 118}]}, {"name": "PF00373", "regions": [{"end": 234, "start": 123}]}, {"name": "SSF54236", "regions": [{"end": 118, "start": 14}]}, {"name": "PS50057", "regions": [{"end": 328, "start": 16}]}, {"name": "SM00295", "regions": [{"end": 234, "start": 12}]}, {"name": "SSF50729", "regions": [{"end": 383, "start": 
227}]}]}]}, {"end": 52164945, "exons": [{"end": 52118935, "start": 52118698}, {"end": 52156653, "start": 52156409}, {"end": 52164945, "start": 52164860}], "is_best_transcript": false, "name": "ENST00000554778", "start": 52118698}, {"end": 52174806, "exons": [{"end": 52164950, "start": 52164706}, {"end": 52167877, "start": 52167774}, {"end": 52169306, "start": 52169230}, {"end": 52171653, "start": 52171467}, {"end": 52174806, "start": 52174796}], "is_best_transcript": false, "name": "ENST00000555936", "start": 52164706}, {"end": 52197148, "exons": [{"end": 52164950, "start": 52164831}, {"end": 52167853, "start": 52167774}, {"end": 52169306, "start": 52169230}, {"end": 52171653, "start": 52171467}, {"end": 52174951, "start": 52174796}, {"end": 52178314, "start": 52178249}, {"end": 52179269, "start": 52179201}, {"end": 52182217, "start": 52182043}, {"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188667}, {"end": 52192588, "start": 52192497}, {"end": 52197148, "start": 52194463}], "is_best_transcript": false, "name": "ENST00000554167", "start": 52164831, "translations": [{"cdna_coding_end": 1775, "cdna_coding_start": 138, "domains": [{"name": "SSF50729", "regions": [{"end": 306, "start": 150}]}, {"name": "PS50057", "regions": [{"end": 251, "start": 1}]}, {"name": "SSF54236", "regions": [{"end": 41, "start": 1}]}, {"name": "SSF47031", "regions": [{"end": 149, "start": 41}]}, {"name": "PF00373", "regions": [{"end": 157, "start": 46}]}, {"name": "PF09380", "regions": [{"end": 253, "start": 168}]}]}]}, {"end": 52175062, "exons": [{"end": 52169306, "start": 52169266}, {"end": 52171653, "start": 52171467}, {"end": 52175062, "start": 52174796}], "is_best_transcript": false, "name": "ENST00000557405", "start": 52169266, "translations": [{"cdna_coding_end": 390, "cdna_coding_start": 1, "domains": [{"name": "PS50057", "regions": [{"end": 129, "start": 1}]}, {"name": "PF00373", "regions": [{"end": 124, "start": 13}]}, {"name": "SSF47031", "regions": [{"end": 
116, "start": 8}]}]}]}, {"end": 52187243, "exons": [{"end": 52179269, "start": 52179231}, {"end": 52182217, "start": 52182043}, {"end": 52187243, "start": 52186773}], "is_best_transcript": false, "name": "ENST00000555197", "start": 52179231, "translations": [{"cdna_coding_end": 618, "cdna_coding_start": 1, "domains": [{"name": "PF09380", "regions": [{"end": 60, "start": 2}]}, {"name": "PS50057", "regions": [{"end": 58, "start": 1}]}, {"name": "SSF50729", "regions": [{"end": 113, "start": 2}]}]}]}, {"end": 52192513, "exons": [{"end": 52184066, "start": 52183973}, {"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188673}, {"end": 52192513, "start": 52192497}], "is_best_transcript": false, "name": "ENST00000555703", "start": 52183973, "translations": [{"cdna_coding_end": 573, "cdna_coding_start": 145}]}, {"end": 52195487, "exons": [{"end": 52184066, "start": 52183973}, {"end": 52187108, "start": 52186773}, {"end": 52188798, "start": 52188667}, {"end": 52192588, "start": 52192497}, {"end": 52195487, "start": 52194463}], "is_best_transcript": false, "name": "ENST00000553556", "start": 52183973, "translations": [{"cdna_coding_end": 939, "cdna_coding_start": 145}]}]}, {"aliases": ["PRKCB"], "chr": "16", "end": 24231932, "name": "ENSG00000166501", "start": 23847322, "strand": "+", "transcripts": [{"end": 24231932, "exons": [{"end": 23847669, "start": 23847322}, {"end": 23848727, "start": 23848696}, {"end": 23999911, "start": 23999829}, {"end": 24043568, "start": 24043457}, {"end": 24046868, "start": 24046740}, {"end": 24104268, "start": 24104112}, {"end": 24105618, "start": 24105484}, {"end": 24124390, "start": 24124294}, {"end": 24135302, "start": 24135156}, {"end": 24166178, "start": 24166005}, {"end": 24183682, "start": 24183591}, {"end": 24185901, "start": 24185839}, {"end": 24192249, "start": 24192111}, {"end": 24196512, "start": 24196432}, {"end": 24196888, "start": 24196781}, {"end": 24202551, "start": 24202411}, {"end": 24231932, "start": 
24231282}], "is_best_transcript": true, "name": "ENST00000321728", "start": 23847322, "translations": [{"cdna_coding_end": 2191, "cdna_coding_start": 176, "domains": [{"name": "SM00239", "regions": [{"end": 275, "start": 172}]}, {"name": "PF07714", "regions": [{"end": 583, "start": 344}]}, {"name": "SSF49562", "regions": [{"end": 288, "start": 157}]}, {"name": "SM00109", "regions": [{"end": 86, "start": 37}, {"end": 151, "start": 102}]}, {"name": "PS50011", "regions": [{"end": 600, "start": 342}]}, {"name": "PR00008", "regions": [{"end": 48, "start": 34}, {"end": 59, "start": 50}, {"end": 74, "start": 63}, {"end": 152, "start": 140}]}, {"name": "PF00433", "regions": [{"end": 666, "start": 623}]}, {"name": "SM00220", "regions": [{"end": 600, "start": 342}]}, {"name": "PF00168", "regions": [{"end": 259, "start": 175}]}, {"name": "SSF57889", "regions": [{"end": 92, "start": 6}, {"end": 157, "start": 101}]}, {"name": "PF00130", "regions": [{"end": 87, "start": 37}, {"end": 153, "start": 102}]}, {"name": "PS50081", "regions": [{"end": 86, "start": 36}, {"end": 151, "start": 101}]}, {"name": "SSF56112", "regions": [{"end": 627, "start": 317}]}, {"name": "PF00069", "regions": [{"end": 586, "start": 343}]}, {"name": "SM00219", "regions": [{"end": 576, "start": 342}]}, {"name": "PR00360", "regions": [{"end": 200, "start": 188}, {"end": 230, "start": 217}, {"end": 248, "start": 240}]}, {"name": "SM00133", "regions": [{"end": 664, "start": 601}]}, {"name": "PS50004", "regions": [{"end": 260, "start": 173}]}, {"name": "PIRSF000550", "regions": [{"end": 671, "start": 1}]}]}]}, {"end": 24231932, "exons": [{"end": 23847669, "start": 23847345}, {"end": 23848727, "start": 23848696}, {"end": 23999911, "start": 23999829}, {"end": 24043568, "start": 24043457}, {"end": 24046868, "start": 24046740}, {"end": 24104268, "start": 24104112}, {"end": 24105618, "start": 24105484}, {"end": 24124390, "start": 24124294}, {"end": 24135302, "start": 24135156}, {"end": 24166178, "start": 24166005}, 
{"end": 24183682, "start": 24183591}, {"end": 24185901, "start": 24185839}, {"end": 24192249, "start": 24192111}, {"end": 24196512, "start": 24196432}, {"end": 24196888, "start": 24196781}, {"end": 24202551, "start": 24202411}, {"end": 24231932, "start": 24225979}], "is_best_transcript": false, "name": "ENST00000303531", "start": 23847345, "translations": [{"cdna_coding_end": 2174, "cdna_coding_start": 153, "domains": [{"name": "SM00133", "regions": [{"end": 663, "start": 601}]}, {"name": "PS50004", "regions": [{"end": 260, "start": 173}]}, {"name": "PIRSF000550", "regions": [{"end": 672, "start": 1}]}, {"name": "PF00069", "regions": [{"end": 586, "start": 343}]}, {"name": "PR00360", "regions": [{"end": 200, "start": 188}, {"end": 230, "start": 217}, {"end": 248, "start": 240}]}, {"name": "SM00219", "regions": [{"end": 576, "start": 342}]}, {"name": "PS50081", "regions": [{"end": 86, "start": 36}, {"end": 151, "start": 101}]}, {"name": "SSF56112", "regions": [{"end": 627, "start": 317}]}, {"name": "SM00220", "regions": [{"end": 600, "start": 342}]}, {"name": "PF00433", "regions": [{"end": 664, "start": 627}]}, {"name": "PF00130", "regions": [{"end": 87, "start": 37}, {"end": 153, "start": 102}]}, {"name": "PF00168", "regions": [{"end": 259, "start": 175}]}, {"name": "SSF57889", "regions": [{"end": 92, "start": 6}, {"end": 157, "start": 101}]}, {"name": "PR00008", "regions": [{"end": 48, "start": 34}, {"end": 59, "start": 50}, {"end": 74, "start": 63}, {"end": 152, "start": 140}]}, {"name": "PS50011", "regions": [{"end": 600, "start": 342}]}, {"name": "SM00109", "regions": [{"end": 86, "start": 37}, {"end": 151, "start": 102}]}, {"name": "PF07714", "regions": [{"end": 583, "start": 344}]}, {"name": "SSF49562", "regions": [{"end": 288, "start": 157}]}, {"name": "SM00239", "regions": [{"end": 275, "start": 172}]}]}]}, {"end": 23880647, "exons": [{"end": 23847669, "start": 23847403}, {"end": 23880647, "start": 23880435}], "is_best_transcript": false, "name": 
"ENST00000498058", "start": 23847403, "translations": [{"cdna_coding_end": 268, "cdna_coding_start": 95, "domains": [{"name": "PR00008", "regions": [{"end": 48, "start": 34}, {"end": 57, "start": 50}]}, {"name": "PS50081", "regions": [{"end": 57, "start": 36}]}, {"name": "SSF57889", "regions": [{"end": 57, "start": 6}]}]}]}, {"end": 24124386, "exons": [{"end": 23848727, "start": 23848544}, {"end": 24104268, "start": 24104112}, {"end": 24105618, "start": 24105484}, {"end": 24124386, "start": 24124294}], "is_best_transcript": false, "name": "ENST00000498739", "start": 23848544}, {"end": 24192166, "exons": [{"end": 24163176, "start": 24163006}, {"end": 24166178, "start": 24166005}, {"end": 24183682, "start": 24183591}, {"end": 24185901, "start": 24185839}, {"end": 24192166, "start": 24192111}], "is_best_transcript": false, "name": "ENST00000472066", "start": 24163006}, {"end": 24202909, "exons": [{"end": 24196888, "start": 24196852}, {"end": 24202909, "start": 24202411}], "is_best_transcript": false, "name": "ENST00000466124", "start": 24196852}]}, {"aliases": ["GIMAP4"], "chr": "7", "end": 150271041, "name": "ENSG00000133574", "start": 150264365, "strand": "+", "transcripts": [{"end": 150271041, "exons": [{"end": 150264525, "start": 150264365}, {"end": 150267047, "start": 150266976}, {"end": 150271041, "start": 150269217}], "is_best_transcript": true, "name": "ENST00000255945", "start": 150264365, "translations": [{"cdna_coding_end": 1165, "cdna_coding_start": 176, "domains": [{"name": "PF04548", "regions": [{"end": 238, "start": 31}]}, {"name": "SSF52540", "regions": [{"end": 288, "start": 24}]}]}]}, {"end": 150270602, "exons": [{"end": 150264525, "start": 150264457}, {"end": 150267089, "start": 150266976}, {"end": 150270602, "start": 150269217}], "is_best_transcript": false, "name": "ENST00000461940", "start": 150264457, "translations": [{"cdna_coding_end": 1115, "cdna_coding_start": 84, "domains": [{"name": "PF04548", "regions": [{"end": 252, "start": 45}]}, 
{"name": "SSF52540", "regions": [{"end": 302, "start": 38}]}]}]}, {"end": 150269569, "exons": [{"end": 150264608, "start": 150264524}, {"end": 150267089, "start": 150266976}, {"end": 150269569, "start": 150269217}], "is_best_transcript": false, "name": "ENST00000479232", "start": 150264524, "translations": [{"cdna_coding_end": 552, "cdna_coding_start": 100, "domains": [{"name": "SSF52540", "regions": [{"end": 151, "start": 38}]}, {"name": "PF04548", "regions": [{"end": 151, "start": 45}]}]}]}]}, {"aliases": ["IL7"], "chr": "8", "end": 79717758, "name": "ENSG00000104432", "start": 79587978, "strand": "-", "transcripts": [{"end": 79717758, "exons": [{"end": 79646067, "start": 79645007}, {"end": 79648762, "start": 79648709}, {"end": 79650870, "start": 79650739}, {"end": 79652317, "start": 79652237}, {"end": 79710443, "start": 79710307}, {"end": 79717758, "start": 79717148}], "is_best_transcript": true, "name": "ENST00000263851", "start": 79645007, "translations": [{"cdna_coding_end": 602, "cdna_coding_start": 1135, "domains": [{"name": "PIRSF001942", "regions": [{"end": 177, "start": 1}]}, {"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 48, "start": 26}, {"end": 77, "start": 57}, {"end": 98, "start": 78}, {"end": 118, "start": 99}, {"end": 173, "start": 151}]}, {"name": "PF01415", "regions": [{"end": 173, "start": 28}]}, {"name": "SM00127", "regions": [{"end": 173, "start": 27}]}]}]}, {"end": 79717699, "exons": [{"end": 79646063, "start": 79645283}, {"end": 79648762, "start": 79648709}, {"end": 79650870, "start": 79650739}, {"end": 79652317, "start": 79652237}, {"end": 79659331, "start": 79659129}, {"end": 79710443, "start": 79710307}, {"end": 79717699, "start": 79717148}], "is_best_transcript": false, "name": "ENST00000518982", "start": 79645283, "translations": [{"cdna_coding_end": 543, "cdna_coding_start": 758, "domains": [{"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 48, "start": 26}]}, {"name": "PF01415", "regions": [{"end": 
54, "start": 28}]}]}]}, {"end": 79717163, "exons": [{"end": 79646067, "start": 79645900}, {"end": 79648762, "start": 79648709}, {"end": 79652317, "start": 79652237}, {"end": 79710443, "start": 79710307}, {"end": 79717163, "start": 79717148}], "is_best_transcript": false, "name": "ENST00000520269", "start": 79645900, "translations": [{"cdna_coding_end": 7, "cdna_coding_start": 408, "domains": [{"name": "PF01415", "regions": [{"end": 77, "start": 28}, {"end": 129, "start": 91}]}, {"name": "SM00127", "regions": [{"end": 129, "start": 27}]}, {"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 48, "start": 26}, {"end": 77, "start": 57}]}, {"name": "PIRSF001942", "regions": [{"end": 133, "start": 1}]}]}]}, {"end": 79717163, "exons": [{"end": 79646067, "start": 79645900}, {"end": 79648762, "start": 79648709}, {"end": 79652317, "start": 79652237}, {"end": 79710443, "start": 79710363}, {"end": 79717163, "start": 79717148}], "is_best_transcript": false, "name": "ENST00000520215", "start": 79645900, "translations": [{"cdna_coding_end": 7, "cdna_coding_start": 120, "domains": [{"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 37, "start": 26}]}]}]}, {"end": 79717686, "exons": [{"end": 79646067, "start": 79645900}, {"end": 79648762, "start": 79648709}, {"end": 79650870, "start": 79650739}, {"end": 79652317, "start": 79652237}, {"end": 79710443, "start": 79710363}, {"end": 79717686, "start": 79717148}], "is_best_transcript": false, "name": "ENST00000520317", "start": 79645900, "translations": [{"cdna_coding_end": 530, "cdna_coding_start": 643, "domains": [{"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 37, "start": 26}]}]}]}, {"end": 79652311, "exons": [{"end": 79646067, "start": 79645948}, {"end": 79652311, "start": 79652237}], "is_best_transcript": false, "name": "ENST00000541183", "start": 79645948, "translations": [{"cdna_coding_end": 1, "cdna_coding_start": 195, "domains": [{"name": "SM00127", "regions": [{"end": 60, "start": 
1}]}, {"name": "PF01415", "regions": [{"end": 60, "start": 1}]}]}]}, {"end": 79717758, "exons": [{"end": 79659331, "start": 79659263}, {"end": 79710443, "start": 79710307}, {"end": 79717758, "start": 79717148}], "is_best_transcript": false, "name": "ENST00000379113", "start": 79659263, "translations": [{"cdna_coding_end": 602, "cdna_coding_start": 817, "domains": [{"name": "PF01415", "regions": [{"end": 54, "start": 28}]}, {"name": "PR00435", "regions": [{"end": 25, "start": 2}, {"end": 48, "start": 26}]}]}]}]}, {"aliases": ["SVEP1"], "chr": "9", "end": 113342160, "name": "ENSG00000165124", "start": 113127531, "strand": "-", "transcripts": [{"end": 113342160, "exons": [{"end": 113128840, "start": 113127531}, {"end": 113132296, "start": 113132203}, {"end": 113137743, "start": 113137648}, {"end": 113139646, "start": 113139551}, {"end": 113141797, "start": 113141627}, {"end": 113148354, "start": 113148178}, {"end": 113149738, "start": 113149565}, {"end": 113151867, "start": 113151804}, {"end": 113163289, "start": 113163134}, {"end": 113166832, "start": 113166607}, {"end": 113171231, "start": 113168440}, {"end": 113174015, "start": 113173343}, {"end": 113190038, "start": 113189871}, {"end": 113191614, "start": 113191423}, {"end": 113192284, "start": 113192200}, {"end": 113192730, "start": 113192554}, {"end": 113194314, "start": 113194195}, {"end": 113194915, "start": 113194742}, {"end": 113196786, "start": 113196616}, {"end": 113197644, "start": 113197521}, {"end": 113198784, "start": 113198660}, {"end": 113206000, "start": 113205825}, {"end": 113208318, "start": 113208117}, {"end": 113209337, "start": 113209180}, {"end": 113212540, "start": 113212339}, {"end": 113213682, "start": 113213569}, {"end": 113217983, "start": 113217870}, {"end": 113219632, "start": 113219536}, {"end": 113220842, "start": 113220751}, {"end": 113221393, "start": 113221232}, {"end": 113228306, "start": 113228145}, {"end": 113231381, "start": 113231220}, {"end": 113233877, "start": 113233644}, 
{"end": 113234603, "start": 113234439}, {"end": 113238595, "start": 113238484}, {"end": 113242036, "start": 113241915}, {"end": 113243716, "start": 113243522}, {"end": 113244772, "start": 113244641}, {"end": 113245973, "start": 113245866}, {"end": 113252059, "start": 113251930}, {"end": 113259213, "start": 113259095}, {"end": 113261518, "start": 113261321}, {"end": 113265497, "start": 113265318}, {"end": 113275385, "start": 113275206}, {"end": 113276386, "start": 113276228}, {"end": 113308571, "start": 113308395}, {"end": 113312384, "start": 113312129}, {"end": 113342160, "start": 113341293}], "is_best_transcript": true, "name": "ENST00000401783", "start": 113127531, "translations": [{"cdna_coding_end": 338, "cdna_coding_start": 11053, "domains": [{"name": "SM00032", "regions": [{"end": 433, "start": 378}, {"end": 493, "start": 438}, {"end": 559, "start": 498}, {"end": 787, "start": 727}, {"end": 1685, "start": 1631}, {"end": 1743, "start": 1690}, {"end": 1842, "start": 1789}, {"end": 1900, "start": 1847}, {"end": 1958, "start": 1905}, {"end": 2016, "start": 1963}, {"end": 2078, "start": 2021}, {"end": 2141, "start": 2083}, {"end": 2199, "start": 2146}, {"end": 2259, "start": 2204}, {"end": 2318, "start": 2264}, {"end": 2376, "start": 2323}, {"end": 2435, "start": 2381}, {"end": 2493, "start": 2440}, {"end": 2551, "start": 2498}, {"end": 2608, "start": 2556}, {"end": 2712, "start": 2654}, {"end": 2770, "start": 2717}, {"end": 2828, "start": 2775}, {"end": 2886, "start": 2833}, {"end": 2944, "start": 2891}, {"end": 3002, "start": 2949}, {"end": 3059, "start": 3007}, {"end": 3117, "start": 3064}, {"end": 3176, "start": 3122}, {"end": 3236, "start": 3181}, {"end": 3294, "start": 3241}, {"end": 3352, "start": 3299}, {"end": 3411, "start": 3357}, {"end": 3468, "start": 3416}]}, {"name": "PF02494", "regions": [{"end": 642, "start": 561}, {"end": 721, "start": 644}]}, {"name": "PR00895", "regions": [{"end": 1530, "start": 1512}, {"end": 1558, "start": 1539}, {"end": 1592, 
"start": 1559}]}, {"name": "SSF57535", "regions": [{"end": 433, "start": 374}, {"end": 493, "start": 434}, {"end": 560, "start": 494}, {"end": 790, "start": 727}, {"end": 1746, "start": 1626}, {"end": 1842, "start": 1785}, {"end": 1900, "start": 1843}, {"end": 1958, "start": 1901}, {"end": 2016, "start": 1959}, {"end": 2078, "start": 2017}, {"end": 2199, "start": 2081}, {"end": 2318, "start": 2202}, {"end": 2377, "start": 2321}, {"end": 2437, "start": 2379}, {"end": 2551, "start": 2438}, {"end": 2616, "start": 2552}, {"end": 2712, "start": 2643}, {"end": 2828, "start": 2715}, {"end": 2886, "start": 2829}, {"end": 2944, "start": 2887}, {"end": 3117, "start": 2945}, {"end": 3176, "start": 3118}, {"end": 3229, "start": 3177}, {"end": 3475, "start": 3239}]}, {"name": "SSF49899", "regions": [{"end": 1632, "start": 1421}]}, {"name": "SM00159", "regions": [{"end": 1627, "start": 1420}]}, {"name": "PF00354", "regions": [{"end": 1620, "start": 1442}]}, {"name": "PF07699", "regions": [{"end": 360, "start": 310}, {"end": 1052, "start": 1005}, {"end": 1106, "start": 1059}, {"end": 1160, "start": 1113}]}, {"name": "PS50311", "regions": [{"end": 1409, "start": 1197}, {"end": 3554, "start": 3468}]}, {"name": "PS50825", "regions": [{"end": 642, "start": 560}, {"end": 724, "start": 643}]}, {"name": "PF00092", "regions": [{"end": 252, "start": 84}]}, {"name": "SSF57196", "regions": [{"end": 1267, "start": 1189}, {"end": 1305, "start": 1268}, {"end": 1342, "start": 1306}, {"end": 1423, "start": 1344}, {"end": 1786, "start": 1735}, {"end": 3506, "start": 3463}, {"end": 3535, "start": 3507}, {"end": 3570, "start": 3537}]}, {"name": "PS50026", "regions": [{"end": 1229, "start": 1193}, {"end": 1267, "start": 1231}, {"end": 1305, "start": 1269}, {"end": 1343, "start": 1307}, {"end": 1381, "start": 1345}, {"end": 1419, "start": 1383}, {"end": 1784, "start": 1745}, {"end": 3532, "start": 3500}, {"end": 3564, "start": 3533}]}, {"name": "SM00181", "regions": [{"end": 1229, "start": 1196}, 
{"end": 1267, "start": 1234}, {"end": 1305, "start": 1272}, {"end": 1343, "start": 1310}, {"end": 1381, "start": 1348}, {"end": 1419, "start": 1386}, {"end": 1784, "start": 1748}, {"end": 3500, "start": 3471}, {"end": 3532, "start": 3503}, {"end": 3564, "start": 3535}]}, {"name": "SM00179", "regions": [{"end": 1229, "start": 1196}, {"end": 1267, "start": 1231}, {"end": 1305, "start": 1269}, {"end": 1343, "start": 1307}, {"end": 1381, "start": 1345}, {"end": 1419, "start": 1383}, {"end": 1784, "start": 1745}, {"end": 3532, "start": 3504}]}, {"name": "SSF57184", "regions": [{"end": 440, "start": 269}, {"end": 1144, "start": 988}]}, {"name": "PF07645", "regions": [{"end": 1783, "start": 1745}]}, {"name": "PS50923", "regions": [{"end": 435, "start": 376}, {"end": 495, "start": 436}, {"end": 561, "start": 496}, {"end": 789, "start": 725}, {"end": 1687, "start": 1629}, {"end": 1745, "start": 1688}, {"end": 1844, "start": 1787}, {"end": 1902, "start": 1845}, {"end": 1960, "start": 1903}, {"end": 2018, "start": 1961}, {"end": 2080, "start": 2019}, {"end": 2143, "start": 2081}, {"end": 2201, "start": 2144}, {"end": 2261, "start": 2202}, {"end": 2320, "start": 2262}, {"end": 2378, "start": 2321}, {"end": 2437, "start": 2379}, {"end": 2495, "start": 2438}, {"end": 2553, "start": 2496}, {"end": 2610, "start": 2554}, {"end": 2714, "start": 2663}, {"end": 2772, "start": 2715}, {"end": 2830, "start": 2773}, {"end": 2888, "start": 2831}, {"end": 2946, "start": 2889}, {"end": 3004, "start": 2947}, {"end": 3061, "start": 3005}, {"end": 3119, "start": 3062}, {"end": 3178, "start": 3120}, {"end": 3238, "start": 3179}, {"end": 3296, "start": 3239}, {"end": 3354, "start": 3297}, {"end": 3413, "start": 3355}, {"end": 3470, "start": 3414}]}, {"name": "SM00327", "regions": [{"end": 260, "start": 81}]}, {"name": "PF00008", "regions": [{"end": 1226, "start": 1197}, {"end": 1265, "start": 1235}, {"end": 1302, "start": 1273}, {"end": 1379, "start": 1349}, {"end": 1417, "start": 1387}]}, 
{"name": "PS50234", "regions": [{"end": 264, "start": 83}]}, {"name": "PF07974", "regions": [{"end": 1266, "start": 1235}, {"end": 3499, "start": 3475}, {"end": 3531, "start": 3507}, {"end": 3563, "start": 3536}]}, {"name": "SSF53300", "regions": [{"end": 262, "start": 79}]}, {"name": "PF00084", "regions": [{"end": 430, "start": 378}, {"end": 493, "start": 438}, {"end": 1685, "start": 1628}, {"end": 1743, "start": 1690}, {"end": 1842, "start": 1789}, {"end": 1900, "start": 1847}, {"end": 1958, "start": 1905}, {"end": 2016, "start": 1963}, {"end": 2078, "start": 2021}, {"end": 2136, "start": 2083}, {"end": 2199, "start": 2146}, {"end": 2259, "start": 2204}, {"end": 2318, "start": 2264}, {"end": 2376, "start": 2323}, {"end": 2435, "start": 2381}, {"end": 2493, "start": 2440}, {"end": 2551, "start": 2498}, {"end": 2608, "start": 2556}, {"end": 2712, "start": 2667}, {"end": 2770, "start": 2717}, {"end": 2828, "start": 2775}, {"end": 2886, "start": 2833}, {"end": 2944, "start": 2891}, {"end": 3002, "start": 2949}, {"end": 3059, "start": 3007}, {"end": 3117, "start": 3084}, {"end": 3172, "start": 3122}, {"end": 3236, "start": 3181}, {"end": 3290, "start": 3241}, {"end": 3352, "start": 3299}, {"end": 3411, "start": 3357}, {"end": 3468, "start": 3416}]}]}]}, {"end": 113190038, "exons": [{"end": 113128840, "start": 113127536}, {"end": 113132296, "start": 113132203}, {"end": 113137743, "start": 113137648}, {"end": 113139646, "start": 113139551}, {"end": 113141797, "start": 113141627}, {"end": 113148354, "start": 113148178}, {"end": 113149738, "start": 113149565}, {"end": 113151867, "start": 113151804}, {"end": 113163289, "start": 113163134}, {"end": 113166832, "start": 113166607}, {"end": 113171231, "start": 113168440}, {"end": 113174015, "start": 113173343}, {"end": 113190038, "start": 113189871}], "is_best_transcript": false, "name": "ENST00000297826", "start": 113127536, "translations": [{"cdna_coding_end": 416, "cdna_coding_start": 4909, "domains": [{"name": "PF00084", 
"regions": [{"end": 62, "start": 9}, {"end": 125, "start": 72}, {"end": 185, "start": 130}, {"end": 244, "start": 190}, {"end": 302, "start": 249}, {"end": 361, "start": 307}, {"end": 419, "start": 366}, {"end": 477, "start": 424}, {"end": 534, "start": 482}, {"end": 638, "start": 593}, {"end": 696, "start": 643}, {"end": 754, "start": 701}, {"end": 812, "start": 759}, {"end": 870, "start": 817}, {"end": 928, "start": 875}, {"end": 985, "start": 933}, {"end": 1043, "start": 1010}, {"end": 1098, "start": 1048}, {"end": 1162, "start": 1107}, {"end": 1216, "start": 1167}, {"end": 1278, "start": 1225}, {"end": 1337, "start": 1283}, {"end": 1394, "start": 1342}]}, {"name": "PF07974", "regions": [{"end": 1425, "start": 1401}, {"end": 1457, "start": 1433}, {"end": 1489, "start": 1462}]}, {"name": "PF00008", "regions": [{"end": 1456, "start": 1427}]}, {"name": "PS50923", "regions": [{"end": 69, "start": 7}, {"end": 127, "start": 70}, {"end": 187, "start": 128}, {"end": 246, "start": 188}, {"end": 304, "start": 247}, {"end": 363, "start": 305}, {"end": 421, "start": 364}, {"end": 479, "start": 422}, {"end": 536, "start": 480}, {"end": 640, "start": 589}, {"end": 698, "start": 641}, {"end": 756, "start": 699}, {"end": 814, "start": 757}, {"end": 872, "start": 815}, {"end": 930, "start": 873}, {"end": 987, "start": 931}, {"end": 1045, "start": 988}, {"end": 1104, "start": 1046}, {"end": 1164, "start": 1105}, {"end": 1222, "start": 1165}, {"end": 1280, "start": 1223}, {"end": 1339, "start": 1281}, {"end": 1396, "start": 1340}]}, {"name": "SM00181", "regions": [{"end": 1426, "start": 1397}, {"end": 1458, "start": 1429}, {"end": 1490, "start": 1461}]}, {"name": "SSF57196", "regions": [{"end": 1432, "start": 1389}, {"end": 1461, "start": 1433}, {"end": 1496, "start": 1463}]}, {"name": "PS50026", "regions": [{"end": 1458, "start": 1426}, {"end": 1490, "start": 1459}]}, {"name": "PS50311", "regions": [{"end": 1480, "start": 1394}]}, {"name": "SSF57535", "regions": [{"end": 125, 
"start": 7}, {"end": 244, "start": 128}, {"end": 303, "start": 247}, {"end": 363, "start": 305}, {"end": 477, "start": 364}, {"end": 542, "start": 478}, {"end": 638, "start": 569}, {"end": 754, "start": 641}, {"end": 812, "start": 755}, {"end": 870, "start": 813}, {"end": 1043, "start": 871}, {"end": 1102, "start": 1044}, {"end": 1155, "start": 1103}, {"end": 1401, "start": 1165}]}, {"name": "SM00032", "regions": [{"end": 67, "start": 9}, {"end": 125, "start": 72}, {"end": 185, "start": 130}, {"end": 244, "start": 190}, {"end": 302, "start": 249}, {"end": 361, "start": 307}, {"end": 419, "start": 366}, {"end": 477, "start": 424}, {"end": 534, "start": 482}, {"end": 638, "start": 580}, {"end": 696, "start": 643}, {"end": 754, "start": 701}, {"end": 812, "start": 759}, {"end": 870, "start": 817}, {"end": 928, "start": 875}, {"end": 985, "start": 933}, {"end": 1043, "start": 990}, {"end": 1102, "start": 1048}, {"end": 1162, "start": 1107}, {"end": 1220, "start": 1167}, {"end": 1278, "start": 1225}, {"end": 1337, "start": 1283}, {"end": 1394, "start": 1342}]}]}]}, {"end": 113342018, "exons": [{"end": 113128840, "start": 113127536}, {"end": 113132296, "start": 113132203}, {"end": 113137743, "start": 113137648}, {"end": 113139646, "start": 113139551}, {"end": 113141797, "start": 113141627}, {"end": 113148354, "start": 113148178}, {"end": 113149738, "start": 113149565}, {"end": 113151867, "start": 113151804}, {"end": 113163289, "start": 113163134}, {"end": 113166832, "start": 113166607}, {"end": 113171231, "start": 113168440}, {"end": 113174015, "start": 113173343}, {"end": 113190038, "start": 113189871}, {"end": 113191614, "start": 113191423}, {"end": 113192284, "start": 113192200}, {"end": 113192730, "start": 113192554}, {"end": 113194314, "start": 113194195}, {"end": 113194915, "start": 113194742}, {"end": 113196786, "start": 113196616}, {"end": 113197644, "start": 113197521}, {"end": 113198784, "start": 113198660}, {"end": 113206000, "start": 113205825}, {"end": 
113208318, "start": 113208117}, {"end": 113209337, "start": 113209180}, {"end": 113212540, "start": 113212339}, {"end": 113213682, "start": 113213569}, {"end": 113217983, "start": 113217870}, {"end": 113219632, "start": 113219536}, {"end": 113220842, "start": 113220751}, {"end": 113221393, "start": 113221232}, {"end": 113228306, "start": 113228145}, {"end": 113231381, "start": 113231220}, {"end": 113233877, "start": 113233644}, {"end": 113234603, "start": 113234439}, {"end": 113238595, "start": 113238484}, {"end": 113242036, "start": 113241915}, {"end": 113243716, "start": 113243522}, {"end": 113244772, "start": 113244641}, {"end": 113245973, "start": 113245866}, {"end": 113252059, "start": 113251930}, {"end": 113259213, "start": 113259095}, {"end": 113261518, "start": 113261321}, {"end": 113265497, "start": 113265318}, {"end": 113275385, "start": 113275206}, {"end": 113276386, "start": 113276228}, {"end": 113308571, "start": 113308395}, {"end": 113312384, "start": 113312129}, {"end": 113342018, "start": 113341293}], "is_best_transcript": false, "name": "ENST00000374469", "start": 113127536, "translations": [{"cdna_coding_end": 265, "cdna_coding_start": 10911, "domains": [{"name": "SSF57535", "regions": [{"end": 410, "start": 351}, {"end": 470, "start": 411}, {"end": 537, "start": 471}, {"end": 767, "start": 704}, {"end": 1723, "start": 1603}, {"end": 1819, "start": 1762}, {"end": 1877, "start": 1820}, {"end": 1935, "start": 1878}, {"end": 1993, "start": 1936}, {"end": 2055, "start": 1994}, {"end": 2176, "start": 2058}, {"end": 2295, "start": 2179}, {"end": 2354, "start": 2298}, {"end": 2414, "start": 2356}, {"end": 2528, "start": 2415}, {"end": 2593, "start": 2529}, {"end": 2689, "start": 2620}, {"end": 2805, "start": 2692}, {"end": 2863, "start": 2806}, {"end": 2921, "start": 2864}, {"end": 3094, "start": 2922}, {"end": 3153, "start": 3095}, {"end": 3206, "start": 3154}, {"end": 3452, "start": 3216}]}, {"name": "SSF49899", "regions": [{"end": 1609, "start": 
1398}]}, {"name": "SM00159", "regions": [{"end": 1604, "start": 1397}]}, {"name": "PF00354", "regions": [{"end": 1597, "start": 1419}]}, {"name": "PR00895", "regions": [{"end": 1507, "start": 1489}, {"end": 1535, "start": 1516}, {"end": 1569, "start": 1536}]}, {"name": "PF02494", "regions": [{"end": 619, "start": 538}, {"end": 698, "start": 621}]}, {"name": "SM00032", "regions": [{"end": 410, "start": 355}, {"end": 470, "start": 415}, {"end": 536, "start": 475}, {"end": 764, "start": 704}, {"end": 1662, "start": 1608}, {"end": 1720, "start": 1667}, {"end": 1819, "start": 1766}, {"end": 1877, "start": 1824}, {"end": 1935, "start": 1882}, {"end": 1993, "start": 1940}, {"end": 2055, "start": 1998}, {"end": 2118, "start": 2060}, {"end": 2176, "start": 2123}, {"end": 2236, "start": 2181}, {"end": 2295, "start": 2241}, {"end": 2353, "start": 2300}, {"end": 2412, "start": 2358}, {"end": 2470, "start": 2417}, {"end": 2528, "start": 2475}, {"end": 2585, "start": 2533}, {"end": 2689, "start": 2631}, {"end": 2747, "start": 2694}, {"end": 2805, "start": 2752}, {"end": 2863, "start": 2810}, {"end": 2921, "start": 2868}, {"end": 2979, "start": 2926}, {"end": 3036, "start": 2984}, {"end": 3094, "start": 3041}, {"end": 3153, "start": 3099}, {"end": 3213, "start": 3158}, {"end": 3271, "start": 3218}, {"end": 3329, "start": 3276}, {"end": 3388, "start": 3334}, {"end": 3445, "start": 3393}]}, {"name": "SM00179", "regions": [{"end": 1206, "start": 1173}, {"end": 1244, "start": 1208}, {"end": 1282, "start": 1246}, {"end": 1320, "start": 1284}, {"end": 1358, "start": 1322}, {"end": 1396, "start": 1360}, {"end": 1761, "start": 1722}, {"end": 3509, "start": 3481}]}, {"name": "SSF57184", "regions": [{"end": 417, "start": 246}, {"end": 1121, "start": 965}]}, {"name": "SSF57196", "regions": [{"end": 1244, "start": 1166}, {"end": 1282, "start": 1245}, {"end": 1319, "start": 1283}, {"end": 1400, "start": 1321}, {"end": 1763, "start": 1712}, {"end": 3483, "start": 3440}, {"end": 3512, "start": 
3484}, {"end": 3547, "start": 3514}]}, {"name": "PS50026", "regions": [{"end": 1206, "start": 1170}, {"end": 1244, "start": 1208}, {"end": 1282, "start": 1246}, {"end": 1320, "start": 1284}, {"end": 1358, "start": 1322}, {"end": 1396, "start": 1360}, {"end": 1761, "start": 1722}, {"end": 3509, "start": 3477}, {"end": 3541, "start": 3510}]}, {"name": "SM00181", "regions": [{"end": 1206, "start": 1173}, {"end": 1244, "start": 1211}, {"end": 1282, "start": 1249}, {"end": 1320, "start": 1287}, {"end": 1358, "start": 1325}, {"end": 1396, "start": 1363}, {"end": 1761, "start": 1725}, {"end": 3477, "start": 3448}, {"end": 3509, "start": 3480}, {"end": 3541, "start": 3512}]}, {"name": "PF00092", "regions": [{"end": 229, "start": 61}]}, {"name": "PS50825", "regions": [{"end": 619, "start": 537}, {"end": 701, "start": 620}]}, {"name": "PS50311", "regions": [{"end": 1386, "start": 1174}, {"end": 3531, "start": 3445}]}, {"name": "PF07699", "regions": [{"end": 337, "start": 287}, {"end": 1029, "start": 982}, {"end": 1083, "start": 1036}, {"end": 1137, "start": 1090}]}, {"name": "PF00008", "regions": [{"end": 1203, "start": 1174}, {"end": 1242, "start": 1212}, {"end": 1279, "start": 1250}, {"end": 1356, "start": 1326}, {"end": 1394, "start": 1364}]}, {"name": "SM00327", "regions": [{"end": 237, "start": 58}]}, {"name": "PS50923", "regions": [{"end": 412, "start": 353}, {"end": 472, "start": 413}, {"end": 538, "start": 473}, {"end": 766, "start": 702}, {"end": 1664, "start": 1606}, {"end": 1722, "start": 1665}, {"end": 1821, "start": 1764}, {"end": 1879, "start": 1822}, {"end": 1937, "start": 1880}, {"end": 1995, "start": 1938}, {"end": 2057, "start": 1996}, {"end": 2120, "start": 2058}, {"end": 2178, "start": 2121}, {"end": 2238, "start": 2179}, {"end": 2297, "start": 2239}, {"end": 2355, "start": 2298}, {"end": 2414, "start": 2356}, {"end": 2472, "start": 2415}, {"end": 2530, "start": 2473}, {"end": 2587, "start": 2531}, {"end": 2691, "start": 2640}, {"end": 2749, "start": 
2692}, {"end": 2807, "start": 2750}, {"end": 2865, "start": 2808}, {"end": 2923, "start": 2866}, {"end": 2981, "start": 2924}, {"end": 3038, "start": 2982}, {"end": 3096, "start": 3039}, {"end": 3155, "start": 3097}, {"end": 3215, "start": 3156}, {"end": 3273, "start": 3216}, {"end": 3331, "start": 3274}, {"end": 3390, "start": 3332}, {"end": 3447, "start": 3391}]}, {"name": "PF07645", "regions": [{"end": 1760, "start": 1722}]}, {"name": "SSF53300", "regions": [{"end": 239, "start": 56}]}, {"name": "PF00084", "regions": [{"end": 407, "start": 355}, {"end": 470, "start": 415}, {"end": 1662, "start": 1605}, {"end": 1720, "start": 1667}, {"end": 1819, "start": 1766}, {"end": 1877, "start": 1824}, {"end": 1935, "start": 1882}, {"end": 1993, "start": 1940}, {"end": 2055, "start": 1998}, {"end": 2113, "start": 2060}, {"end": 2176, "start": 2123}, {"end": 2236, "start": 2181}, {"end": 2295, "start": 2241}, {"end": 2353, "start": 2300}, {"end": 2412, "start": 2358}, {"end": 2470, "start": 2417}, {"end": 2528, "start": 2475}, {"end": 2585, "start": 2533}, {"end": 2689, "start": 2644}, {"end": 2747, "start": 2694}, {"end": 2805, "start": 2752}, {"end": 2863, "start": 2810}, {"end": 2921, "start": 2868}, {"end": 2979, "start": 2926}, {"end": 3036, "start": 2984}, {"end": 3094, "start": 3061}, {"end": 3149, "start": 3099}, {"end": 3213, "start": 3158}, {"end": 3267, "start": 3218}, {"end": 3329, "start": 3276}, {"end": 3388, "start": 3334}, {"end": 3445, "start": 3393}]}, {"name": "PF07974", "regions": [{"end": 1243, "start": 1212}, {"end": 3476, "start": 3452}, {"end": 3508, "start": 3484}, {"end": 3540, "start": 3513}]}, {"name": "PS50234", "regions": [{"end": 241, "start": 60}]}]}]}, {"end": 113341823, "exons": [{"end": 113206000, "start": 113204759}, {"end": 113208318, "start": 113208117}, {"end": 113209337, "start": 113209180}, {"end": 113212540, "start": 113212339}, {"end": 113213682, "start": 113213569}, {"end": 113217983, "start": 113217870}, {"end": 113219632, 
"start": 113219536}, {"end": 113220399, "start": 113220395}, {"end": 113220842, "start": 113220756}, {"end": 113221393, "start": 113221232}, {"end": 113228306, "start": 113228145}, {"end": 113231381, "start": 113231220}, {"end": 113233877, "start": 113233644}, {"end": 113234603, "start": 113234439}, {"end": 113238595, "start": 113238484}, {"end": 113242036, "start": 113241915}, {"end": 113243716, "start": 113243522}, {"end": 113244772, "start": 113244641}, {"end": 113245973, "start": 113245866}, {"end": 113252059, "start": 113251930}, {"end": 113259213, "start": 113259095}, {"end": 113261518, "start": 113261321}, {"end": 113265497, "start": 113265318}, {"end": 113275385, "start": 113275206}, {"end": 113276386, "start": 113276228}, {"end": 113308571, "start": 113308395}, {"end": 113312384, "start": 113312129}, {"end": 113341823, "start": 113341293}], "is_best_transcript": false, "name": "ENST00000302728", "start": 113204759, "translations": [{"cdna_coding_end": 1, "cdna_coding_start": 4650, "domains": [{"name": "PS50825", "regions": [{"end": 642, "start": 560}, {"end": 724, "start": 643}]}, {"name": "PF07699", "regions": [{"end": 360, "start": 310}, {"end": 1052, "start": 1005}, {"end": 1106, "start": 1059}, {"end": 1160, "start": 1113}]}, {"name": "PS50311", "regions": [{"end": 1409, "start": 1197}]}, {"name": "SM00181", "regions": [{"end": 1229, "start": 1196}, {"end": 1267, "start": 1234}, {"end": 1305, "start": 1272}, {"end": 1343, "start": 1310}, {"end": 1381, "start": 1348}, {"end": 1419, "start": 1386}]}, {"name": "SSF57196", "regions": [{"end": 1267, "start": 1189}, {"end": 1305, "start": 1268}, {"end": 1342, "start": 1306}, {"end": 1423, "start": 1344}]}, {"name": "PS50026", "regions": [{"end": 1229, "start": 1193}, {"end": 1267, "start": 1231}, {"end": 1305, "start": 1269}, {"end": 1343, "start": 1307}, {"end": 1381, "start": 1345}, {"end": 1419, "start": 1383}]}, {"name": "SSF57184", "regions": [{"end": 440, "start": 269}, {"end": 1144, "start": 988}]}, 
{"name": "SM00179", "regions": [{"end": 1229, "start": 1196}, {"end": 1267, "start": 1231}, {"end": 1305, "start": 1269}, {"end": 1343, "start": 1307}, {"end": 1381, "start": 1345}, {"end": 1419, "start": 1383}]}, {"name": "PF00092", "regions": [{"end": 252, "start": 84}]}, {"name": "SM00032", "regions": [{"end": 433, "start": 378}, {"end": 493, "start": 438}, {"end": 559, "start": 498}, {"end": 787, "start": 727}]}, {"name": "PF02494", "regions": [{"end": 642, "start": 561}, {"end": 721, "start": 644}]}, {"name": "PR00010", "regions": [{"end": 1318, "start": 1307}, {"end": 1364, "start": 1357}, {"end": 1413, "start": 1403}, {"end": 1420, "start": 1414}]}, {"name": "PF00354", "regions": [{"end": 1532, "start": 1442}]}, {"name": "SSF57535", "regions": [{"end": 433, "start": 374}, {"end": 493, "start": 434}, {"end": 560, "start": 494}, {"end": 790, "start": 727}]}, {"name": "SSF49899", "regions": [{"end": 1547, "start": 1421}]}, {"name": "PS50234", "regions": [{"end": 264, "start": 83}]}, {"name": "SSF53300", "regions": [{"end": 262, "start": 79}]}, {"name": "PF00084", "regions": [{"end": 430, "start": 378}, {"end": 493, "start": 438}]}, {"name": "PS50923", "regions": [{"end": 435, "start": 376}, {"end": 495, "start": 436}, {"end": 561, "start": 496}, {"end": 789, "start": 725}]}, {"name": "PF07645", "regions": [{"end": 1262, "start": 1231}, {"end": 1338, "start": 1308}]}, {"name": "PF00008", "regions": [{"end": 1226, "start": 1197}, {"end": 1265, "start": 1235}, {"end": 1302, "start": 1273}, {"end": 1337, "start": 1311}, {"end": 1379, "start": 1349}, {"end": 1417, "start": 1387}]}, {"name": "SM00327", "regions": [{"end": 260, "start": 81}]}]}]}, {"end": 113342160, "exons": [{"end": 113238595, "start": 113238163}, {"end": 113242036, "start": 113241915}, {"end": 113243716, "start": 113243522}, {"end": 113244772, "start": 113244641}, {"end": 113245973, "start": 113245866}, {"end": 113252059, "start": 113251930}, {"end": 113259213, "start": 113259095}, {"end": 
113261518, "start": 113261321}, {"end": 113265497, "start": 113265318}, {"end": 113275385, "start": 113275206}, {"end": 113276386, "start": 113276228}, {"end": 113308571, "start": 113308395}, {"end": 113312384, "start": 113312129}, {"end": 113342160, "start": 113341293}], "is_best_transcript": false, "name": "ENST00000374461", "start": 113238163, "translations": [{"cdna_coding_end": 407, "cdna_coding_start": 2944, "domains": [{"name": "PF02494", "regions": [{"end": 619, "start": 538}, {"end": 698, "start": 621}]}, {"name": "SM00032", "regions": [{"end": 410, "start": 355}, {"end": 470, "start": 415}, {"end": 536, "start": 475}, {"end": 764, "start": 704}]}, {"name": "SSF57535", "regions": [{"end": 410, "start": 351}, {"end": 470, "start": 411}, {"end": 537, "start": 471}, {"end": 767, "start": 704}]}, {"name": "PF07699", "regions": [{"end": 337, "start": 287}]}, {"name": "PS50825", "regions": [{"end": 619, "start": 537}, {"end": 701, "start": 620}]}, {"name": "PF00092", "regions": [{"end": 229, "start": 61}]}, {"name": "SSF57184", "regions": [{"end": 417, "start": 246}]}, {"name": "PS50923", "regions": [{"end": 412, "start": 353}, {"end": 472, "start": 413}, {"end": 538, "start": 473}, {"end": 766, "start": 702}]}, {"name": "SM00327", "regions": [{"end": 237, "start": 58}]}, {"name": "PS50234", "regions": [{"end": 241, "start": 60}]}, {"name": "SSF53300", "regions": [{"end": 239, "start": 56}]}, {"name": "PF00084", "regions": [{"end": 407, "start": 355}, {"end": 470, "start": 415}]}]}]}]}, {"aliases": ["ARID1B"], "chr": "6", "end": 157530401, "name": "ENSG00000049618", "start": 157099063, "strand": "+", "transcripts": [{"end": 157529495, "exons": [{"end": 157100605, "start": 157099063}, {"end": 157150555, "start": 157150361}, {"end": 157192786, "start": 157192748}, {"end": 157222659, "start": 157222510}, {"end": 157256710, "start": 157256600}, {"end": 157406039, "start": 157405796}, {"end": 157431695, "start": 157431606}, {"end": 157454341, "start": 157454162}, 
{"end": 157470085, "start": 157469758}, {"end": 157488319, "start": 157488174}, {"end": 157495251, "start": 157495142}, {"end": 157502312, "start": 157502103}, {"end": 157505569, "start": 157505365}, {"end": 157510914, "start": 157510776}, {"end": 157511344, "start": 157511172}, {"end": 157517449, "start": 157517299}, {"end": 157520041, "start": 157519945}, {"end": 157522622, "start": 157521839}, {"end": 157525130, "start": 157525000}, {"end": 157529495, "start": 157527301}], "is_best_transcript": true, "name": "ENST00000346085", "start": 157099063, "translations": [{"cdna_coding_end": 6751, "cdna_coding_start": 2, "domains": [{"name": "PF12031", "regions": [{"end": 2195, "start": 1939}]}, {"name": "PS50324", "regions": [{"end": 57, "start": 35}, {"end": 784, "start": 697}]}, {"name": "PF01388", "regions": [{"end": 1153, "start": 1065}]}, {"name": "PS50099", "regions": [{"end": 820, "start": 715}, {"end": 1610, "start": 1472}]}, {"name": "SSF48371", "regions": [{"end": 2220, "start": 2075}]}, {"name": "PS50316", "regions": [{"end": 104, "start": 81}]}, {"name": "PS50322", "regions": [{"end": 131, "start": 107}, {"end": 646, "start": 574}]}, {"name": "PS51011", "regions": [{"end": 1157, "start": 1066}]}, {"name": "PS50310", "regions": [{"end": 47, "start": 2}, {"end": 493, "start": 329}]}, {"name": "PS50315", "regions": [{"end": 401, "start": 141}]}, {"name": "SSF46774", "regions": [{"end": 1168, "start": 1049}]}, {"name": "SM00501", "regions": [{"end": 1158, "start": 1067}]}]}]}]}]} \ No newline at end of file +{ + "genes": [ + { + "aliases": [ + "EGFR" + ], + "chr": "7", + "end": 55324313, + "name": "ENSG00000146648", + "start": 55086714, + "strand": "+", + "transcripts": [ + { + "end": 55270769, + "exons": [ + { + "end": 55087058, + "start": 55086714 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55219055, + "start": 55218987 + }, + { + "end": 55220357, + "start": 55220239 + }, + { + "end": 
55221845, + "start": 55221704 + }, + { + "end": 55223639, + "start": 55223523 + }, + { + "end": 55224352, + "start": 55224226 + }, + { + "end": 55224525, + "start": 55224452 + }, + { + "end": 55225446, + "start": 55225356 + }, + { + "end": 55228031, + "start": 55227832 + }, + { + "end": 55229324, + "start": 55229192 + }, + { + "end": 55231516, + "start": 55231426 + }, + { + "end": 55233130, + "start": 55232973 + }, + { + "end": 55238906, + "start": 55238868 + }, + { + "end": 55240817, + "start": 55240676 + }, + { + "end": 55241736, + "start": 55241614 + }, + { + "end": 55242513, + "start": 55242415 + }, + { + "end": 55249171, + "start": 55248986 + }, + { + "end": 55259567, + "start": 55259412 + }, + { + "end": 55260534, + "start": 55260459 + }, + { + "end": 55266556, + "start": 55266410 + }, + { + "end": 55268106, + "start": 55268009 + }, + { + "end": 55269048, + "start": 55268881 + }, + { + "end": 55269475, + "start": 55269428 + }, + { + "end": 55270769, + "start": 55270210 + } + ], + "is_best_transcript": false, + "name": "ENST00000455089", + "start": 55086714, + "translations": [ + { + "cdna_coding_end": 3533, + "cdna_coding_start": 258, + "domains": [ + { + "name": "PIRSF000619", + "regions": [ + { + "end": 1090, + "start": 1 + } + ] + }, + { + "name": "PF07714", + "regions": [ + { + "end": 920, + "start": 669 + } + ] + }, + { + "name": "SSF52058", + "regions": [ + { + "end": 191, + "start": 28 + }, + { + "end": 475, + "start": 283 + } + ] + }, + { + "name": "PF00757", + "regions": [ + { + "end": 293, + "start": 141 + } + ] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 934, + "start": 667 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 219, + "start": 145 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 290, + "start": 142 + }, + { + "end": 593, + "start": 460 + } + ] + }, + { + "name": "PR00109", + "regions": [ + { + "end": 758, + "start": 745 + }, + { + "end": 800, + "start": 782 + }, + { + "end": 841, + 
"start": 831 + }, + { + "end": 872, + "start": 850 + }, + { + "end": 916, + "start": 894 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 975, + "start": 651 + } + ] + }, + { + "name": "PF01030", + "regions": [ + { + "end": 141, + "start": 57 + }, + { + "end": 435, + "start": 316 + } + ] + }, + { + "name": "SM00220", + "regions": [ + { + "end": 924, + "start": 667 + } + ] + }, + { + "name": "SM00261", + "regions": [ + { + "end": 225, + "start": 183 + }, + { + "end": 502, + "start": 451 + }, + { + "end": 556, + "start": 507 + } + ] + }, + { + "name": "SM00219", + "regions": [ + { + "end": 923, + "start": 667 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 919, + "start": 667 + } + ] + } + ] + } + ] + }, + { + "end": 55236328, + "exons": [ + { + "end": 55087058, + "start": 55086725 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55214433, + "start": 55214299 + }, + { + "end": 55219055, + "start": 55218987 + }, + { + "end": 55220357, + "start": 55220239 + }, + { + "end": 55221845, + "start": 55221704 + }, + { + "end": 55223639, + "start": 55223523 + }, + { + "end": 55224352, + "start": 55224226 + }, + { + "end": 55224525, + "start": 55224452 + }, + { + "end": 55225446, + "start": 55225356 + }, + { + "end": 55228031, + "start": 55227832 + }, + { + "end": 55229324, + "start": 55229192 + }, + { + "end": 55231516, + "start": 55231426 + }, + { + "end": 55233130, + "start": 55232973 + }, + { + "end": 55236328, + "start": 55236216 + } + ], + "is_best_transcript": false, + "name": "ENST00000342916", + "start": 55086725, + "translations": [ + { + "cdna_coding_end": 2133, + "cdna_coding_start": 247, + "domains": [ + { + "name": "PS50311", + "regions": [ + { + "end": 264, + "start": 187 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 339, + "start": 182 + }, + { + "end": 624, + "start": 505 + } + ] + }, + { + "name": "SSF52058", + "regions": [ + { + "end": 
211, + "start": 29 + }, + { + "end": 520, + "start": 328 + } + ] + }, + { + "name": "PF00757", + "regions": [ + { + "end": 338, + "start": 185 + } + ] + }, + { + "name": "SM00261", + "regions": [ + { + "end": 270, + "start": 228 + }, + { + "end": 547, + "start": 496 + }, + { + "end": 601, + "start": 552 + } + ] + }, + { + "name": "PF01030", + "regions": [ + { + "end": 167, + "start": 57 + }, + { + "end": 480, + "start": 361 + } + ] + } + ] + } + ] + }, + { + "end": 55238738, + "exons": [ + { + "end": 55087058, + "start": 55086726 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55214433, + "start": 55214299 + }, + { + "end": 55219055, + "start": 55218987 + }, + { + "end": 55220357, + "start": 55220239 + }, + { + "end": 55221845, + "start": 55221704 + }, + { + "end": 55223639, + "start": 55223523 + }, + { + "end": 55224352, + "start": 55224226 + }, + { + "end": 55224525, + "start": 55224452 + }, + { + "end": 55225446, + "start": 55225356 + }, + { + "end": 55228031, + "start": 55227832 + }, + { + "end": 55229324, + "start": 55229192 + }, + { + "end": 55231516, + "start": 55231426 + }, + { + "end": 55233130, + "start": 55232973 + }, + { + "end": 55238738, + "start": 55238000 + } + ], + "is_best_transcript": false, + "name": "ENST00000344576", + "start": 55086726, + "translations": [ + { + "cdna_coding_end": 2363, + "cdna_coding_start": 246, + "domains": [ + { + "name": "SSF57184", + "regions": [ + { + "end": 339, + "start": 182 + }, + { + "end": 624, + "start": 505 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 264, + "start": 187 + } + ] + }, + { + "name": "PF00757", + "regions": [ + { + "end": 338, + "start": 185 + } + ] + }, + { + "name": "SSF52058", + "regions": [ + { + "end": 211, + "start": 29 + }, + { + "end": 520, + "start": 328 + } + ] + }, + { + "name": "SM00261", + "regions": [ + { + "end": 270, + "start": 228 + }, + { + "end": 547, + "start": 496 + }, + { + "end": 601, + 
"start": 552 + } + ] + }, + { + "name": "PF01030", + "regions": [ + { + "end": 167, + "start": 57 + }, + { + "end": 480, + "start": 361 + } + ] + } + ] + } + ] + }, + { + "end": 55224644, + "exons": [ + { + "end": 55087058, + "start": 55086727 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55214433, + "start": 55214299 + }, + { + "end": 55219055, + "start": 55218987 + }, + { + "end": 55220357, + "start": 55220239 + }, + { + "end": 55221845, + "start": 55221704 + }, + { + "end": 55223639, + "start": 55223523 + }, + { + "end": 55224352, + "start": 55224226 + }, + { + "end": 55224644, + "start": 55224452 + } + ], + "is_best_transcript": false, + "name": "ENST00000420316", + "start": 55086727, + "translations": [ + { + "cdna_coding_end": 1462, + "cdna_coding_start": 245, + "domains": [ + { + "name": "SSF57184", + "regions": [ + { + "end": 339, + "start": 182 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 264, + "start": 187 + } + ] + }, + { + "name": "PF00757", + "regions": [ + { + "end": 338, + "start": 185 + } + ] + }, + { + "name": "SSF52058", + "regions": [ + { + "end": 211, + "start": 29 + }, + { + "end": 403, + "start": 328 + } + ] + }, + { + "name": "SM00261", + "regions": [ + { + "end": 270, + "start": 228 + } + ] + }, + { + "name": "PF01030", + "regions": [ + { + "end": 167, + "start": 57 + } + ] + } + ] + } + ] + }, + { + "end": 55279321, + "exons": [ + { + "end": 55087058, + "start": 55086794 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55214433, + "start": 55214299 + }, + { + "end": 55219055, + "start": 55218987 + }, + { + "end": 55220357, + "start": 55220239 + }, + { + "end": 55221845, + "start": 55221704 + }, + { + "end": 55223639, + "start": 55223523 + }, + { + "end": 55224352, + "start": 55224226 + }, + { + "end": 55224525, + "start": 55224452 + }, + { + "end": 55225446, + "start": 55225356 + }, + { + 
"end": 55228031, + "start": 55227832 + }, + { + "end": 55229324, + "start": 55229192 + }, + { + "end": 55231516, + "start": 55231426 + }, + { + "end": 55233130, + "start": 55232973 + }, + { + "end": 55238906, + "start": 55238868 + }, + { + "end": 55240817, + "start": 55240676 + }, + { + "end": 55241736, + "start": 55241614 + }, + { + "end": 55242513, + "start": 55242415 + }, + { + "end": 55249171, + "start": 55248986 + }, + { + "end": 55259567, + "start": 55259412 + }, + { + "end": 55260534, + "start": 55260459 + }, + { + "end": 55266556, + "start": 55266410 + }, + { + "end": 55268106, + "start": 55268009 + }, + { + "end": 55269048, + "start": 55268881 + }, + { + "end": 55269475, + "start": 55269428 + }, + { + "end": 55270318, + "start": 55270210 + }, + { + "end": 55279321, + "start": 55272949 + } + ], + "is_best_transcript": true, + "name": "ENST00000275493", + "start": 55086794, + "translations": [ + { + "cdna_coding_end": 3810, + "cdna_coding_start": 178, + "domains": [ + { + "name": "SM00220", + "regions": [ + { + "end": 969, + "start": 712 + } + ] + }, + { + "name": "PF01030", + "regions": [ + { + "end": 167, + "start": 57 + }, + { + "end": 480, + "start": 361 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 1020, + "start": 696 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 964, + "start": 712 + } + ] + }, + { + "name": "SM00219", + "regions": [ + { + "end": 968, + "start": 712 + } + ] + }, + { + "name": "SM00261", + "regions": [ + { + "end": 270, + "start": 228 + }, + { + "end": 547, + "start": 496 + }, + { + "end": 601, + "start": 552 + } + ] + }, + { + "name": "PF00757", + "regions": [ + { + "end": 338, + "start": 185 + } + ] + }, + { + "name": "SSF52058", + "regions": [ + { + "end": 211, + "start": 29 + }, + { + "end": 520, + "start": 328 + } + ] + }, + { + "name": "PF07714", + "regions": [ + { + "end": 965, + "start": 714 + } + ] + }, + { + "name": "PIRSF000619", + "regions": [ + { + "end": 1210, + "start": 1 + } + ] 
+ }, + { + "name": "PR00109", + "regions": [ + { + "end": 803, + "start": 790 + }, + { + "end": 845, + "start": 827 + }, + { + "end": 886, + "start": 876 + }, + { + "end": 917, + "start": 895 + }, + { + "end": 961, + "start": 939 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 339, + "start": 182 + }, + { + "end": 638, + "start": 505 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 264, + "start": 187 + } + ] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 979, + "start": 712 + } + ] + } + ] + } + ] + }, + { + "end": 55324313, + "exons": [ + { + "end": 55087058, + "start": 55086811 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55214433, + "start": 55214299 + }, + { + "end": 55219055, + "start": 55218987 + }, + { + "end": 55220357, + "start": 55220239 + }, + { + "end": 55221845, + "start": 55221704 + }, + { + "end": 55223639, + "start": 55223523 + }, + { + "end": 55224352, + "start": 55224226 + }, + { + "end": 55224525, + "start": 55224452 + }, + { + "end": 55225446, + "start": 55225356 + }, + { + "end": 55228031, + "start": 55227832 + }, + { + "end": 55229324, + "start": 55229192 + }, + { + "end": 55231516, + "start": 55231426 + }, + { + "end": 55233130, + "start": 55232973 + }, + { + "end": 55238906, + "start": 55238868 + }, + { + "end": 55240621, + "start": 55240539 + }, + { + "end": 55324313, + "start": 55323947 + } + ], + "is_best_transcript": false, + "name": "ENST00000442591", + "start": 55086811, + "translations": [ + { + "cdna_coding_end": 2134, + "cdna_coding_start": 161, + "domains": [ + { + "name": "PF01030", + "regions": [ + { + "end": 167, + "start": 57 + }, + { + "end": 480, + "start": 361 + } + ] + }, + { + "name": "SM00261", + "regions": [ + { + "end": 270, + "start": 228 + }, + { + "end": 547, + "start": 496 + }, + { + "end": 601, + "start": 552 + }, + { + "end": 653, + "start": 614 + } + ] + }, + { + "name": "SSF52058", + "regions": [ 
+ { + "end": 211, + "start": 29 + }, + { + "end": 520, + "start": 328 + } + ] + }, + { + "name": "PF00757", + "regions": [ + { + "end": 338, + "start": 185 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 264, + "start": 187 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 339, + "start": 182 + }, + { + "end": 638, + "start": 505 + } + ] + } + ] + } + ] + }, + { + "end": 55214417, + "exons": [ + { + "end": 55177651, + "start": 55177416 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55214417, + "start": 55214299 + } + ], + "is_best_transcript": false, + "name": "ENST00000450046", + "start": 55177416, + "translations": [ + { + "cdna_coding_end": 691, + "cdna_coding_start": 308, + "domains": [ + { + "name": "SSF52058", + "regions": [ + { + "end": 127, + "start": 1 + } + ] + }, + { + "name": "PF01030", + "regions": [ + { + "end": 114, + "start": 4 + } + ] + } + ] + } + ] + }, + { + "end": 55273591, + "exons": [ + { + "end": 55177651, + "start": 55177540 + }, + { + "end": 55210130, + "start": 55209979 + }, + { + "end": 55211181, + "start": 55210998 + }, + { + "end": 55214433, + "start": 55214299 + }, + { + "end": 55219055, + "start": 55218987 + }, + { + "end": 55220357, + "start": 55220239 + }, + { + "end": 55221845, + "start": 55221704 + }, + { + "end": 55223639, + "start": 55223523 + }, + { + "end": 55224352, + "start": 55224226 + }, + { + "end": 55224525, + "start": 55224452 + }, + { + "end": 55225446, + "start": 55225356 + }, + { + "end": 55228031, + "start": 55227832 + }, + { + "end": 55229324, + "start": 55229192 + }, + { + "end": 55231516, + "start": 55231426 + }, + { + "end": 55233130, + "start": 55232973 + }, + { + "end": 55238906, + "start": 55238868 + }, + { + "end": 55240817, + "start": 55240676 + }, + { + "end": 55241736, + "start": 55241614 + }, + { + "end": 55242513, + "start": 55242415 + }, + { + "end": 55249171, + "start": 55248986 + }, + { + "end": 
55259567, + "start": 55259412 + }, + { + "end": 55260534, + "start": 55260459 + }, + { + "end": 55266556, + "start": 55266410 + }, + { + "end": 55268106, + "start": 55268009 + }, + { + "end": 55269048, + "start": 55268881 + }, + { + "end": 55269475, + "start": 55269428 + }, + { + "end": 55270318, + "start": 55270210 + }, + { + "end": 55273591, + "start": 55272949 + } + ], + "is_best_transcript": false, + "name": "ENST00000454757", + "start": 55177540, + "translations": [ + { + "cdna_coding_end": 3657, + "cdna_coding_start": 184, + "domains": [ + { + "name": "SM00261", + "regions": [ + { + "end": 217, + "start": 175 + }, + { + "end": 494, + "start": 443 + }, + { + "end": 548, + "start": 499 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 911, + "start": 659 + } + ] + }, + { + "name": "SM00219", + "regions": [ + { + "end": 915, + "start": 659 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 967, + "start": 643 + } + ] + }, + { + "name": "SM00220", + "regions": [ + { + "end": 916, + "start": 659 + } + ] + }, + { + "name": "PF01030", + "regions": [ + { + "end": 114, + "start": 4 + }, + { + "end": 427, + "start": 308 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 211, + "start": 134 + } + ] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 926, + "start": 659 + } + ] + }, + { + "name": "PR00109", + "regions": [ + { + "end": 750, + "start": 737 + }, + { + "end": 792, + "start": 774 + }, + { + "end": 833, + "start": 823 + }, + { + "end": 864, + "start": 842 + }, + { + "end": 908, + "start": 886 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 286, + "start": 129 + }, + { + "end": 585, + "start": 452 + } + ] + }, + { + "name": "PIRSF000619", + "regions": [ + { + "end": 1157, + "start": 1 + } + ] + }, + { + "name": "PF07714", + "regions": [ + { + "end": 912, + "start": 661 + } + ] + }, + { + "name": "SSF52058", + "regions": [ + { + "end": 158, + "start": 1 + }, + { + "end": 467, + "start": 275 
+ } + ] + }, + { + "name": "PF00757", + "regions": [ + { + "end": 285, + "start": 132 + } + ] + } + ] + } + ] + } + ] + }, + { + "aliases": [ + "DSTYK" + ], + "chr": "1", + "end": 205180727, + "name": "ENSG00000133059", + "start": 205111632, + "strand": "-", + "transcripts": [ + { + "end": 205180727, + "exons": [ + { + "end": 205116873, + "start": 205111632 + }, + { + "end": 205117467, + "start": 205117333 + }, + { + "end": 205119898, + "start": 205119808 + }, + { + "end": 205133083, + "start": 205133055 + }, + { + "end": 205138960, + "start": 205138291 + }, + { + "end": 205156934, + "start": 205156546 + }, + { + "end": 205180727, + "start": 205180399 + } + ], + "is_best_transcript": false, + "name": "ENST00000367160", + "start": 205111632, + "translations": [ + { + "cdna_coding_end": 1831, + "cdna_coding_start": 65, + "domains": [ + { + "name": "SM00220", + "regions": [ + { + "end": 565, + "start": 337 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 585, + "start": 452 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 556, + "start": 451 + } + ] + }, + { + "name": "PF07714", + "regions": [ + { + "end": 558, + "start": 471 + } + ] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 565, + "start": 312 + } + ] + } + ] + } + ] + }, + { + "end": 205180694, + "exons": [ + { + "end": 205116873, + "start": 205111633 + }, + { + "end": 205119922, + "start": 205119808 + }, + { + "end": 205126514, + "start": 205126401 + }, + { + "end": 205128807, + "start": 205128675 + }, + { + "end": 205129398, + "start": 205129242 + }, + { + "end": 205130515, + "start": 205130386 + }, + { + "end": 205131340, + "start": 205131164 + }, + { + "end": 205132134, + "start": 205132051 + }, + { + "end": 205133083, + "start": 205132851 + }, + { + "end": 205138960, + "start": 205138291 + }, + { + "end": 205156934, + "start": 205156546 + }, + { + "end": 205180694, + "start": 205180399 + } + ], + "is_best_transcript": false, + "name": "ENST00000367161", + 
"start": 205111633, + "translations": [ + { + "cdna_coding_end": 2686, + "cdna_coding_start": 32, + "domains": [ + { + "name": "PF07714", + "regions": [ + { + "end": 820, + "start": 654 + } + ] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 884, + "start": 652 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 853, + "start": 627 + } + ] + }, + { + "name": "SM00220", + "regions": [ + { + "end": 861, + "start": 652 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 824, + "start": 654 + } + ] + }, + { + "name": "SM00219", + "regions": [ + { + "end": 861, + "start": 652 + } + ] + } + ] + } + ] + }, + { + "end": 205180694, + "exons": [ + { + "end": 205116873, + "start": 205111633 + }, + { + "end": 205117467, + "start": 205117333 + }, + { + "end": 205119922, + "start": 205119808 + }, + { + "end": 205126514, + "start": 205126401 + }, + { + "end": 205128807, + "start": 205128675 + }, + { + "end": 205129398, + "start": 205129242 + }, + { + "end": 205130515, + "start": 205130386 + }, + { + "end": 205131340, + "start": 205131164 + }, + { + "end": 205132134, + "start": 205132051 + }, + { + "end": 205133083, + "start": 205132851 + }, + { + "end": 205138960, + "start": 205138291 + }, + { + "end": 205156934, + "start": 205156546 + }, + { + "end": 205180694, + "start": 205180399 + } + ], + "is_best_transcript": true, + "name": "ENST00000367162", + "start": 205111633, + "translations": [ + { + "cdna_coding_end": 2821, + "cdna_coding_start": 32, + "domains": [ + { + "name": "PF07714", + "regions": [ + { + "end": 899, + "start": 654 + } + ] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 906, + "start": 652 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 897, + "start": 638 + } + ] + }, + { + "name": "SM00220", + "regions": [ + { + "end": 906, + "start": 652 + } + ] + }, + { + "name": "SM00219", + "regions": [ + { + "end": 906, + "start": 652 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 897, 
+ "start": 654 + } + ] + } + ] + } + ] + } + ] + }, + { + "aliases": [ + "NDUFA12" + ], + "chr": "12", + "end": 95397546, + "name": "ENSG00000184752", + "start": 95290831, + "strand": "-", + "transcripts": [ + { + "end": 95397436, + "exons": [ + { + "end": 95291086, + "start": 95290831 + }, + { + "end": 95318582, + "start": 95318422 + }, + { + "end": 95322039, + "start": 95321793 + }, + { + "end": 95396597, + "start": 95396515 + }, + { + "end": 95397436, + "start": 95397371 + } + ], + "is_best_transcript": false, + "name": "ENST00000552205", + "start": 95290831 + }, + { + "end": 95397476, + "exons": [ + { + "end": 95365261, + "start": 95365108 + }, + { + "end": 95396597, + "start": 95396582 + }, + { + "end": 95397476, + "start": 95397371 + } + ], + "is_best_transcript": false, + "name": "ENST00000547157", + "start": 95365108, + "translations": [ + { + "cdna_coding_end": 188, + "cdna_coding_start": 21 + } + ] + }, + { + "end": 95397384, + "exons": [ + { + "end": 95365396, + "start": 95365109 + }, + { + "end": 95388033, + "start": 95387946 + }, + { + "end": 95390752, + "start": 95390680 + }, + { + "end": 95396597, + "start": 95396515 + }, + { + "end": 95397384, + "start": 95397371 + } + ], + "is_best_transcript": false, + "name": "ENST00000551991", + "start": 95365109, + "translations": [ + { + "cdna_coding_end": 144, + "cdna_coding_start": 1, + "domains": [ + { + "name": "PF05071", + "regions": [ + { + "end": 33, + "start": 12 + } + ] + } + ] + } + ] + }, + { + "end": 95397546, + "exons": [ + { + "end": 95365396, + "start": 95365109 + }, + { + "end": 95388033, + "start": 95387946 + }, + { + "end": 95396597, + "start": 95396515 + }, + { + "end": 95397546, + "start": 95397371 + } + ], + "is_best_transcript": true, + "name": "ENST00000327772", + "start": 95365109, + "translations": [ + { + "cdna_coding_end": 528, + "cdna_coding_start": 91, + "domains": [ + { + "name": "PF05071", + "regions": [ + { + "end": 137, + "start": 36 + } + ] + } + ] + } + ] + }, + { + "end": 
95397489, + "exons": [ + { + "end": 95365396, + "start": 95365112 + }, + { + "end": 95396597, + "start": 95396515 + }, + { + "end": 95397489, + "start": 95397371 + } + ], + "is_best_transcript": false, + "name": "ENST00000547986", + "start": 95365112, + "translations": [ + { + "cdna_coding_end": 225, + "cdna_coding_start": 34, + "domains": [ + { + "name": "PF05071", + "regions": [ + { + "end": 53, + "start": 36 + } + ] + } + ] + } + ] + }, + { + "end": 95397524, + "exons": [ + { + "end": 95365396, + "start": 95365254 + }, + { + "end": 95366265, + "start": 95366171 + }, + { + "end": 95388033, + "start": 95387946 + }, + { + "end": 95396597, + "start": 95396515 + }, + { + "end": 95397524, + "start": 95397371 + } + ], + "is_best_transcript": false, + "name": "ENST00000546788", + "start": 95365254, + "translations": [ + { + "cdna_coding_end": 368, + "cdna_coding_start": 69, + "domains": [ + { + "name": "PF05071", + "regions": [ + { + "end": 87, + "start": 36 + } + ] + } + ] + } + ] + } + ] + }, + { + "aliases": [ + "FRMD6" + ], + "chr": "14", + "end": 52197445, + "name": "ENSG00000139926", + "start": 51955818, + "strand": "+", + "transcripts": [ + { + "end": 52197177, + "exons": [ + { + "end": 51956138, + "start": 51955855 + }, + { + "end": 52037128, + "start": 52037066 + }, + { + "end": 52156653, + "start": 52156409 + }, + { + "end": 52164950, + "start": 52164860 + }, + { + "end": 52167853, + "start": 52167774 + }, + { + "end": 52169306, + "start": 52169230 + }, + { + "end": 52171653, + "start": 52171467 + }, + { + "end": 52174951, + "start": 52174796 + }, + { + "end": 52178314, + "start": 52178249 + }, + { + "end": 52179269, + "start": 52179201 + }, + { + "end": 52182217, + "start": 52182043 + }, + { + "end": 52187108, + "start": 52186773 + }, + { + "end": 52188798, + "start": 52188667 + }, + { + "end": 52192588, + "start": 52192497 + }, + { + "end": 52197177, + "start": 52194463 + } + ], + "is_best_transcript": false, + "name": "ENST00000356218", + "start": 51955855, 
+ "translations": [ + { + "cdna_coding_end": 2338, + "cdna_coding_start": 494, + "domains": [ + { + "name": "PF09379", + "regions": [ + { + "end": 109, + "start": 20 + } + ] + }, + { + "name": "PF09380", + "regions": [ + { + "end": 322, + "start": 237 + } + ] + }, + { + "name": "SSF50729", + "regions": [ + { + "end": 375, + "start": 219 + } + ] + }, + { + "name": "SM00295", + "regions": [ + { + "end": 226, + "start": 12 + } + ] + }, + { + "name": "PS50057", + "regions": [ + { + "end": 320, + "start": 16 + } + ] + }, + { + "name": "PF00373", + "regions": [ + { + "end": 226, + "start": 115 + } + ] + }, + { + "name": "SSF47031", + "regions": [ + { + "end": 218, + "start": 110 + } + ] + }, + { + "name": "SSF54236", + "regions": [ + { + "end": 110, + "start": 14 + } + ] + } + ] + } + ] + }, + { + "end": 52197445, + "exons": [ + { + "end": 52118714, + "start": 52118576 + }, + { + "end": 52156653, + "start": 52156409 + }, + { + "end": 52164950, + "start": 52164860 + }, + { + "end": 52167853, + "start": 52167774 + }, + { + "end": 52169306, + "start": 52169230 + }, + { + "end": 52171653, + "start": 52171467 + }, + { + "end": 52174951, + "start": 52174796 + }, + { + "end": 52178314, + "start": 52178249 + }, + { + "end": 52179269, + "start": 52179201 + }, + { + "end": 52182217, + "start": 52182043 + }, + { + "end": 52187108, + "start": 52186773 + }, + { + "end": 52188798, + "start": 52188667 + }, + { + "end": 52192588, + "start": 52192497 + }, + { + "end": 52197445, + "start": 52194463 + } + ], + "is_best_transcript": true, + "name": "ENST00000395718", + "start": 52118576, + "translations": [ + { + "cdna_coding_end": 2130, + "cdna_coding_start": 286, + "domains": [ + { + "name": "PF00373", + "regions": [ + { + "end": 226, + "start": 115 + } + ] + }, + { + "name": "SSF47031", + "regions": [ + { + "end": 218, + "start": 110 + } + ] + }, + { + "name": "SSF54236", + "regions": [ + { + "end": 110, + "start": 14 + } + ] + }, + { + "name": "PS50057", + "regions": [ + { + "end": 320, 
+ "start": 16 + } + ] + }, + { + "name": "SM00295", + "regions": [ + { + "end": 226, + "start": 12 + } + ] + }, + { + "name": "SSF50729", + "regions": [ + { + "end": 375, + "start": 219 + } + ] + }, + { + "name": "PF09380", + "regions": [ + { + "end": 322, + "start": 237 + } + ] + }, + { + "name": "PF09379", + "regions": [ + { + "end": 109, + "start": 20 + } + ] + } + ] + } + ] + }, + { + "end": 52195654, + "exons": [ + { + "end": 52118714, + "start": 52118665 + }, + { + "end": 52156653, + "start": 52156409 + }, + { + "end": 52164950, + "start": 52164860 + }, + { + "end": 52167877, + "start": 52167774 + }, + { + "end": 52169306, + "start": 52169230 + }, + { + "end": 52171653, + "start": 52171467 + }, + { + "end": 52174951, + "start": 52174796 + }, + { + "end": 52178314, + "start": 52178249 + }, + { + "end": 52179269, + "start": 52179201 + }, + { + "end": 52182217, + "start": 52182043 + }, + { + "end": 52187108, + "start": 52186773 + }, + { + "end": 52188798, + "start": 52188667 + }, + { + "end": 52192588, + "start": 52192497 + }, + { + "end": 52195654, + "start": 52194463 + } + ], + "is_best_transcript": false, + "name": "ENST00000344768", + "start": 52118665, + "translations": [ + { + "cdna_coding_end": 2065, + "cdna_coding_start": 197, + "domains": [ + { + "name": "PF09380", + "regions": [ + { + "end": 330, + "start": 245 + } + ] + }, + { + "name": "PF09379", + "regions": [ + { + "end": 117, + "start": 20 + } + ] + }, + { + "name": "SSF47031", + "regions": [ + { + "end": 226, + "start": 118 + } + ] + }, + { + "name": "PF00373", + "regions": [ + { + "end": 234, + "start": 123 + } + ] + }, + { + "name": "SSF54236", + "regions": [ + { + "end": 118, + "start": 14 + } + ] + }, + { + "name": "PS50057", + "regions": [ + { + "end": 328, + "start": 16 + } + ] + }, + { + "name": "SM00295", + "regions": [ + { + "end": 234, + "start": 12 + } + ] + }, + { + "name": "SSF50729", + "regions": [ + { + "end": 383, + "start": 227 + } + ] + } + ] + } + ] + }, + { + "end": 52164945, 
+ "exons": [ + { + "end": 52118935, + "start": 52118698 + }, + { + "end": 52156653, + "start": 52156409 + }, + { + "end": 52164945, + "start": 52164860 + } + ], + "is_best_transcript": false, + "name": "ENST00000554778", + "start": 52118698 + }, + { + "end": 52174806, + "exons": [ + { + "end": 52164950, + "start": 52164706 + }, + { + "end": 52167877, + "start": 52167774 + }, + { + "end": 52169306, + "start": 52169230 + }, + { + "end": 52171653, + "start": 52171467 + }, + { + "end": 52174806, + "start": 52174796 + } + ], + "is_best_transcript": false, + "name": "ENST00000555936", + "start": 52164706 + }, + { + "end": 52197148, + "exons": [ + { + "end": 52164950, + "start": 52164831 + }, + { + "end": 52167853, + "start": 52167774 + }, + { + "end": 52169306, + "start": 52169230 + }, + { + "end": 52171653, + "start": 52171467 + }, + { + "end": 52174951, + "start": 52174796 + }, + { + "end": 52178314, + "start": 52178249 + }, + { + "end": 52179269, + "start": 52179201 + }, + { + "end": 52182217, + "start": 52182043 + }, + { + "end": 52187108, + "start": 52186773 + }, + { + "end": 52188798, + "start": 52188667 + }, + { + "end": 52192588, + "start": 52192497 + }, + { + "end": 52197148, + "start": 52194463 + } + ], + "is_best_transcript": false, + "name": "ENST00000554167", + "start": 52164831, + "translations": [ + { + "cdna_coding_end": 1775, + "cdna_coding_start": 138, + "domains": [ + { + "name": "SSF50729", + "regions": [ + { + "end": 306, + "start": 150 + } + ] + }, + { + "name": "PS50057", + "regions": [ + { + "end": 251, + "start": 1 + } + ] + }, + { + "name": "SSF54236", + "regions": [ + { + "end": 41, + "start": 1 + } + ] + }, + { + "name": "SSF47031", + "regions": [ + { + "end": 149, + "start": 41 + } + ] + }, + { + "name": "PF00373", + "regions": [ + { + "end": 157, + "start": 46 + } + ] + }, + { + "name": "PF09380", + "regions": [ + { + "end": 253, + "start": 168 + } + ] + } + ] + } + ] + }, + { + "end": 52175062, + "exons": [ + { + "end": 52169306, + "start": 
52169266 + }, + { + "end": 52171653, + "start": 52171467 + }, + { + "end": 52175062, + "start": 52174796 + } + ], + "is_best_transcript": false, + "name": "ENST00000557405", + "start": 52169266, + "translations": [ + { + "cdna_coding_end": 390, + "cdna_coding_start": 1, + "domains": [ + { + "name": "PS50057", + "regions": [ + { + "end": 129, + "start": 1 + } + ] + }, + { + "name": "PF00373", + "regions": [ + { + "end": 124, + "start": 13 + } + ] + }, + { + "name": "SSF47031", + "regions": [ + { + "end": 116, + "start": 8 + } + ] + } + ] + } + ] + }, + { + "end": 52187243, + "exons": [ + { + "end": 52179269, + "start": 52179231 + }, + { + "end": 52182217, + "start": 52182043 + }, + { + "end": 52187243, + "start": 52186773 + } + ], + "is_best_transcript": false, + "name": "ENST00000555197", + "start": 52179231, + "translations": [ + { + "cdna_coding_end": 618, + "cdna_coding_start": 1, + "domains": [ + { + "name": "PF09380", + "regions": [ + { + "end": 60, + "start": 2 + } + ] + }, + { + "name": "PS50057", + "regions": [ + { + "end": 58, + "start": 1 + } + ] + }, + { + "name": "SSF50729", + "regions": [ + { + "end": 113, + "start": 2 + } + ] + } + ] + } + ] + }, + { + "end": 52192513, + "exons": [ + { + "end": 52184066, + "start": 52183973 + }, + { + "end": 52187108, + "start": 52186773 + }, + { + "end": 52188798, + "start": 52188673 + }, + { + "end": 52192513, + "start": 52192497 + } + ], + "is_best_transcript": false, + "name": "ENST00000555703", + "start": 52183973, + "translations": [ + { + "cdna_coding_end": 573, + "cdna_coding_start": 145 + } + ] + }, + { + "end": 52195487, + "exons": [ + { + "end": 52184066, + "start": 52183973 + }, + { + "end": 52187108, + "start": 52186773 + }, + { + "end": 52188798, + "start": 52188667 + }, + { + "end": 52192588, + "start": 52192497 + }, + { + "end": 52195487, + "start": 52194463 + } + ], + "is_best_transcript": false, + "name": "ENST00000553556", + "start": 52183973, + "translations": [ + { + "cdna_coding_end": 939, + 
"cdna_coding_start": 145 + } + ] + } + ] + }, + { + "aliases": [ + "PRKCB" + ], + "chr": "16", + "end": 24231932, + "name": "ENSG00000166501", + "start": 23847322, + "strand": "+", + "transcripts": [ + { + "end": 24231932, + "exons": [ + { + "end": 23847669, + "start": 23847322 + }, + { + "end": 23848727, + "start": 23848696 + }, + { + "end": 23999911, + "start": 23999829 + }, + { + "end": 24043568, + "start": 24043457 + }, + { + "end": 24046868, + "start": 24046740 + }, + { + "end": 24104268, + "start": 24104112 + }, + { + "end": 24105618, + "start": 24105484 + }, + { + "end": 24124390, + "start": 24124294 + }, + { + "end": 24135302, + "start": 24135156 + }, + { + "end": 24166178, + "start": 24166005 + }, + { + "end": 24183682, + "start": 24183591 + }, + { + "end": 24185901, + "start": 24185839 + }, + { + "end": 24192249, + "start": 24192111 + }, + { + "end": 24196512, + "start": 24196432 + }, + { + "end": 24196888, + "start": 24196781 + }, + { + "end": 24202551, + "start": 24202411 + }, + { + "end": 24231932, + "start": 24231282 + } + ], + "is_best_transcript": true, + "name": "ENST00000321728", + "start": 23847322, + "translations": [ + { + "cdna_coding_end": 2191, + "cdna_coding_start": 176, + "domains": [ + { + "name": "SM00239", + "regions": [ + { + "end": 275, + "start": 172 + } + ] + }, + { + "name": "PF07714", + "regions": [ + { + "end": 583, + "start": 344 + } + ] + }, + { + "name": "SSF49562", + "regions": [ + { + "end": 288, + "start": 157 + } + ] + }, + { + "name": "SM00109", + "regions": [ + { + "end": 86, + "start": 37 + }, + { + "end": 151, + "start": 102 + } + ] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 600, + "start": 342 + } + ] + }, + { + "name": "PR00008", + "regions": [ + { + "end": 48, + "start": 34 + }, + { + "end": 59, + "start": 50 + }, + { + "end": 74, + "start": 63 + }, + { + "end": 152, + "start": 140 + } + ] + }, + { + "name": "PF00433", + "regions": [ + { + "end": 666, + "start": 623 + } + ] + }, + { + "name": 
"SM00220", + "regions": [ + { + "end": 600, + "start": 342 + } + ] + }, + { + "name": "PF00168", + "regions": [ + { + "end": 259, + "start": 175 + } + ] + }, + { + "name": "SSF57889", + "regions": [ + { + "end": 92, + "start": 6 + }, + { + "end": 157, + "start": 101 + } + ] + }, + { + "name": "PF00130", + "regions": [ + { + "end": 87, + "start": 37 + }, + { + "end": 153, + "start": 102 + } + ] + }, + { + "name": "PS50081", + "regions": [ + { + "end": 86, + "start": 36 + }, + { + "end": 151, + "start": 101 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 627, + "start": 317 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 586, + "start": 343 + } + ] + }, + { + "name": "SM00219", + "regions": [ + { + "end": 576, + "start": 342 + } + ] + }, + { + "name": "PR00360", + "regions": [ + { + "end": 200, + "start": 188 + }, + { + "end": 230, + "start": 217 + }, + { + "end": 248, + "start": 240 + } + ] + }, + { + "name": "SM00133", + "regions": [ + { + "end": 664, + "start": 601 + } + ] + }, + { + "name": "PS50004", + "regions": [ + { + "end": 260, + "start": 173 + } + ] + }, + { + "name": "PIRSF000550", + "regions": [ + { + "end": 671, + "start": 1 + } + ] + } + ] + } + ] + }, + { + "end": 24231932, + "exons": [ + { + "end": 23847669, + "start": 23847345 + }, + { + "end": 23848727, + "start": 23848696 + }, + { + "end": 23999911, + "start": 23999829 + }, + { + "end": 24043568, + "start": 24043457 + }, + { + "end": 24046868, + "start": 24046740 + }, + { + "end": 24104268, + "start": 24104112 + }, + { + "end": 24105618, + "start": 24105484 + }, + { + "end": 24124390, + "start": 24124294 + }, + { + "end": 24135302, + "start": 24135156 + }, + { + "end": 24166178, + "start": 24166005 + }, + { + "end": 24183682, + "start": 24183591 + }, + { + "end": 24185901, + "start": 24185839 + }, + { + "end": 24192249, + "start": 24192111 + }, + { + "end": 24196512, + "start": 24196432 + }, + { + "end": 24196888, + "start": 24196781 + }, + { + "end": 24202551, 
+ "start": 24202411 + }, + { + "end": 24231932, + "start": 24225979 + } + ], + "is_best_transcript": false, + "name": "ENST00000303531", + "start": 23847345, + "translations": [ + { + "cdna_coding_end": 2174, + "cdna_coding_start": 153, + "domains": [ + { + "name": "SM00133", + "regions": [ + { + "end": 663, + "start": 601 + } + ] + }, + { + "name": "PS50004", + "regions": [ + { + "end": 260, + "start": 173 + } + ] + }, + { + "name": "PIRSF000550", + "regions": [ + { + "end": 672, + "start": 1 + } + ] + }, + { + "name": "PF00069", + "regions": [ + { + "end": 586, + "start": 343 + } + ] + }, + { + "name": "PR00360", + "regions": [ + { + "end": 200, + "start": 188 + }, + { + "end": 230, + "start": 217 + }, + { + "end": 248, + "start": 240 + } + ] + }, + { + "name": "SM00219", + "regions": [ + { + "end": 576, + "start": 342 + } + ] + }, + { + "name": "PS50081", + "regions": [ + { + "end": 86, + "start": 36 + }, + { + "end": 151, + "start": 101 + } + ] + }, + { + "name": "SSF56112", + "regions": [ + { + "end": 627, + "start": 317 + } + ] + }, + { + "name": "SM00220", + "regions": [ + { + "end": 600, + "start": 342 + } + ] + }, + { + "name": "PF00433", + "regions": [ + { + "end": 664, + "start": 627 + } + ] + }, + { + "name": "PF00130", + "regions": [ + { + "end": 87, + "start": 37 + }, + { + "end": 153, + "start": 102 + } + ] + }, + { + "name": "PF00168", + "regions": [ + { + "end": 259, + "start": 175 + } + ] + }, + { + "name": "SSF57889", + "regions": [ + { + "end": 92, + "start": 6 + }, + { + "end": 157, + "start": 101 + } + ] + }, + { + "name": "PR00008", + "regions": [ + { + "end": 48, + "start": 34 + }, + { + "end": 59, + "start": 50 + }, + { + "end": 74, + "start": 63 + }, + { + "end": 152, + "start": 140 + } + ] + }, + { + "name": "PS50011", + "regions": [ + { + "end": 600, + "start": 342 + } + ] + }, + { + "name": "SM00109", + "regions": [ + { + "end": 86, + "start": 37 + }, + { + "end": 151, + "start": 102 + } + ] + }, + { + "name": "PF07714", + "regions": [ 
+ { + "end": 583, + "start": 344 + } + ] + }, + { + "name": "SSF49562", + "regions": [ + { + "end": 288, + "start": 157 + } + ] + }, + { + "name": "SM00239", + "regions": [ + { + "end": 275, + "start": 172 + } + ] + } + ] + } + ] + }, + { + "end": 23880647, + "exons": [ + { + "end": 23847669, + "start": 23847403 + }, + { + "end": 23880647, + "start": 23880435 + } + ], + "is_best_transcript": false, + "name": "ENST00000498058", + "start": 23847403, + "translations": [ + { + "cdna_coding_end": 268, + "cdna_coding_start": 95, + "domains": [ + { + "name": "PR00008", + "regions": [ + { + "end": 48, + "start": 34 + }, + { + "end": 57, + "start": 50 + } + ] + }, + { + "name": "PS50081", + "regions": [ + { + "end": 57, + "start": 36 + } + ] + }, + { + "name": "SSF57889", + "regions": [ + { + "end": 57, + "start": 6 + } + ] + } + ] + } + ] + }, + { + "end": 24124386, + "exons": [ + { + "end": 23848727, + "start": 23848544 + }, + { + "end": 24104268, + "start": 24104112 + }, + { + "end": 24105618, + "start": 24105484 + }, + { + "end": 24124386, + "start": 24124294 + } + ], + "is_best_transcript": false, + "name": "ENST00000498739", + "start": 23848544 + }, + { + "end": 24192166, + "exons": [ + { + "end": 24163176, + "start": 24163006 + }, + { + "end": 24166178, + "start": 24166005 + }, + { + "end": 24183682, + "start": 24183591 + }, + { + "end": 24185901, + "start": 24185839 + }, + { + "end": 24192166, + "start": 24192111 + } + ], + "is_best_transcript": false, + "name": "ENST00000472066", + "start": 24163006 + }, + { + "end": 24202909, + "exons": [ + { + "end": 24196888, + "start": 24196852 + }, + { + "end": 24202909, + "start": 24202411 + } + ], + "is_best_transcript": false, + "name": "ENST00000466124", + "start": 24196852 + } + ] + }, + { + "aliases": [ + "GIMAP4" + ], + "chr": "7", + "end": 150271041, + "name": "ENSG00000133574", + "start": 150264365, + "strand": "+", + "transcripts": [ + { + "end": 150271041, + "exons": [ + { + "end": 150264525, + "start": 150264365 + 
}, + { + "end": 150267047, + "start": 150266976 + }, + { + "end": 150271041, + "start": 150269217 + } + ], + "is_best_transcript": true, + "name": "ENST00000255945", + "start": 150264365, + "translations": [ + { + "cdna_coding_end": 1165, + "cdna_coding_start": 176, + "domains": [ + { + "name": "PF04548", + "regions": [ + { + "end": 238, + "start": 31 + } + ] + }, + { + "name": "SSF52540", + "regions": [ + { + "end": 288, + "start": 24 + } + ] + } + ] + } + ] + }, + { + "end": 150270602, + "exons": [ + { + "end": 150264525, + "start": 150264457 + }, + { + "end": 150267089, + "start": 150266976 + }, + { + "end": 150270602, + "start": 150269217 + } + ], + "is_best_transcript": false, + "name": "ENST00000461940", + "start": 150264457, + "translations": [ + { + "cdna_coding_end": 1115, + "cdna_coding_start": 84, + "domains": [ + { + "name": "PF04548", + "regions": [ + { + "end": 252, + "start": 45 + } + ] + }, + { + "name": "SSF52540", + "regions": [ + { + "end": 302, + "start": 38 + } + ] + } + ] + } + ] + }, + { + "end": 150269569, + "exons": [ + { + "end": 150264608, + "start": 150264524 + }, + { + "end": 150267089, + "start": 150266976 + }, + { + "end": 150269569, + "start": 150269217 + } + ], + "is_best_transcript": false, + "name": "ENST00000479232", + "start": 150264524, + "translations": [ + { + "cdna_coding_end": 552, + "cdna_coding_start": 100, + "domains": [ + { + "name": "SSF52540", + "regions": [ + { + "end": 151, + "start": 38 + } + ] + }, + { + "name": "PF04548", + "regions": [ + { + "end": 151, + "start": 45 + } + ] + } + ] + } + ] + } + ] + }, + { + "aliases": [ + "IL7" + ], + "chr": "8", + "end": 79717758, + "name": "ENSG00000104432", + "start": 79587978, + "strand": "-", + "transcripts": [ + { + "end": 79717758, + "exons": [ + { + "end": 79646067, + "start": 79645007 + }, + { + "end": 79648762, + "start": 79648709 + }, + { + "end": 79650870, + "start": 79650739 + }, + { + "end": 79652317, + "start": 79652237 + }, + { + "end": 79710443, + "start": 
79710307 + }, + { + "end": 79717758, + "start": 79717148 + } + ], + "is_best_transcript": true, + "name": "ENST00000263851", + "start": 79645007, + "translations": [ + { + "cdna_coding_end": 1135, + "cdna_coding_start": 602, + "domains": [ + { + "name": "PIRSF001942", + "regions": [ + { + "end": 177, + "start": 1 + } + ] + }, + { + "name": "PR00435", + "regions": [ + { + "end": 25, + "start": 2 + }, + { + "end": 48, + "start": 26 + }, + { + "end": 77, + "start": 57 + }, + { + "end": 98, + "start": 78 + }, + { + "end": 118, + "start": 99 + }, + { + "end": 173, + "start": 151 + } + ] + }, + { + "name": "PF01415", + "regions": [ + { + "end": 173, + "start": 28 + } + ] + }, + { + "name": "SM00127", + "regions": [ + { + "end": 173, + "start": 27 + } + ] + } + ] + } + ] + }, + { + "end": 79717699, + "exons": [ + { + "end": 79646063, + "start": 79645283 + }, + { + "end": 79648762, + "start": 79648709 + }, + { + "end": 79650870, + "start": 79650739 + }, + { + "end": 79652317, + "start": 79652237 + }, + { + "end": 79659331, + "start": 79659129 + }, + { + "end": 79710443, + "start": 79710307 + }, + { + "end": 79717699, + "start": 79717148 + } + ], + "is_best_transcript": false, + "name": "ENST00000518982", + "start": 79645283, + "translations": [ + { + "cdna_coding_end": 758, + "cdna_coding_start": 543, + "domains": [ + { + "name": "PR00435", + "regions": [ + { + "end": 25, + "start": 2 + }, + { + "end": 48, + "start": 26 + } + ] + }, + { + "name": "PF01415", + "regions": [ + { + "end": 54, + "start": 28 + } + ] + } + ] + } + ] + }, + { + "end": 79717163, + "exons": [ + { + "end": 79646067, + "start": 79645900 + }, + { + "end": 79648762, + "start": 79648709 + }, + { + "end": 79652317, + "start": 79652237 + }, + { + "end": 79710443, + "start": 79710307 + }, + { + "end": 79717163, + "start": 79717148 + } + ], + "is_best_transcript": false, + "name": "ENST00000520269", + "start": 79645900, + "translations": [ + { + "cdna_coding_end": 408, + "cdna_coding_start": 7, + "domains": 
[ + { + "name": "PF01415", + "regions": [ + { + "end": 77, + "start": 28 + }, + { + "end": 129, + "start": 91 + } + ] + }, + { + "name": "SM00127", + "regions": [ + { + "end": 129, + "start": 27 + } + ] + }, + { + "name": "PR00435", + "regions": [ + { + "end": 25, + "start": 2 + }, + { + "end": 48, + "start": 26 + }, + { + "end": 77, + "start": 57 + } + ] + }, + { + "name": "PIRSF001942", + "regions": [ + { + "end": 133, + "start": 1 + } + ] + } + ] + } + ] + }, + { + "end": 79717163, + "exons": [ + { + "end": 79646067, + "start": 79645900 + }, + { + "end": 79648762, + "start": 79648709 + }, + { + "end": 79652317, + "start": 79652237 + }, + { + "end": 79710443, + "start": 79710363 + }, + { + "end": 79717163, + "start": 79717148 + } + ], + "is_best_transcript": false, + "name": "ENST00000520215", + "start": 79645900, + "translations": [ + { + "cdna_coding_end": 120, + "cdna_coding_start": 7, + "domains": [ + { + "name": "PR00435", + "regions": [ + { + "end": 25, + "start": 2 + }, + { + "end": 37, + "start": 26 + } + ] + } + ] + } + ] + }, + { + "end": 79717686, + "exons": [ + { + "end": 79646067, + "start": 79645900 + }, + { + "end": 79648762, + "start": 79648709 + }, + { + "end": 79650870, + "start": 79650739 + }, + { + "end": 79652317, + "start": 79652237 + }, + { + "end": 79710443, + "start": 79710363 + }, + { + "end": 79717686, + "start": 79717148 + } + ], + "is_best_transcript": false, + "name": "ENST00000520317", + "start": 79645900, + "translations": [ + { + "cdna_coding_end": 643, + "cdna_coding_start": 530, + "domains": [ + { + "name": "PR00435", + "regions": [ + { + "end": 25, + "start": 2 + }, + { + "end": 37, + "start": 26 + } + ] + } + ] + } + ] + }, + { + "end": 79652311, + "exons": [ + { + "end": 79646067, + "start": 79645948 + }, + { + "end": 79652311, + "start": 79652237 + } + ], + "is_best_transcript": false, + "name": "ENST00000541183", + "start": 79645948, + "translations": [ + { + "cdna_coding_end": 195, + "cdna_coding_start": 1, + "domains": [ 
+ { + "name": "SM00127", + "regions": [ + { + "end": 60, + "start": 1 + } + ] + }, + { + "name": "PF01415", + "regions": [ + { + "end": 60, + "start": 1 + } + ] + } + ] + } + ] + }, + { + "end": 79717758, + "exons": [ + { + "end": 79659331, + "start": 79659263 + }, + { + "end": 79710443, + "start": 79710307 + }, + { + "end": 79717758, + "start": 79717148 + } + ], + "is_best_transcript": false, + "name": "ENST00000379113", + "start": 79659263, + "translations": [ + { + "cdna_coding_end": 817, + "cdna_coding_start": 602, + "domains": [ + { + "name": "PF01415", + "regions": [ + { + "end": 54, + "start": 28 + } + ] + }, + { + "name": "PR00435", + "regions": [ + { + "end": 25, + "start": 2 + }, + { + "end": 48, + "start": 26 + } + ] + } + ] + } + ] + } + ] + }, + { + "aliases": [ + "SVEP1" + ], + "chr": "9", + "end": 113342160, + "name": "ENSG00000165124", + "start": 113127531, + "strand": "-", + "transcripts": [ + { + "end": 113342160, + "exons": [ + { + "end": 113128840, + "start": 113127531 + }, + { + "end": 113132296, + "start": 113132203 + }, + { + "end": 113137743, + "start": 113137648 + }, + { + "end": 113139646, + "start": 113139551 + }, + { + "end": 113141797, + "start": 113141627 + }, + { + "end": 113148354, + "start": 113148178 + }, + { + "end": 113149738, + "start": 113149565 + }, + { + "end": 113151867, + "start": 113151804 + }, + { + "end": 113163289, + "start": 113163134 + }, + { + "end": 113166832, + "start": 113166607 + }, + { + "end": 113171231, + "start": 113168440 + }, + { + "end": 113174015, + "start": 113173343 + }, + { + "end": 113190038, + "start": 113189871 + }, + { + "end": 113191614, + "start": 113191423 + }, + { + "end": 113192284, + "start": 113192200 + }, + { + "end": 113192730, + "start": 113192554 + }, + { + "end": 113194314, + "start": 113194195 + }, + { + "end": 113194915, + "start": 113194742 + }, + { + "end": 113196786, + "start": 113196616 + }, + { + "end": 113197644, + "start": 113197521 + }, + { + "end": 113198784, + "start": 
113198660 + }, + { + "end": 113206000, + "start": 113205825 + }, + { + "end": 113208318, + "start": 113208117 + }, + { + "end": 113209337, + "start": 113209180 + }, + { + "end": 113212540, + "start": 113212339 + }, + { + "end": 113213682, + "start": 113213569 + }, + { + "end": 113217983, + "start": 113217870 + }, + { + "end": 113219632, + "start": 113219536 + }, + { + "end": 113220842, + "start": 113220751 + }, + { + "end": 113221393, + "start": 113221232 + }, + { + "end": 113228306, + "start": 113228145 + }, + { + "end": 113231381, + "start": 113231220 + }, + { + "end": 113233877, + "start": 113233644 + }, + { + "end": 113234603, + "start": 113234439 + }, + { + "end": 113238595, + "start": 113238484 + }, + { + "end": 113242036, + "start": 113241915 + }, + { + "end": 113243716, + "start": 113243522 + }, + { + "end": 113244772, + "start": 113244641 + }, + { + "end": 113245973, + "start": 113245866 + }, + { + "end": 113252059, + "start": 113251930 + }, + { + "end": 113259213, + "start": 113259095 + }, + { + "end": 113261518, + "start": 113261321 + }, + { + "end": 113265497, + "start": 113265318 + }, + { + "end": 113275385, + "start": 113275206 + }, + { + "end": 113276386, + "start": 113276228 + }, + { + "end": 113308571, + "start": 113308395 + }, + { + "end": 113312384, + "start": 113312129 + }, + { + "end": 113342160, + "start": 113341293 + } + ], + "is_best_transcript": true, + "name": "ENST00000401783", + "start": 113127531, + "translations": [ + { + "cdna_coding_end": 11053, + "cdna_coding_start": 338, + "domains": [ + { + "name": "SM00032", + "regions": [ + { + "end": 433, + "start": 378 + }, + { + "end": 493, + "start": 438 + }, + { + "end": 559, + "start": 498 + }, + { + "end": 787, + "start": 727 + }, + { + "end": 1685, + "start": 1631 + }, + { + "end": 1743, + "start": 1690 + }, + { + "end": 1842, + "start": 1789 + }, + { + "end": 1900, + "start": 1847 + }, + { + "end": 1958, + "start": 1905 + }, + { + "end": 2016, + "start": 1963 + }, + { + "end": 2078, + 
"start": 2021 + }, + { + "end": 2141, + "start": 2083 + }, + { + "end": 2199, + "start": 2146 + }, + { + "end": 2259, + "start": 2204 + }, + { + "end": 2318, + "start": 2264 + }, + { + "end": 2376, + "start": 2323 + }, + { + "end": 2435, + "start": 2381 + }, + { + "end": 2493, + "start": 2440 + }, + { + "end": 2551, + "start": 2498 + }, + { + "end": 2608, + "start": 2556 + }, + { + "end": 2712, + "start": 2654 + }, + { + "end": 2770, + "start": 2717 + }, + { + "end": 2828, + "start": 2775 + }, + { + "end": 2886, + "start": 2833 + }, + { + "end": 2944, + "start": 2891 + }, + { + "end": 3002, + "start": 2949 + }, + { + "end": 3059, + "start": 3007 + }, + { + "end": 3117, + "start": 3064 + }, + { + "end": 3176, + "start": 3122 + }, + { + "end": 3236, + "start": 3181 + }, + { + "end": 3294, + "start": 3241 + }, + { + "end": 3352, + "start": 3299 + }, + { + "end": 3411, + "start": 3357 + }, + { + "end": 3468, + "start": 3416 + } + ] + }, + { + "name": "PF02494", + "regions": [ + { + "end": 642, + "start": 561 + }, + { + "end": 721, + "start": 644 + } + ] + }, + { + "name": "PR00895", + "regions": [ + { + "end": 1530, + "start": 1512 + }, + { + "end": 1558, + "start": 1539 + }, + { + "end": 1592, + "start": 1559 + } + ] + }, + { + "name": "SSF57535", + "regions": [ + { + "end": 433, + "start": 374 + }, + { + "end": 493, + "start": 434 + }, + { + "end": 560, + "start": 494 + }, + { + "end": 790, + "start": 727 + }, + { + "end": 1746, + "start": 1626 + }, + { + "end": 1842, + "start": 1785 + }, + { + "end": 1900, + "start": 1843 + }, + { + "end": 1958, + "start": 1901 + }, + { + "end": 2016, + "start": 1959 + }, + { + "end": 2078, + "start": 2017 + }, + { + "end": 2199, + "start": 2081 + }, + { + "end": 2318, + "start": 2202 + }, + { + "end": 2377, + "start": 2321 + }, + { + "end": 2437, + "start": 2379 + }, + { + "end": 2551, + "start": 2438 + }, + { + "end": 2616, + "start": 2552 + }, + { + "end": 2712, + "start": 2643 + }, + { + "end": 2828, + "start": 2715 + }, + { + 
"end": 2886, + "start": 2829 + }, + { + "end": 2944, + "start": 2887 + }, + { + "end": 3117, + "start": 2945 + }, + { + "end": 3176, + "start": 3118 + }, + { + "end": 3229, + "start": 3177 + }, + { + "end": 3475, + "start": 3239 + } + ] + }, + { + "name": "SSF49899", + "regions": [ + { + "end": 1632, + "start": 1421 + } + ] + }, + { + "name": "SM00159", + "regions": [ + { + "end": 1627, + "start": 1420 + } + ] + }, + { + "name": "PF00354", + "regions": [ + { + "end": 1620, + "start": 1442 + } + ] + }, + { + "name": "PF07699", + "regions": [ + { + "end": 360, + "start": 310 + }, + { + "end": 1052, + "start": 1005 + }, + { + "end": 1106, + "start": 1059 + }, + { + "end": 1160, + "start": 1113 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 1409, + "start": 1197 + }, + { + "end": 3554, + "start": 3468 + } + ] + }, + { + "name": "PS50825", + "regions": [ + { + "end": 642, + "start": 560 + }, + { + "end": 724, + "start": 643 + } + ] + }, + { + "name": "PF00092", + "regions": [ + { + "end": 252, + "start": 84 + } + ] + }, + { + "name": "SSF57196", + "regions": [ + { + "end": 1267, + "start": 1189 + }, + { + "end": 1305, + "start": 1268 + }, + { + "end": 1342, + "start": 1306 + }, + { + "end": 1423, + "start": 1344 + }, + { + "end": 1786, + "start": 1735 + }, + { + "end": 3506, + "start": 3463 + }, + { + "end": 3535, + "start": 3507 + }, + { + "end": 3570, + "start": 3537 + } + ] + }, + { + "name": "PS50026", + "regions": [ + { + "end": 1229, + "start": 1193 + }, + { + "end": 1267, + "start": 1231 + }, + { + "end": 1305, + "start": 1269 + }, + { + "end": 1343, + "start": 1307 + }, + { + "end": 1381, + "start": 1345 + }, + { + "end": 1419, + "start": 1383 + }, + { + "end": 1784, + "start": 1745 + }, + { + "end": 3532, + "start": 3500 + }, + { + "end": 3564, + "start": 3533 + } + ] + }, + { + "name": "SM00181", + "regions": [ + { + "end": 1229, + "start": 1196 + }, + { + "end": 1267, + "start": 1234 + }, + { + "end": 1305, + "start": 1272 + }, + { + "end": 
1343, + "start": 1310 + }, + { + "end": 1381, + "start": 1348 + }, + { + "end": 1419, + "start": 1386 + }, + { + "end": 1784, + "start": 1748 + }, + { + "end": 3500, + "start": 3471 + }, + { + "end": 3532, + "start": 3503 + }, + { + "end": 3564, + "start": 3535 + } + ] + }, + { + "name": "SM00179", + "regions": [ + { + "end": 1229, + "start": 1196 + }, + { + "end": 1267, + "start": 1231 + }, + { + "end": 1305, + "start": 1269 + }, + { + "end": 1343, + "start": 1307 + }, + { + "end": 1381, + "start": 1345 + }, + { + "end": 1419, + "start": 1383 + }, + { + "end": 1784, + "start": 1745 + }, + { + "end": 3532, + "start": 3504 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 440, + "start": 269 + }, + { + "end": 1144, + "start": 988 + } + ] + }, + { + "name": "PF07645", + "regions": [ + { + "end": 1783, + "start": 1745 + } + ] + }, + { + "name": "PS50923", + "regions": [ + { + "end": 435, + "start": 376 + }, + { + "end": 495, + "start": 436 + }, + { + "end": 561, + "start": 496 + }, + { + "end": 789, + "start": 725 + }, + { + "end": 1687, + "start": 1629 + }, + { + "end": 1745, + "start": 1688 + }, + { + "end": 1844, + "start": 1787 + }, + { + "end": 1902, + "start": 1845 + }, + { + "end": 1960, + "start": 1903 + }, + { + "end": 2018, + "start": 1961 + }, + { + "end": 2080, + "start": 2019 + }, + { + "end": 2143, + "start": 2081 + }, + { + "end": 2201, + "start": 2144 + }, + { + "end": 2261, + "start": 2202 + }, + { + "end": 2320, + "start": 2262 + }, + { + "end": 2378, + "start": 2321 + }, + { + "end": 2437, + "start": 2379 + }, + { + "end": 2495, + "start": 2438 + }, + { + "end": 2553, + "start": 2496 + }, + { + "end": 2610, + "start": 2554 + }, + { + "end": 2714, + "start": 2663 + }, + { + "end": 2772, + "start": 2715 + }, + { + "end": 2830, + "start": 2773 + }, + { + "end": 2888, + "start": 2831 + }, + { + "end": 2946, + "start": 2889 + }, + { + "end": 3004, + "start": 2947 + }, + { + "end": 3061, + "start": 3005 + }, + { + "end": 3119, + "start": 
3062 + }, + { + "end": 3178, + "start": 3120 + }, + { + "end": 3238, + "start": 3179 + }, + { + "end": 3296, + "start": 3239 + }, + { + "end": 3354, + "start": 3297 + }, + { + "end": 3413, + "start": 3355 + }, + { + "end": 3470, + "start": 3414 + } + ] + }, + { + "name": "SM00327", + "regions": [ + { + "end": 260, + "start": 81 + } + ] + }, + { + "name": "PF00008", + "regions": [ + { + "end": 1226, + "start": 1197 + }, + { + "end": 1265, + "start": 1235 + }, + { + "end": 1302, + "start": 1273 + }, + { + "end": 1379, + "start": 1349 + }, + { + "end": 1417, + "start": 1387 + } + ] + }, + { + "name": "PS50234", + "regions": [ + { + "end": 264, + "start": 83 + } + ] + }, + { + "name": "PF07974", + "regions": [ + { + "end": 1266, + "start": 1235 + }, + { + "end": 3499, + "start": 3475 + }, + { + "end": 3531, + "start": 3507 + }, + { + "end": 3563, + "start": 3536 + } + ] + }, + { + "name": "SSF53300", + "regions": [ + { + "end": 262, + "start": 79 + } + ] + }, + { + "name": "PF00084", + "regions": [ + { + "end": 430, + "start": 378 + }, + { + "end": 493, + "start": 438 + }, + { + "end": 1685, + "start": 1628 + }, + { + "end": 1743, + "start": 1690 + }, + { + "end": 1842, + "start": 1789 + }, + { + "end": 1900, + "start": 1847 + }, + { + "end": 1958, + "start": 1905 + }, + { + "end": 2016, + "start": 1963 + }, + { + "end": 2078, + "start": 2021 + }, + { + "end": 2136, + "start": 2083 + }, + { + "end": 2199, + "start": 2146 + }, + { + "end": 2259, + "start": 2204 + }, + { + "end": 2318, + "start": 2264 + }, + { + "end": 2376, + "start": 2323 + }, + { + "end": 2435, + "start": 2381 + }, + { + "end": 2493, + "start": 2440 + }, + { + "end": 2551, + "start": 2498 + }, + { + "end": 2608, + "start": 2556 + }, + { + "end": 2712, + "start": 2667 + }, + { + "end": 2770, + "start": 2717 + }, + { + "end": 2828, + "start": 2775 + }, + { + "end": 2886, + "start": 2833 + }, + { + "end": 2944, + "start": 2891 + }, + { + "end": 3002, + "start": 2949 + }, + { + "end": 3059, + "start": 
3007 + }, + { + "end": 3117, + "start": 3084 + }, + { + "end": 3172, + "start": 3122 + }, + { + "end": 3236, + "start": 3181 + }, + { + "end": 3290, + "start": 3241 + }, + { + "end": 3352, + "start": 3299 + }, + { + "end": 3411, + "start": 3357 + }, + { + "end": 3468, + "start": 3416 + } + ] + } + ] + } + ] + }, + { + "end": 113190038, + "exons": [ + { + "end": 113128840, + "start": 113127536 + }, + { + "end": 113132296, + "start": 113132203 + }, + { + "end": 113137743, + "start": 113137648 + }, + { + "end": 113139646, + "start": 113139551 + }, + { + "end": 113141797, + "start": 113141627 + }, + { + "end": 113148354, + "start": 113148178 + }, + { + "end": 113149738, + "start": 113149565 + }, + { + "end": 113151867, + "start": 113151804 + }, + { + "end": 113163289, + "start": 113163134 + }, + { + "end": 113166832, + "start": 113166607 + }, + { + "end": 113171231, + "start": 113168440 + }, + { + "end": 113174015, + "start": 113173343 + }, + { + "end": 113190038, + "start": 113189871 + } + ], + "is_best_transcript": false, + "name": "ENST00000297826", + "start": 113127536, + "translations": [ + { + "cdna_coding_end": 4909, + "cdna_coding_start": 416, + "domains": [ + { + "name": "PF00084", + "regions": [ + { + "end": 62, + "start": 9 + }, + { + "end": 125, + "start": 72 + }, + { + "end": 185, + "start": 130 + }, + { + "end": 244, + "start": 190 + }, + { + "end": 302, + "start": 249 + }, + { + "end": 361, + "start": 307 + }, + { + "end": 419, + "start": 366 + }, + { + "end": 477, + "start": 424 + }, + { + "end": 534, + "start": 482 + }, + { + "end": 638, + "start": 593 + }, + { + "end": 696, + "start": 643 + }, + { + "end": 754, + "start": 701 + }, + { + "end": 812, + "start": 759 + }, + { + "end": 870, + "start": 817 + }, + { + "end": 928, + "start": 875 + }, + { + "end": 985, + "start": 933 + }, + { + "end": 1043, + "start": 1010 + }, + { + "end": 1098, + "start": 1048 + }, + { + "end": 1162, + "start": 1107 + }, + { + "end": 1216, + "start": 1167 + }, + { + "end": 
1278, + "start": 1225 + }, + { + "end": 1337, + "start": 1283 + }, + { + "end": 1394, + "start": 1342 + } + ] + }, + { + "name": "PF07974", + "regions": [ + { + "end": 1425, + "start": 1401 + }, + { + "end": 1457, + "start": 1433 + }, + { + "end": 1489, + "start": 1462 + } + ] + }, + { + "name": "PF00008", + "regions": [ + { + "end": 1456, + "start": 1427 + } + ] + }, + { + "name": "PS50923", + "regions": [ + { + "end": 69, + "start": 7 + }, + { + "end": 127, + "start": 70 + }, + { + "end": 187, + "start": 128 + }, + { + "end": 246, + "start": 188 + }, + { + "end": 304, + "start": 247 + }, + { + "end": 363, + "start": 305 + }, + { + "end": 421, + "start": 364 + }, + { + "end": 479, + "start": 422 + }, + { + "end": 536, + "start": 480 + }, + { + "end": 640, + "start": 589 + }, + { + "end": 698, + "start": 641 + }, + { + "end": 756, + "start": 699 + }, + { + "end": 814, + "start": 757 + }, + { + "end": 872, + "start": 815 + }, + { + "end": 930, + "start": 873 + }, + { + "end": 987, + "start": 931 + }, + { + "end": 1045, + "start": 988 + }, + { + "end": 1104, + "start": 1046 + }, + { + "end": 1164, + "start": 1105 + }, + { + "end": 1222, + "start": 1165 + }, + { + "end": 1280, + "start": 1223 + }, + { + "end": 1339, + "start": 1281 + }, + { + "end": 1396, + "start": 1340 + } + ] + }, + { + "name": "SM00181", + "regions": [ + { + "end": 1426, + "start": 1397 + }, + { + "end": 1458, + "start": 1429 + }, + { + "end": 1490, + "start": 1461 + } + ] + }, + { + "name": "SSF57196", + "regions": [ + { + "end": 1432, + "start": 1389 + }, + { + "end": 1461, + "start": 1433 + }, + { + "end": 1496, + "start": 1463 + } + ] + }, + { + "name": "PS50026", + "regions": [ + { + "end": 1458, + "start": 1426 + }, + { + "end": 1490, + "start": 1459 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 1480, + "start": 1394 + } + ] + }, + { + "name": "SSF57535", + "regions": [ + { + "end": 125, + "start": 7 + }, + { + "end": 244, + "start": 128 + }, + { + "end": 303, + "start": 
247 + }, + { + "end": 363, + "start": 305 + }, + { + "end": 477, + "start": 364 + }, + { + "end": 542, + "start": 478 + }, + { + "end": 638, + "start": 569 + }, + { + "end": 754, + "start": 641 + }, + { + "end": 812, + "start": 755 + }, + { + "end": 870, + "start": 813 + }, + { + "end": 1043, + "start": 871 + }, + { + "end": 1102, + "start": 1044 + }, + { + "end": 1155, + "start": 1103 + }, + { + "end": 1401, + "start": 1165 + } + ] + }, + { + "name": "SM00032", + "regions": [ + { + "end": 67, + "start": 9 + }, + { + "end": 125, + "start": 72 + }, + { + "end": 185, + "start": 130 + }, + { + "end": 244, + "start": 190 + }, + { + "end": 302, + "start": 249 + }, + { + "end": 361, + "start": 307 + }, + { + "end": 419, + "start": 366 + }, + { + "end": 477, + "start": 424 + }, + { + "end": 534, + "start": 482 + }, + { + "end": 638, + "start": 580 + }, + { + "end": 696, + "start": 643 + }, + { + "end": 754, + "start": 701 + }, + { + "end": 812, + "start": 759 + }, + { + "end": 870, + "start": 817 + }, + { + "end": 928, + "start": 875 + }, + { + "end": 985, + "start": 933 + }, + { + "end": 1043, + "start": 990 + }, + { + "end": 1102, + "start": 1048 + }, + { + "end": 1162, + "start": 1107 + }, + { + "end": 1220, + "start": 1167 + }, + { + "end": 1278, + "start": 1225 + }, + { + "end": 1337, + "start": 1283 + }, + { + "end": 1394, + "start": 1342 + } + ] + } + ] + } + ] + }, + { + "end": 113342018, + "exons": [ + { + "end": 113128840, + "start": 113127536 + }, + { + "end": 113132296, + "start": 113132203 + }, + { + "end": 113137743, + "start": 113137648 + }, + { + "end": 113139646, + "start": 113139551 + }, + { + "end": 113141797, + "start": 113141627 + }, + { + "end": 113148354, + "start": 113148178 + }, + { + "end": 113149738, + "start": 113149565 + }, + { + "end": 113151867, + "start": 113151804 + }, + { + "end": 113163289, + "start": 113163134 + }, + { + "end": 113166832, + "start": 113166607 + }, + { + "end": 113171231, + "start": 113168440 + }, + { + "end": 113174015, 
+ "start": 113173343 + }, + { + "end": 113190038, + "start": 113189871 + }, + { + "end": 113191614, + "start": 113191423 + }, + { + "end": 113192284, + "start": 113192200 + }, + { + "end": 113192730, + "start": 113192554 + }, + { + "end": 113194314, + "start": 113194195 + }, + { + "end": 113194915, + "start": 113194742 + }, + { + "end": 113196786, + "start": 113196616 + }, + { + "end": 113197644, + "start": 113197521 + }, + { + "end": 113198784, + "start": 113198660 + }, + { + "end": 113206000, + "start": 113205825 + }, + { + "end": 113208318, + "start": 113208117 + }, + { + "end": 113209337, + "start": 113209180 + }, + { + "end": 113212540, + "start": 113212339 + }, + { + "end": 113213682, + "start": 113213569 + }, + { + "end": 113217983, + "start": 113217870 + }, + { + "end": 113219632, + "start": 113219536 + }, + { + "end": 113220842, + "start": 113220751 + }, + { + "end": 113221393, + "start": 113221232 + }, + { + "end": 113228306, + "start": 113228145 + }, + { + "end": 113231381, + "start": 113231220 + }, + { + "end": 113233877, + "start": 113233644 + }, + { + "end": 113234603, + "start": 113234439 + }, + { + "end": 113238595, + "start": 113238484 + }, + { + "end": 113242036, + "start": 113241915 + }, + { + "end": 113243716, + "start": 113243522 + }, + { + "end": 113244772, + "start": 113244641 + }, + { + "end": 113245973, + "start": 113245866 + }, + { + "end": 113252059, + "start": 113251930 + }, + { + "end": 113259213, + "start": 113259095 + }, + { + "end": 113261518, + "start": 113261321 + }, + { + "end": 113265497, + "start": 113265318 + }, + { + "end": 113275385, + "start": 113275206 + }, + { + "end": 113276386, + "start": 113276228 + }, + { + "end": 113308571, + "start": 113308395 + }, + { + "end": 113312384, + "start": 113312129 + }, + { + "end": 113342018, + "start": 113341293 + } + ], + "is_best_transcript": false, + "name": "ENST00000374469", + "start": 113127536, + "translations": [ + { + "cdna_coding_end": 10911, + "cdna_coding_start": 265, + 
"domains": [ + { + "name": "SSF57535", + "regions": [ + { + "end": 410, + "start": 351 + }, + { + "end": 470, + "start": 411 + }, + { + "end": 537, + "start": 471 + }, + { + "end": 767, + "start": 704 + }, + { + "end": 1723, + "start": 1603 + }, + { + "end": 1819, + "start": 1762 + }, + { + "end": 1877, + "start": 1820 + }, + { + "end": 1935, + "start": 1878 + }, + { + "end": 1993, + "start": 1936 + }, + { + "end": 2055, + "start": 1994 + }, + { + "end": 2176, + "start": 2058 + }, + { + "end": 2295, + "start": 2179 + }, + { + "end": 2354, + "start": 2298 + }, + { + "end": 2414, + "start": 2356 + }, + { + "end": 2528, + "start": 2415 + }, + { + "end": 2593, + "start": 2529 + }, + { + "end": 2689, + "start": 2620 + }, + { + "end": 2805, + "start": 2692 + }, + { + "end": 2863, + "start": 2806 + }, + { + "end": 2921, + "start": 2864 + }, + { + "end": 3094, + "start": 2922 + }, + { + "end": 3153, + "start": 3095 + }, + { + "end": 3206, + "start": 3154 + }, + { + "end": 3452, + "start": 3216 + } + ] + }, + { + "name": "SSF49899", + "regions": [ + { + "end": 1609, + "start": 1398 + } + ] + }, + { + "name": "SM00159", + "regions": [ + { + "end": 1604, + "start": 1397 + } + ] + }, + { + "name": "PF00354", + "regions": [ + { + "end": 1597, + "start": 1419 + } + ] + }, + { + "name": "PR00895", + "regions": [ + { + "end": 1507, + "start": 1489 + }, + { + "end": 1535, + "start": 1516 + }, + { + "end": 1569, + "start": 1536 + } + ] + }, + { + "name": "PF02494", + "regions": [ + { + "end": 619, + "start": 538 + }, + { + "end": 698, + "start": 621 + } + ] + }, + { + "name": "SM00032", + "regions": [ + { + "end": 410, + "start": 355 + }, + { + "end": 470, + "start": 415 + }, + { + "end": 536, + "start": 475 + }, + { + "end": 764, + "start": 704 + }, + { + "end": 1662, + "start": 1608 + }, + { + "end": 1720, + "start": 1667 + }, + { + "end": 1819, + "start": 1766 + }, + { + "end": 1877, + "start": 1824 + }, + { + "end": 1935, + "start": 1882 + }, + { + "end": 1993, + "start": 1940 + 
}, + { + "end": 2055, + "start": 1998 + }, + { + "end": 2118, + "start": 2060 + }, + { + "end": 2176, + "start": 2123 + }, + { + "end": 2236, + "start": 2181 + }, + { + "end": 2295, + "start": 2241 + }, + { + "end": 2353, + "start": 2300 + }, + { + "end": 2412, + "start": 2358 + }, + { + "end": 2470, + "start": 2417 + }, + { + "end": 2528, + "start": 2475 + }, + { + "end": 2585, + "start": 2533 + }, + { + "end": 2689, + "start": 2631 + }, + { + "end": 2747, + "start": 2694 + }, + { + "end": 2805, + "start": 2752 + }, + { + "end": 2863, + "start": 2810 + }, + { + "end": 2921, + "start": 2868 + }, + { + "end": 2979, + "start": 2926 + }, + { + "end": 3036, + "start": 2984 + }, + { + "end": 3094, + "start": 3041 + }, + { + "end": 3153, + "start": 3099 + }, + { + "end": 3213, + "start": 3158 + }, + { + "end": 3271, + "start": 3218 + }, + { + "end": 3329, + "start": 3276 + }, + { + "end": 3388, + "start": 3334 + }, + { + "end": 3445, + "start": 3393 + } + ] + }, + { + "name": "SM00179", + "regions": [ + { + "end": 1206, + "start": 1173 + }, + { + "end": 1244, + "start": 1208 + }, + { + "end": 1282, + "start": 1246 + }, + { + "end": 1320, + "start": 1284 + }, + { + "end": 1358, + "start": 1322 + }, + { + "end": 1396, + "start": 1360 + }, + { + "end": 1761, + "start": 1722 + }, + { + "end": 3509, + "start": 3481 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 417, + "start": 246 + }, + { + "end": 1121, + "start": 965 + } + ] + }, + { + "name": "SSF57196", + "regions": [ + { + "end": 1244, + "start": 1166 + }, + { + "end": 1282, + "start": 1245 + }, + { + "end": 1319, + "start": 1283 + }, + { + "end": 1400, + "start": 1321 + }, + { + "end": 1763, + "start": 1712 + }, + { + "end": 3483, + "start": 3440 + }, + { + "end": 3512, + "start": 3484 + }, + { + "end": 3547, + "start": 3514 + } + ] + }, + { + "name": "PS50026", + "regions": [ + { + "end": 1206, + "start": 1170 + }, + { + "end": 1244, + "start": 1208 + }, + { + "end": 1282, + "start": 1246 + }, + { + 
"end": 1320, + "start": 1284 + }, + { + "end": 1358, + "start": 1322 + }, + { + "end": 1396, + "start": 1360 + }, + { + "end": 1761, + "start": 1722 + }, + { + "end": 3509, + "start": 3477 + }, + { + "end": 3541, + "start": 3510 + } + ] + }, + { + "name": "SM00181", + "regions": [ + { + "end": 1206, + "start": 1173 + }, + { + "end": 1244, + "start": 1211 + }, + { + "end": 1282, + "start": 1249 + }, + { + "end": 1320, + "start": 1287 + }, + { + "end": 1358, + "start": 1325 + }, + { + "end": 1396, + "start": 1363 + }, + { + "end": 1761, + "start": 1725 + }, + { + "end": 3477, + "start": 3448 + }, + { + "end": 3509, + "start": 3480 + }, + { + "end": 3541, + "start": 3512 + } + ] + }, + { + "name": "PF00092", + "regions": [ + { + "end": 229, + "start": 61 + } + ] + }, + { + "name": "PS50825", + "regions": [ + { + "end": 619, + "start": 537 + }, + { + "end": 701, + "start": 620 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 1386, + "start": 1174 + }, + { + "end": 3531, + "start": 3445 + } + ] + }, + { + "name": "PF07699", + "regions": [ + { + "end": 337, + "start": 287 + }, + { + "end": 1029, + "start": 982 + }, + { + "end": 1083, + "start": 1036 + }, + { + "end": 1137, + "start": 1090 + } + ] + }, + { + "name": "PF00008", + "regions": [ + { + "end": 1203, + "start": 1174 + }, + { + "end": 1242, + "start": 1212 + }, + { + "end": 1279, + "start": 1250 + }, + { + "end": 1356, + "start": 1326 + }, + { + "end": 1394, + "start": 1364 + } + ] + }, + { + "name": "SM00327", + "regions": [ + { + "end": 237, + "start": 58 + } + ] + }, + { + "name": "PS50923", + "regions": [ + { + "end": 412, + "start": 353 + }, + { + "end": 472, + "start": 413 + }, + { + "end": 538, + "start": 473 + }, + { + "end": 766, + "start": 702 + }, + { + "end": 1664, + "start": 1606 + }, + { + "end": 1722, + "start": 1665 + }, + { + "end": 1821, + "start": 1764 + }, + { + "end": 1879, + "start": 1822 + }, + { + "end": 1937, + "start": 1880 + }, + { + "end": 1995, + "start": 1938 + }, + 
{ + "end": 2057, + "start": 1996 + }, + { + "end": 2120, + "start": 2058 + }, + { + "end": 2178, + "start": 2121 + }, + { + "end": 2238, + "start": 2179 + }, + { + "end": 2297, + "start": 2239 + }, + { + "end": 2355, + "start": 2298 + }, + { + "end": 2414, + "start": 2356 + }, + { + "end": 2472, + "start": 2415 + }, + { + "end": 2530, + "start": 2473 + }, + { + "end": 2587, + "start": 2531 + }, + { + "end": 2691, + "start": 2640 + }, + { + "end": 2749, + "start": 2692 + }, + { + "end": 2807, + "start": 2750 + }, + { + "end": 2865, + "start": 2808 + }, + { + "end": 2923, + "start": 2866 + }, + { + "end": 2981, + "start": 2924 + }, + { + "end": 3038, + "start": 2982 + }, + { + "end": 3096, + "start": 3039 + }, + { + "end": 3155, + "start": 3097 + }, + { + "end": 3215, + "start": 3156 + }, + { + "end": 3273, + "start": 3216 + }, + { + "end": 3331, + "start": 3274 + }, + { + "end": 3390, + "start": 3332 + }, + { + "end": 3447, + "start": 3391 + } + ] + }, + { + "name": "PF07645", + "regions": [ + { + "end": 1760, + "start": 1722 + } + ] + }, + { + "name": "SSF53300", + "regions": [ + { + "end": 239, + "start": 56 + } + ] + }, + { + "name": "PF00084", + "regions": [ + { + "end": 407, + "start": 355 + }, + { + "end": 470, + "start": 415 + }, + { + "end": 1662, + "start": 1605 + }, + { + "end": 1720, + "start": 1667 + }, + { + "end": 1819, + "start": 1766 + }, + { + "end": 1877, + "start": 1824 + }, + { + "end": 1935, + "start": 1882 + }, + { + "end": 1993, + "start": 1940 + }, + { + "end": 2055, + "start": 1998 + }, + { + "end": 2113, + "start": 2060 + }, + { + "end": 2176, + "start": 2123 + }, + { + "end": 2236, + "start": 2181 + }, + { + "end": 2295, + "start": 2241 + }, + { + "end": 2353, + "start": 2300 + }, + { + "end": 2412, + "start": 2358 + }, + { + "end": 2470, + "start": 2417 + }, + { + "end": 2528, + "start": 2475 + }, + { + "end": 2585, + "start": 2533 + }, + { + "end": 2689, + "start": 2644 + }, + { + "end": 2747, + "start": 2694 + }, + { + "end": 2805, + 
"start": 2752 + }, + { + "end": 2863, + "start": 2810 + }, + { + "end": 2921, + "start": 2868 + }, + { + "end": 2979, + "start": 2926 + }, + { + "end": 3036, + "start": 2984 + }, + { + "end": 3094, + "start": 3061 + }, + { + "end": 3149, + "start": 3099 + }, + { + "end": 3213, + "start": 3158 + }, + { + "end": 3267, + "start": 3218 + }, + { + "end": 3329, + "start": 3276 + }, + { + "end": 3388, + "start": 3334 + }, + { + "end": 3445, + "start": 3393 + } + ] + }, + { + "name": "PF07974", + "regions": [ + { + "end": 1243, + "start": 1212 + }, + { + "end": 3476, + "start": 3452 + }, + { + "end": 3508, + "start": 3484 + }, + { + "end": 3540, + "start": 3513 + } + ] + }, + { + "name": "PS50234", + "regions": [ + { + "end": 241, + "start": 60 + } + ] + } + ] + } + ] + }, + { + "end": 113341823, + "exons": [ + { + "end": 113206000, + "start": 113204759 + }, + { + "end": 113208318, + "start": 113208117 + }, + { + "end": 113209337, + "start": 113209180 + }, + { + "end": 113212540, + "start": 113212339 + }, + { + "end": 113213682, + "start": 113213569 + }, + { + "end": 113217983, + "start": 113217870 + }, + { + "end": 113219632, + "start": 113219536 + }, + { + "end": 113220399, + "start": 113220395 + }, + { + "end": 113220842, + "start": 113220756 + }, + { + "end": 113221393, + "start": 113221232 + }, + { + "end": 113228306, + "start": 113228145 + }, + { + "end": 113231381, + "start": 113231220 + }, + { + "end": 113233877, + "start": 113233644 + }, + { + "end": 113234603, + "start": 113234439 + }, + { + "end": 113238595, + "start": 113238484 + }, + { + "end": 113242036, + "start": 113241915 + }, + { + "end": 113243716, + "start": 113243522 + }, + { + "end": 113244772, + "start": 113244641 + }, + { + "end": 113245973, + "start": 113245866 + }, + { + "end": 113252059, + "start": 113251930 + }, + { + "end": 113259213, + "start": 113259095 + }, + { + "end": 113261518, + "start": 113261321 + }, + { + "end": 113265497, + "start": 113265318 + }, + { + "end": 113275385, + "start": 
113275206 + }, + { + "end": 113276386, + "start": 113276228 + }, + { + "end": 113308571, + "start": 113308395 + }, + { + "end": 113312384, + "start": 113312129 + }, + { + "end": 113341823, + "start": 113341293 + } + ], + "is_best_transcript": false, + "name": "ENST00000302728", + "start": 113204759, + "translations": [ + { + "cdna_coding_end": 4650, + "cdna_coding_start": 1, + "domains": [ + { + "name": "PS50825", + "regions": [ + { + "end": 642, + "start": 560 + }, + { + "end": 724, + "start": 643 + } + ] + }, + { + "name": "PF07699", + "regions": [ + { + "end": 360, + "start": 310 + }, + { + "end": 1052, + "start": 1005 + }, + { + "end": 1106, + "start": 1059 + }, + { + "end": 1160, + "start": 1113 + } + ] + }, + { + "name": "PS50311", + "regions": [ + { + "end": 1409, + "start": 1197 + } + ] + }, + { + "name": "SM00181", + "regions": [ + { + "end": 1229, + "start": 1196 + }, + { + "end": 1267, + "start": 1234 + }, + { + "end": 1305, + "start": 1272 + }, + { + "end": 1343, + "start": 1310 + }, + { + "end": 1381, + "start": 1348 + }, + { + "end": 1419, + "start": 1386 + } + ] + }, + { + "name": "SSF57196", + "regions": [ + { + "end": 1267, + "start": 1189 + }, + { + "end": 1305, + "start": 1268 + }, + { + "end": 1342, + "start": 1306 + }, + { + "end": 1423, + "start": 1344 + } + ] + }, + { + "name": "PS50026", + "regions": [ + { + "end": 1229, + "start": 1193 + }, + { + "end": 1267, + "start": 1231 + }, + { + "end": 1305, + "start": 1269 + }, + { + "end": 1343, + "start": 1307 + }, + { + "end": 1381, + "start": 1345 + }, + { + "end": 1419, + "start": 1383 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 440, + "start": 269 + }, + { + "end": 1144, + "start": 988 + } + ] + }, + { + "name": "SM00179", + "regions": [ + { + "end": 1229, + "start": 1196 + }, + { + "end": 1267, + "start": 1231 + }, + { + "end": 1305, + "start": 1269 + }, + { + "end": 1343, + "start": 1307 + }, + { + "end": 1381, + "start": 1345 + }, + { + "end": 1419, + "start": 1383 + 
} + ] + }, + { + "name": "PF00092", + "regions": [ + { + "end": 252, + "start": 84 + } + ] + }, + { + "name": "SM00032", + "regions": [ + { + "end": 433, + "start": 378 + }, + { + "end": 493, + "start": 438 + }, + { + "end": 559, + "start": 498 + }, + { + "end": 787, + "start": 727 + } + ] + }, + { + "name": "PF02494", + "regions": [ + { + "end": 642, + "start": 561 + }, + { + "end": 721, + "start": 644 + } + ] + }, + { + "name": "PR00010", + "regions": [ + { + "end": 1318, + "start": 1307 + }, + { + "end": 1364, + "start": 1357 + }, + { + "end": 1413, + "start": 1403 + }, + { + "end": 1420, + "start": 1414 + } + ] + }, + { + "name": "PF00354", + "regions": [ + { + "end": 1532, + "start": 1442 + } + ] + }, + { + "name": "SSF57535", + "regions": [ + { + "end": 433, + "start": 374 + }, + { + "end": 493, + "start": 434 + }, + { + "end": 560, + "start": 494 + }, + { + "end": 790, + "start": 727 + } + ] + }, + { + "name": "SSF49899", + "regions": [ + { + "end": 1547, + "start": 1421 + } + ] + }, + { + "name": "PS50234", + "regions": [ + { + "end": 264, + "start": 83 + } + ] + }, + { + "name": "SSF53300", + "regions": [ + { + "end": 262, + "start": 79 + } + ] + }, + { + "name": "PF00084", + "regions": [ + { + "end": 430, + "start": 378 + }, + { + "end": 493, + "start": 438 + } + ] + }, + { + "name": "PS50923", + "regions": [ + { + "end": 435, + "start": 376 + }, + { + "end": 495, + "start": 436 + }, + { + "end": 561, + "start": 496 + }, + { + "end": 789, + "start": 725 + } + ] + }, + { + "name": "PF07645", + "regions": [ + { + "end": 1262, + "start": 1231 + }, + { + "end": 1338, + "start": 1308 + } + ] + }, + { + "name": "PF00008", + "regions": [ + { + "end": 1226, + "start": 1197 + }, + { + "end": 1265, + "start": 1235 + }, + { + "end": 1302, + "start": 1273 + }, + { + "end": 1337, + "start": 1311 + }, + { + "end": 1379, + "start": 1349 + }, + { + "end": 1417, + "start": 1387 + } + ] + }, + { + "name": "SM00327", + "regions": [ + { + "end": 260, + "start": 81 + } + ] + 
} + ] + } + ] + }, + { + "end": 113342160, + "exons": [ + { + "end": 113238595, + "start": 113238163 + }, + { + "end": 113242036, + "start": 113241915 + }, + { + "end": 113243716, + "start": 113243522 + }, + { + "end": 113244772, + "start": 113244641 + }, + { + "end": 113245973, + "start": 113245866 + }, + { + "end": 113252059, + "start": 113251930 + }, + { + "end": 113259213, + "start": 113259095 + }, + { + "end": 113261518, + "start": 113261321 + }, + { + "end": 113265497, + "start": 113265318 + }, + { + "end": 113275385, + "start": 113275206 + }, + { + "end": 113276386, + "start": 113276228 + }, + { + "end": 113308571, + "start": 113308395 + }, + { + "end": 113312384, + "start": 113312129 + }, + { + "end": 113342160, + "start": 113341293 + } + ], + "is_best_transcript": false, + "name": "ENST00000374461", + "start": 113238163, + "translations": [ + { + "cdna_coding_end": 2944, + "cdna_coding_start": 407, + "domains": [ + { + "name": "PF02494", + "regions": [ + { + "end": 619, + "start": 538 + }, + { + "end": 698, + "start": 621 + } + ] + }, + { + "name": "SM00032", + "regions": [ + { + "end": 410, + "start": 355 + }, + { + "end": 470, + "start": 415 + }, + { + "end": 536, + "start": 475 + }, + { + "end": 764, + "start": 704 + } + ] + }, + { + "name": "SSF57535", + "regions": [ + { + "end": 410, + "start": 351 + }, + { + "end": 470, + "start": 411 + }, + { + "end": 537, + "start": 471 + }, + { + "end": 767, + "start": 704 + } + ] + }, + { + "name": "PF07699", + "regions": [ + { + "end": 337, + "start": 287 + } + ] + }, + { + "name": "PS50825", + "regions": [ + { + "end": 619, + "start": 537 + }, + { + "end": 701, + "start": 620 + } + ] + }, + { + "name": "PF00092", + "regions": [ + { + "end": 229, + "start": 61 + } + ] + }, + { + "name": "SSF57184", + "regions": [ + { + "end": 417, + "start": 246 + } + ] + }, + { + "name": "PS50923", + "regions": [ + { + "end": 412, + "start": 353 + }, + { + "end": 472, + "start": 413 + }, + { + "end": 538, + "start": 473 + }, + 
{ + "end": 766, + "start": 702 + } + ] + }, + { + "name": "SM00327", + "regions": [ + { + "end": 237, + "start": 58 + } + ] + }, + { + "name": "PS50234", + "regions": [ + { + "end": 241, + "start": 60 + } + ] + }, + { + "name": "SSF53300", + "regions": [ + { + "end": 239, + "start": 56 + } + ] + }, + { + "name": "PF00084", + "regions": [ + { + "end": 407, + "start": 355 + }, + { + "end": 470, + "start": 415 + } + ] + } + ] + } + ] + } + ] + }, + { + "aliases": [ + "ARID1B" + ], + "chr": "6", + "end": 157530401, + "name": "ENSG00000049618", + "start": 157099063, + "strand": "+", + "transcripts": [ + { + "end": 157529495, + "exons": [ + { + "end": 157100605, + "start": 157099063 + }, + { + "end": 157150555, + "start": 157150361 + }, + { + "end": 157192786, + "start": 157192748 + }, + { + "end": 157222659, + "start": 157222510 + }, + { + "end": 157256710, + "start": 157256600 + }, + { + "end": 157406039, + "start": 157405796 + }, + { + "end": 157431695, + "start": 157431606 + }, + { + "end": 157454341, + "start": 157454162 + }, + { + "end": 157470085, + "start": 157469758 + }, + { + "end": 157488319, + "start": 157488174 + }, + { + "end": 157495251, + "start": 157495142 + }, + { + "end": 157502312, + "start": 157502103 + }, + { + "end": 157505569, + "start": 157505365 + }, + { + "end": 157510914, + "start": 157510776 + }, + { + "end": 157511344, + "start": 157511172 + }, + { + "end": 157517449, + "start": 157517299 + }, + { + "end": 157520041, + "start": 157519945 + }, + { + "end": 157522622, + "start": 157521839 + }, + { + "end": 157525130, + "start": 157525000 + }, + { + "end": 157529495, + "start": 157527301 + } + ], + "is_best_transcript": true, + "name": "ENST00000346085", + "start": 157099063, + "translations": [ + { + "cdna_coding_end": 6751, + "cdna_coding_start": 2, + "domains": [ + { + "name": "PF12031", + "regions": [ + { + "end": 2195, + "start": 1939 + } + ] + }, + { + "name": "PS50324", + "regions": [ + { + "end": 57, + "start": 35 + }, + { + "end": 784, 
+ "start": 697 + } + ] + }, + { + "name": "PF01388", + "regions": [ + { + "end": 1153, + "start": 1065 + } + ] + }, + { + "name": "PS50099", + "regions": [ + { + "end": 820, + "start": 715 + }, + { + "end": 1610, + "start": 1472 + } + ] + }, + { + "name": "SSF48371", + "regions": [ + { + "end": 2220, + "start": 2075 + } + ] + }, + { + "name": "PS50316", + "regions": [ + { + "end": 104, + "start": 81 + } + ] + }, + { + "name": "PS50322", + "regions": [ + { + "end": 131, + "start": 107 + }, + { + "end": 646, + "start": 574 + } + ] + }, + { + "name": "PS51011", + "regions": [ + { + "end": 1157, + "start": 1066 + } + ] + }, + { + "name": "PS50310", + "regions": [ + { + "end": 47, + "start": 2 + }, + { + "end": 493, + "start": 329 + } + ] + }, + { + "name": "PS50315", + "regions": [ + { + "end": 401, + "start": 141 + } + ] + }, + { + "name": "SSF46774", + "regions": [ + { + "end": 1168, + "start": 1049 + } + ] + }, + { + "name": "SM00501", + "regions": [ + { + "end": 1158, + "start": 1067 + } + ] + } + ] + } + ] + } + ] + } + ] +} diff --git a/tests/tools/test_convert_annotations_format.py b/tests/tools/test_convert_annotations_format.py index a5530dd7..a42daadb 100644 --- a/tests/tools/test_convert_annotations_format.py +++ b/tests/tools/test_convert_annotations_format.py @@ -7,12 +7,14 @@ convert_gff2_to_mavis, convert_gff3_to_mavis, convert_mavis_json_2to3, + convert_tab_to_json, ) CONVERTERS = { 'gff3': convert_gff3_to_mavis, 'gtf': convert_gff2_to_mavis, 'v2-json': convert_mavis_json_2to3, + 'v2-tab': convert_tab_to_json, } @@ -45,6 +47,11 @@ def sort_elements(data): ['Homo_sapiens.GRCh38.kras.gff3', 'Homo_sapiens.GRCh38.kras.gff3.json', 'gff3'], ['Homo_sapiens.GRCh38.kras.gtf', 'Homo_sapiens.GRCh38.kras.gtf.json', 'gtf'], ['example_genes.v2.json', 'example_genes.v3.json', 'v2-json'], + [ + 'ensembl69_hg19_annotations.kras.tab', + 'ensembl69_hg19_annotations.kras.tab.json', + 'v2-tab', + ], ], ) def test_gff_examples(filename, expected_file, input_type): From 
af6f8230a6c3947af0c5abcc81824439ba5d0d1a Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 8 Feb 2022 20:23:46 -0800 Subject: [PATCH 117/137] Only swap order when generating cdna_coord --- src/mavis/annotate/file_io.py | 39 ++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/src/mavis/annotate/file_io.py b/src/mavis/annotate/file_io.py index bcd0db0f..0f8e9aef 100644 --- a/src/mavis/annotate/file_io.py +++ b/src/mavis/annotate/file_io.py @@ -153,26 +153,31 @@ def parse_annotations_json( for translation in transcript.get('translations', []): try: - if 'cdna_coding_end' not in translation: - translation['cdna_coding_end'] = spl_tx.convert_genomic_to_cdna( - translation['end'] - ) - if 'cdna_coding_start' not in translation: - translation['cdna_coding_start'] = spl_tx.convert_genomic_to_cdna( - translation['start'] - ) + if ( + 'cdna_coding_end' not in translation + or 'cdna_coding_start' not in translation + ): + if 'cdna_coding_end' not in translation: + translation['cdna_coding_end'] = spl_tx.convert_genomic_to_cdna( + translation['end'] + ) + if 'cdna_coding_start' not in translation: + translation['cdna_coding_start'] = spl_tx.convert_genomic_to_cdna( + translation['start'] + ) + + if gene.strand == STRAND.NEG: + translation['cdna_coding_start'], translation['cdna_coding_end'] = ( + translation['cdna_coding_end'], + translation['cdna_coding_start'], + ) + except IndexError as err: raise IndexError( f'Invalid specification of CDS ({translation["name"]}: {translation["start"]}-{translation["end"]}) ' f'region on transcript ({transcript["name"]}: {transcript["start"]}-{transcript["end"]}): {err}' ) - if gene.strand == STRAND.NEG: - translation['cdna_coding_start'], translation['cdna_coding_end'] = ( - translation['cdna_coding_end'], - translation['cdna_coding_start'], - ) - tx_length = ( translation['cdna_coding_end'] - translation['cdna_coding_start'] + 1 ) @@ -191,7 +196,7 @@ def parse_annotations_json( for 
region in regions: if region.start < 1 or region.end > tx_length: raise AssertionError( - 'region cannot be outside the translated length' + f'region ({dom["name"]}:{region.start}-{region.end}) cannot be outside the translated length ({tx_length})' ) domains.append( Domain( @@ -203,8 +208,8 @@ def parse_annotations_json( except AssertionError as err: logger.warning(repr(err)) translation = Translation( - translation['cdna_coding_start'], - translation['cdna_coding_end'], + start=translation['cdna_coding_start'], + end=translation['cdna_coding_end'], transcript=spl_tx, domains=domains, name=translation.get('name'), From 8edf65d453f7e9897b75abd20eeebd96acb3a6c0 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 8 Feb 2022 20:24:41 -0800 Subject: [PATCH 118/137] Reformat json to nest translations in v3 --- src/tools/convert_annotations_format.py | 63 ++++++++++++++++++------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/src/tools/convert_annotations_format.py b/src/tools/convert_annotations_format.py index a01176f0..ac2b2671 100644 --- a/src/tools/convert_annotations_format.py +++ b/src/tools/convert_annotations_format.py @@ -7,9 +7,6 @@ import pandas as pd from mavis.annotate.file_io import parse_annotations_json -# pd.set_option('display.width', 250) -pd.options.display.width = 0 - PANDAS_DEFAULT_NA_VALUES = [ '-1.#IND', '1.#QNAN', @@ -92,12 +89,12 @@ def agg_strings_unique(series): def strip_empty_fields(input_obj): - """Remove all empty string fields from some dictionary object to reduce the size""" + """Remove all empty string or null fields from some dictionary object to reduce the size""" if isinstance(input_obj, dict): result = {} for k, v in input_obj.items(): - if v == '' or (isinstance(v, list) and not len(v)): + if v == '' or v is None or (isinstance(v, list) and not len(v)): continue result[k] = strip_empty_fields(v) return result @@ -106,6 +103,22 @@ def strip_empty_fields(input_obj): return input_obj +def 
coerce_number_types(input_obj, fields=['start', 'end', 'coding_cdna_start', 'coding_cdna_end']): + if isinstance(input_obj, dict): + result = {} + for k, v in input_obj.items(): + if k in fields and isinstance(v, str): + if v.lower() in {'', 'null', 'none'}: + continue + result[k] = int(v) + else: + result[k] = coerce_number_types(v) + return result + elif isinstance(input_obj, list): + return [coerce_number_types(v) for v in input_obj] + return input_obj + + def convert_tab_to_json(filepath: str) -> Dict: """ given a file in the std input format (see below) reads and return a list of genes (and sub-objects) @@ -167,10 +180,16 @@ def parse_domain_list(row): logging.warning(f'error in domain: {domain}, {row}, {repr(err)}') return domains + skip_lines = 0 + with open(filepath, 'r') as fh: + lines = fh.readlines() + skip_lines = len([l for l in lines if l.startswith('##')]) + df = pd.read_csv( filepath, + skiprows=skip_lines, dtype={ - 'ensembl_gene_id': str, + '#ensembl_gene_id': str, 'ensembl_transcript_id': str, 'chr': str, 'cdna_coding_start': pd.Int64Dtype(), @@ -185,8 +204,7 @@ def parse_domain_list(row): 'gene_end': int, }, sep='\t', - comment='#', - ) + ).rename(columns={'#ensembl_gene_id': 'ensembl_gene_id'}) for col in ['ensembl_gene_id', 'chr', 'ensembl_transcript_id', 'gene_start', 'gene_end']: if col not in df: @@ -228,19 +246,20 @@ def parse_domain_list(row): 'is_best_transcript': is_best_transcript, 'name': row['ensembl_transcript_id'], 'exons': row.get('genomic_exon_ranges', []), - 'domains': row.get('AA_domain_ranges', []), 'start': row.get('transcript_genomic_start'), 'end': row.get('transcript_genomic_end'), - 'cdna_coding_start': row.get('cdna_coding_start'), - 'cdna_coding_end': row.get('cdna_coding_end'), 'aliases': [], + 'translations': [ + { + 'domains': row.get('AA_domain_ranges', []), + 'cdna_coding_start': row.get('cdna_coding_start'), + 'cdna_coding_end': row.get('cdna_coding_end'), + } + ], } - for int_value in ['start', 'end', 
'cdna_coding_start', 'cdna_coding_end']: - if transcript.get(int_value) is not None: - transcript[int_value] = int(transcript[int_value]) gene['transcripts'].append(transcript) - return {'genes': list(genes.values())} + return coerce_number_types({'genes': list(genes.values())}) def strip_id_field(feature_id) -> Tuple[str, str]: @@ -620,8 +639,6 @@ def feature_key(row, parent=False): short_msg = '. '.join( [line for line in str(err).split('\n') if line.strip()][:3] ) # these can get super long - with open('tmp_out.json', 'w') as fh: - fh.write(json.dumps(result, sort_keys=True, indent=' ')) raise AssertionError(short_msg) # re-strip (mavis adds defaults) result = strip_empty_fields({'genes': list(genes_by_id.values())}) @@ -799,6 +816,10 @@ def convert_mavis_json_2to3(filename): # move translations into sep object for gene in content['genes']: + if gene['strand'] == '1': + gene['strand'] = '+' + elif gene['strand'] == '-1': + gene['strand'] = '-' for transcript in gene.get('transcripts', []): if any(transcript.get(k) for k in ['cdna_coding_start', 'cdna_coding_end', 'domains']): transcript['translations'] = [ @@ -811,12 +832,14 @@ def convert_mavis_json_2to3(filename): del transcript['domains'] del transcript['cdna_coding_start'] del transcript['cdna_coding_end'] + content = coerce_number_types(content) + content = strip_empty_fields(content) parse_annotations_json(content) content = strip_empty_fields(content) return content -if __name__ == '__main__': +def main(): parser = argparse.ArgumentParser() parser.add_argument( 'input', help='path to the tab-delimated mavis v2 style reference annotations file' @@ -849,3 +872,7 @@ def convert_mavis_json_2to3(filename): logging.info(f'writing: {args.output}') with open(args.output, 'w') as fh: fh.write(json.dumps(annotations, sort_keys=True)) + + +if __name__ == '__main__': + main() From 480c66e97f60dcbb1be37126e6b2cb9431ee07f6 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 8 Feb 2022 20:25:13 -0800 Subject: 
[PATCH 119/137] Update ensembl script to generate nested translations --- src/tools/generate_ensembl_json.py | 36 +++++++++++++++++------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/src/tools/generate_ensembl_json.py b/src/tools/generate_ensembl_json.py index 9c44d2f1..91434eea 100755 --- a/src/tools/generate_ensembl_json.py +++ b/src/tools/generate_ensembl_json.py @@ -20,7 +20,6 @@ import simplejson as json from pyensembl import EnsemblRelease - VERSION = "1.0.0" SCRIPT = os.path.abspath(__file__) CACHE_DEFAULT = os.environ["HOME"] + "/.cache" @@ -248,7 +247,7 @@ def __init__( self.alias = defaultdict(set) self.data = EnsemblRelease(release, species) - self.download_pyensembl_cache() + self.download_pyensembl_cache(self.data, self.custom_cache) self.get_domain_cache() if self.best_file: @@ -311,35 +310,38 @@ def get_transcripts(self, eid): if not protein_id: return None + translation = { + 'domains': [], + "name": protein_id, + } result = { "name": str(transcript.transcript_id), "start": int(transcript.start), "end": int(transcript.end), "aliases": [str(transcript.transcript_name)], "is_best_transcript": str(transcript.transcript_id) in self.best, - "protein_id": transcript.protein_id, "exons": [], - "domains": [], + "translations": [translation], } # start/end are absolute genomic positions, so calculate positions relative to the mRNA start cpos = transcript.coding_sequence_position_ranges if transcript.strand in ("+", "1"): - result["cdna_coding_start"] = transcript.spliced_offset(cpos[0][0]) + 1 - result["cdna_coding_end"] = transcript.spliced_offset(cpos[-1][1]) + 1 + translation["cdna_coding_start"] = transcript.spliced_offset(cpos[0][0]) + 1 + translation["cdna_coding_end"] = transcript.spliced_offset(cpos[-1][1]) + 1 elif transcript.strand in ("-", "-1"): - result["cdna_coding_start"] = transcript.spliced_offset(cpos[0][1]) + 1 - result["cdna_coding_end"] = transcript.spliced_offset(cpos[-1][0]) + 1 + 
translation["cdna_coding_start"] = transcript.spliced_offset(cpos[0][1]) + 1 + translation["cdna_coding_end"] = transcript.spliced_offset(cpos[-1][0]) + 1 return result - def get_exons(self, eid): + def get_exons(self, eid: str) -> dict: """ Method parse exon info in the EnsemblRelease into json format. Args: - eid (str): Ensembl exon ID + eid: Ensembl exon ID Returns: - dict: exon info formatted for json + exon info formatted for json """ exon = self.data.exon_by_id(eid) result = {"name": str(exon.exon_id), "start": int(exon.start), "end": int(exon.end)} @@ -347,7 +349,7 @@ def get_exons(self, eid): return result @cached_domains - def get_domains(self, eid): + def get_domains(self, eid: str): """ Method request domain info from Ensembl and parse into json format. Args: @@ -404,8 +406,9 @@ def build_json(self): for eid in self.data.exon_ids_of_transcript_id(tid): exond = self.get_exons(eid) transd["exons"].append(exond) - domains = self.get_domains(transd["protein_id"]) - transd["domains"] = domains + for translation in transd['translations']: + domains = self.get_domains(translation["name"]) + transd["domains"] = domains gened["transcripts"].append(transd) else: count["non_coding"] += 1 @@ -547,7 +550,10 @@ def main(): help="a tab-separated file of Ensembl gene IDs and gene aliases (one ID and one alias per line)", ) opt_parser.add_argument( - "-c", "--custom-cache", help="use a non-default path to cache ensembl data" + "-c", + "--custom-cache", + help="use a non-default path to cache ensembl data", + default=CACHE_DEFAULT, ) opt_parser.add_argument( "-d", From 42fa68249049313e9e9d6e9f40fd54538ea1642f Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 8 Feb 2022 22:26:38 -0800 Subject: [PATCH 120/137] Update references to pre-built annotations files --- docs/inputs/reference.md | 4 +- docs/tutorials/full.md | 2 +- src/mavis/annotate/file_io.py | 16 +++++++- src/tools/convert_annotations_format.py | 52 ++++++++++++++++++------- 
src/tools/generate_ensembl_json.py | 5 +-- src/tools/get_hg19_reference_files.sh | 3 +- src/tools/get_hg38_reference_files.sh | 3 +- 7 files changed, 60 insertions(+), 25 deletions(-) diff --git a/docs/inputs/reference.md b/docs/inputs/reference.md index 854e1058..8c7f0f36 100644 --- a/docs/inputs/reference.md +++ b/docs/inputs/reference.md @@ -15,7 +15,7 @@ not available, | File Name (Type/Format) | Environment Variable | Download | | --------------------------------------------------------------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | [reference genome](../../inputs/reference/#reference-genome) ([fasta](../../glossary/#fasta)) | `MAVIS_REFERENCE_GENOME` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz)
[![](../images/get_app-24px.svg) GRCh38](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.tar.gz) | -| [annotations](../../inputs/reference/#annotations) ([JSON](../../glossary/#json)) | `MAVIS_ANNOTATIONS` | [![](../images/get_app-24px.svg) GRCh37/Hg19 + Ensembl69](http://www.bcgsc.ca/downloads/mavis/ensembl69_hg19_annotations.json)
[![](../images/get_app-24px.svg) GRCh38 + Ensembl79](http://www.bcgsc.ca/downloads/mavis/ensembl79_hg38_annotations.json) | +| [annotations](../../inputs/reference/#annotations) ([JSON](../../glossary/#json)) | `MAVIS_ANNOTATIONS` | [![](../images/get_app-24px.svg) GRCh37/Hg19 + Ensembl69](http://www.bcgsc.ca/downloads/mavis/v3/ensembl69_hg19_annotations.v3.json.gz)
[![](../images/get_app-24px.svg) GRCh38 + Ensembl79](http://www.bcgsc.ca/downloads/mavis/v3/ensembl79_hg38_annotations.v3.json.gz) | | [masking](../../inputs/reference/#masking-file) (text/tabbed) | `MAVIS_MASKING` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://www.bcgsc.ca/downloads/mavis/hg19_masking.tab)
[![](../images/get_app-24px.svg) GRCh38](http://www.bcgsc.ca/downloads/mavis/GRCh38_masking.tab) | | [template metadata](../../inputs/reference/#template-metadata) (text/tabbed) | `MAVIS_TEMPLATE_METADATA` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz)
[![](../images/get_app-24px.svg) GRCh38](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz) | | [DGV annotations](../../inputs/reference/#dgv-database-of-genomic-variants) (text/tabbed) | `MAVIS_DGV_ANNOTATION` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://www.bcgsc.ca/downloads/mavis/dgv_hg19_variants.tab)
[![](../images/get_app-24px.svg) GRCh38](http://www.bcgsc.ca/downloads/mavis/dgv_hg38_variants.tab) | @@ -82,7 +82,7 @@ the ensembl annotations file including non-coding transcripts below. annotations file. On our standard COLO829 we increased the default memory for the annotation step from 12G to 18G. -[![](../images/get_app-24px.svg) GRCh37/Hg19 + Ensembl69 (includes non-coding genes)](http://www.bcgsc.ca/downloads/mavis/ensembl69_hg19_annotations_with_ncrna.json) +[![](../images/get_app-24px.svg) GRCh37/Hg19 + Ensembl69 (includes non-coding genes)](http://www.bcgsc.ca/downloads/mavis/v3/ensembl69_hg19_annotations_with_ncrna.v3.json.gz) !!! warning the `mavis.annotate.file_io.load_reference_genes`{.interpreted-text diff --git a/docs/tutorials/full.md b/docs/tutorials/full.md index dc1828a9..20054def 100644 --- a/docs/tutorials/full.md +++ b/docs/tutorials/full.md @@ -155,7 +155,7 @@ Finally you will need to set output directory and the reference files "reference_inputs/hg19.2bit" ], "reference.annotations": [ - "reference_inputs/ensembl69_hg19_annotations.json" + "reference_inputs/ensembl69_hg19_annotations.v3.json" ], "reference.dgv_annotation": [ "reference_inputs/dgv_hg19_variants.tab" diff --git a/src/mavis/annotate/file_io.py b/src/mavis/annotate/file_io.py index 0f8e9aef..7220fc9c 100644 --- a/src/mavis/annotate/file_io.py +++ b/src/mavis/annotate/file_io.py @@ -113,6 +113,8 @@ def parse_annotations_json( raise AssertionError(short_msg) genes_by_chr: ReferenceAnnotations = {} + tx_skipped = 0 + domain_errors = 0 for gene_dict in data['genes']: @@ -183,7 +185,8 @@ def parse_annotations_json( ) # check that the translation makes sense before including it if tx_length % CODON_SIZE != 0: - logger.warning( + tx_skipped += 1 + logger.debug( f'Ignoring translation ({translation.get("name")}). 
The translated region is not a multiple of three (length={tx_length})' ) continue @@ -206,7 +209,8 @@ def parse_annotations_json( ) ) except AssertionError as err: - logger.warning(repr(err)) + domain_errors += 1 + logger.debug(repr(err)) translation = Translation( start=translation['cdna_coding_start'], end=translation['cdna_coding_end'], @@ -227,6 +231,14 @@ def parse_annotations_json( spl_tx.translations.append(translation) if not best_transcripts_only or has_best: genes_by_chr.setdefault(gene.chr, []).append(gene) + if tx_skipped: + logger.warning( + f'Skipped {tx_skipped} translations where the CDS length was not a multiple of 3' + ) + if domain_errors: + logger.warning( + f'Skipped {domain_errors} domains due to errors (coordinates defined outside the translated region)' + ) return genes_by_chr diff --git a/src/tools/convert_annotations_format.py b/src/tools/convert_annotations_format.py index ac2b2671..775b6d1d 100644 --- a/src/tools/convert_annotations_format.py +++ b/src/tools/convert_annotations_format.py @@ -183,7 +183,7 @@ def parse_domain_list(row): skip_lines = 0 with open(filepath, 'r') as fh: lines = fh.readlines() - skip_lines = len([l for l in lines if l.startswith('##')]) + skip_lines = len([line for line in lines if line.startswith('##')]) df = pd.read_csv( filepath, @@ -815,26 +815,47 @@ def convert_mavis_json_2to3(filename): content = json.load(fh) # move translations into sep object + skipped_tx = 0 + total_tx = 0 for gene in content['genes']: - if gene['strand'] == '1': + if str(gene['strand']) == '1': gene['strand'] = '+' - elif gene['strand'] == '-1': + elif str(gene['strand']) == '-1': gene['strand'] = '-' for transcript in gene.get('transcripts', []): - if any(transcript.get(k) for k in ['cdna_coding_start', 'cdna_coding_end', 'domains']): - transcript['translations'] = [ - { - 'cdna_coding_start': transcript['cdna_coding_start'], - 'cdna_coding_end': transcript['cdna_coding_end'], - 'domains': transcript['domains'], - } - ] - del 
transcript['domains'] + if all(transcript.get(k) for k in ['cdna_coding_start', 'cdna_coding_end']): + total_tx += 1 + translation = { + 'cdna_coding_start': transcript['cdna_coding_start'], + 'cdna_coding_end': transcript['cdna_coding_end'], + 'domains': transcript.get('domains', []), + } + translated_length = ( + 1 + transcript['cdna_coding_end'] - transcript['cdna_coding_start'] + ) + + if 'domains' in transcript: + del transcript['domains'] + del transcript['cdna_coding_start'] del transcript['cdna_coding_end'] + + if translated_length % 3 != 0: + skipped_tx += 1 + logging.debug( + f'Ignoring translation ({transcript.get("name")}). The translated region is not a multiple of three (length={translated_length})' + ) + continue + transcript['translations'] = [translation] + if skipped_tx: + logging.warning( + f'dropped {skipped_tx} / {total_tx} translations for lengths that were not a multiple of 3' + ) content = coerce_number_types(content) content = strip_empty_fields(content) + logging.info('testing new JSON with MAVIS loader') parse_annotations_json(content) + logging.info('removing unnecessary empty fields') content = strip_empty_fields(content) return content @@ -857,8 +878,11 @@ def main(): ) args = parser.parse_args() - - logging.basicConfig(format='{message}', style='{', level=logging.getLevelName(args.log_level)) + logging.basicConfig( + format='{asctime} [{levelname}] {message}', + style='{', + level=logging.getLevelName(args.log_level), + ) if args.input_type == 'v2-tab': annotations = convert_tab_to_json(args.input) diff --git a/src/tools/generate_ensembl_json.py b/src/tools/generate_ensembl_json.py index 91434eea..176caee1 100755 --- a/src/tools/generate_ensembl_json.py +++ b/src/tools/generate_ensembl_json.py @@ -247,7 +247,7 @@ def __init__( self.alias = defaultdict(set) self.data = EnsemblRelease(release, species) - self.download_pyensembl_cache(self.data, self.custom_cache) + self.download_pyensembl_cache() self.get_domain_cache() if 
self.best_file: @@ -260,9 +260,6 @@ def __init__( def download_pyensembl_cache(self): """ Method download the pyensembl cache files for this release if not already there. - Args: - data (EnsemblRelease): pyensembl object for the release info - custom_cache (str): path to cirectory to cache pyensembl files """ if self.custom_cache: os.environ["PYENSEMBL_CACHE_DIR"] = self.custom_cache diff --git a/src/tools/get_hg19_reference_files.sh b/src/tools/get_hg19_reference_files.sh index 3fb40f46..eba597de 100644 --- a/src/tools/get_hg19_reference_files.sh +++ b/src/tools/get_hg19_reference_files.sh @@ -15,7 +15,8 @@ rm -f chr*.fa rm -f chromeFa.tar.gz echo "downloading the gene annotations file" -wget http://www.bcgsc.ca/downloads/mavis/ensembl69_hg19_annotations.json +wget http://www.bcgsc.ca/downloads/mavis/v3/ensembl69_hg19_annotations.v3.json.gz +gunzip ensembl69_hg19_annotations.v3.json.gz echo "downloading the masking file" wget http://www.bcgsc.ca/downloads/mavis/hg19_masking.tab diff --git a/src/tools/get_hg38_reference_files.sh b/src/tools/get_hg38_reference_files.sh index 97c1face..c63bfb52 100644 --- a/src/tools/get_hg38_reference_files.sh +++ b/src/tools/get_hg38_reference_files.sh @@ -5,7 +5,8 @@ wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRC gunzip GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz echo "downloading the gene annotations file" -wget http://www.bcgsc.ca/downloads/mavis/ensembl79_hg38_annotations.json +wget http://www.bcgsc.ca/downloads/mavis/v3/ensembl79_hg38_annotations.v3.json.gz +gunzip ensembl79_hg38_annotations.v3.json.gz echo "downloading the masking file" wget http://www.bcgsc.ca/downloads/mavis/GRCh38_masking.tab From 74a2bf0c189721b4f6ad8081b4622158f8e24c19 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 9 Feb 2022 12:04:06 -0800 Subject: [PATCH 121/137] Support loading gzipped vcfs --- src/mavis/cluster/main.py | 2 +- src/mavis/tools/__init__.py | 2 +- src/mavis/tools/vcf.py | 24 
+++++++++++++++++------- tests/full-tutorial.config.json | 2 +- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/mavis/cluster/main.py b/src/mavis/cluster/main.py index 861a1aea..1e1b5ff9 100644 --- a/src/mavis/cluster/main.py +++ b/src/mavis/cluster/main.py @@ -158,7 +158,7 @@ def main( annotations.content, breakpoint_pairs, max_proximity=config[f'{SECTION}.max_proximity'] ) logger.info( - f'filtered from {len(breakpoint_pairs)} down to {len(pass_clusters)} (removed {uninformative_clusters})' + f'filtered from {len(breakpoint_pairs)} down to {len(pass_clusters)} (removed {len(uninformative_clusters)})' ) breakpoint_pairs = pass_clusters for bpp in uninformative_clusters: diff --git a/src/mavis/tools/__init__.py b/src/mavis/tools/__init__.py index 8649b31f..2dd0bc72 100644 --- a/src/mavis/tools/__init__.py +++ b/src/mavis/tools/__init__.py @@ -294,7 +294,7 @@ def _convert_tool_output( df.columns = [c[1:] if c.startswith('#') else c for c in df.columns] rows = df.where(df.notnull(), None).to_dict('records') if rows: - logger.info('found', len(rows), 'rows') + logger.info(f'found {len(rows)} rows') for row in rows: try: std_rows = _convert_tool_row( diff --git a/src/mavis/tools/vcf.py b/src/mavis/tools/vcf.py index f756df43..c1c07faf 100644 --- a/src/mavis/tools/vcf.py +++ b/src/mavis/tools/vcf.py @@ -1,3 +1,4 @@ +import gzip import logging import re from dataclasses import dataclass @@ -269,13 +270,22 @@ def pandas_vcf(input_file: str) -> Tuple[List[str], pd.DataFrame]: Read a standard vcf file into a pandas dataframe """ # read the comment/header information - header_lines = [] - with open(input_file, 'r') as fh: - line = '##' - while line.startswith('##'): - header_lines.append(line) - line = fh.readline().strip() - header_lines = header_lines[1:] + try: + header_lines = [] + with open(input_file, 'r') as fh: + line = '##' + while line.startswith('##'): + header_lines.append(line) + line = fh.readline().strip() + header_lines = header_lines[1:] 
+ except UnicodeDecodeError: + header_lines = [] + with gzip.open(input_file, 'rt') as fh: + line = '##' + while line.startswith('##'): + header_lines.append(line) + line = fh.readline().strip() + header_lines = header_lines[1:] # read the data df = pd.read_csv( input_file, diff --git a/tests/full-tutorial.config.json b/tests/full-tutorial.config.json index bf125870..69fd41f1 100644 --- a/tests/full-tutorial.config.json +++ b/tests/full-tutorial.config.json @@ -77,7 +77,7 @@ "reference_inputs/hg19.2bit" ], "reference.annotations": [ - "reference_inputs/ensembl69_hg19_annotations.json" + "reference_inputs/ensembl69_hg19_annotations.v3.json" ], "reference.dgv_annotation": [ "reference_inputs/dgv_hg19_variants.tab" From 4663dfd4bf1d29ed7bc66cfa13be47207e1fc1fe Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 9 Feb 2022 12:22:44 -0800 Subject: [PATCH 122/137] Gzip one of the test vcfs --- tests/data/pindel_events.vcf | 50 ------------------------------- tests/data/pindel_events.vcf.gz | Bin 0 -> 1941 bytes tests/end_to_end/test_convert.py | 2 +- 3 files changed, 1 insertion(+), 51 deletions(-) delete mode 100644 tests/data/pindel_events.vcf create mode 100644 tests/data/pindel_events.vcf.gz diff --git a/tests/data/pindel_events.vcf b/tests/data/pindel_events.vcf deleted file mode 100644 index b64481f8..00000000 --- a/tests/data/pindel_events.vcf +++ /dev/null @@ -1,50 +0,0 @@ -##fileformat=VCFv4.0 -##fileDate=april2017 -##source=pindel -##reference=hg19 -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT tumour -1 10097 . TAACCCTAACCC T . PASS END=10108;HOMLEN=11;HOMSEQ=AACCCTAACCC;SVLEN=-11;SVTYPE=DEL GT:AD 0/0:1,1 -1 10107 . C CA . PASS END=10107;HOMLEN=0;SVLEN=1;SVTYPE=INS GT:AD 0/0:1,1 -1 10108 . C CCTAACCCCTAACCCT . PASS END=10108;HOMLEN=0;SVLEN=15;SVTYPE=INS GT:AD 0/0:1,1 -1 10108 . 
C CAACCCTACCCCTACCCCTAACCCCTAACCCCTAACCCCAACCCCTACCCCTAACCCTAACCCTAAACCCT . PASS END=10108;HOMLEN=7;HOMSEQ=AACCCTA;SVLEN=70;SVTYPE=INS GT:AD 0/0:1,1 -1 10110 . ACCCTAACCCTAACCCTAACCCTAACCCTA AAACACAACCCCAAGCCTGAACTCCAGCCTCAACCAAATCCCATCCCCC . PASS END=10139;HOMLEN=0;SVLEN=-29;SVTYPE=RPL;NTLEN=48 GT:AD 0/0:1,1 -1 10113 . C CTAACCCTACCCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCA . PASS END=10113;HOMLEN=8;HOMSEQ=TAACCCTA;SVLEN=49;SVTYPE=INS GT:AD 0/0:1,1 -1 10128 . ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAA ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCCTAACCCTAAGCCTAACCCCTAACCCTAAGCCTAACCCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAA . PASS END=10256;HOMLEN=0;SVLEN=128;SVTYPE=DUP:TANDEM;NTLEN=89 GT:AD 0/0:1,4 -1 10172 . CCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC CCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTTAACCTTAACCTTAACCTTAACCTTAACCTAACCCTAACCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC . PASS END=10233;HOMLEN=0;SVLEN=61;SVTYPE=DUP:TANDEM;NTLEN=63 GT:AD 0/0:0,1 -1 10212 . ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAA ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCCTAACCCTAAGCCTAACCCCTAACCCTAAGCCTAACCCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAA . PASS END=10256;HOMLEN=0;SVLEN=44;SVTYPE=DUP:TANDEM;NTLEN=89 GT:AD 0/0:0,4 -1 10334 . TAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAA TCGCCCTAACCTTAACCCCCCACCCTCACCCAAACCCCCACCCCTCACCCCCACCCCCAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAC . PASS END=10371;HOMLEN=0;SVLEN=-37;SVTYPE=RPL;NTLEN=110 GT:AD 0/0:0,1 -1 10387 . 
TAACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCC TAACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCAAACCCTAACCCTAACCCTAACCCCAACCCTAACCCCAACCCAAACCCCAAACCCAACCCCCACCCATAACCAAACCGCAACACAACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCC . PASS END=10430;HOMLEN=0;SVLEN=43;SVTYPE=DUP:TANDEM;NTLEN=103 GT:AD 0/0:1,1 -1 10398 . CCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTC CCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCCCCCGACCCTAGCCCTCACCCTTACCCTCTCCCTCTTTTTTACTGATACGGCGACCACCGAGATCTCCCCTCTTCCCCTCCACGCCGCTCTCCCGATCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTC . PASS END=10480;HOMLEN=0;SVLEN=82;SVTYPE=DUP:TANDEM;NTLEN=97 GT:AD 0/0:18,1 -1 10399 . CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCG CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGCTATCCCTAACCCTGGACCTCACACTTTTTTTCAAGCAGAAGCCGGCATACGCGATATTACAGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCG . PASS END=10472;HOMLEN=0;SVLEN=73;SVTYPE=DUP:TANDEM;NTLEN=96 GT:AD 0/0:17,1 -1 10399 . CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCC CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCTCACCCTCACACTCGCCCTACGCCTGACCCTATTTTTTCAAGCAGAAGACGGCATACGAGATATATAGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCC . PASS END=10484;HOMLEN=0;SVLEN=85;SVTYPE=DUP:TANDEM;NTLEN=101 GT:AD 0/0:18,1 -1 10403 . ACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGC ACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCCCTAACCCTAACCCTAACCCCAACCCTACCCCTTCCCCTCACCCCTCGCCCTACCCCAAATCATAGCGCCTCCCGTTCCGACGCCCGCTCTCCCGCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGC . 
PASS END=10471;HOMLEN=0;SVLEN=68;SVTYPE=DUP:TANDEM;NTLEN=95 GT:AD 0/0:17,1 -1 10409 . ACCCTAACCCTAACCCTAACCCTAACCCTAAC A . PASS END=10440;HOMLEN=28;HOMSEQ=CCCTAACCCTAACCCTAACCCTAACCCT;SVLEN=-31;SVTYPE=DEL GT:AD 0/0:2,2 -1 10415 . ACCCTAACCCTAACCCTAACCCTAAC A . PASS END=10440;HOMLEN=27;HOMSEQ=CCCTAACCCTAACCCTAACCCTAACCC;SVLEN=-25;SVTYPE=DEL GT:AD 0/0:2,1 -1 10421 . A ACCCTAACCCTAACCCTAACCCCTAACCCTACCCCAACCCCTAC . PASS END=10421;HOMLEN=30;HOMSEQ=CCCTAACCCTAACCCTAACCCCTAACCCTA;SVLEN=43;SVTYPE=INS GT:AD 0/0:0,1 -1 10421 . ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCGCCCGCC ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCGCCCGCCGCCACCCGCCCCCCCGTTTTTTCATGCTACGGCGACCACCGAGACCTACACTCTTTCCCTACACCCCGCCCTTCCGCCCTACCCTAACCCTAACCCCAACCCTTACCCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCGCCCGCC . PASS END=10496;HOMLEN=0;SVLEN=75;SVTYPE=DUP:TANDEM;NTLEN=106 GT:AD 0/0:18,1 -1 10439 . ACCCCTAACCCTAACCCTAACCCTAACCCTC AGCTCGACAGAGCACACATCTGAACCCCGGTCACTATAATATCCCCTAAGCCGTCTTCTGCTTCAAAAGCTGAGCCGCACGCCAGCCGTAGTCCCCGCCCCA . PASS END=10469;HOMLEN=0;SVLEN=-30;SVTYPE=RPL;NTLEN=101 GT:AD 0/0:15,1 -1 10440 . C CCCCTAA . PASS END=10440;HOMLEN=28;HOMSEQ=CCCTAACCCTAACCCTAACCCTAACCCT;SVLEN=6;SVTYPE=INS GT:AD 0/0:3,1 -1 10440 . C CCCCTAACCCTAA . PASS END=10440;HOMLEN=28;HOMSEQ=CCCTAACCCTAACCCTAACCCTAACCCT;SVLEN=12;SVTYPE=INS GT:AD 0/0:3,2 -1 10440 . C CCCCTAACCCCTAACCCTAA . PASS END=10440;HOMLEN=9;HOMSEQ=CCCTAACCC;SVLEN=19;SVTYPE=INS GT:AD 0/0:3,3 -1 10440 . C CCCCTAACCCCTAACCCTAACCCTAA . PASS END=10440;HOMLEN=9;HOMSEQ=CCCTAACCC;SVLEN=25;SVTYPE=INS GT:AD 0/0:3,1 -1 10440 . CCCCTAACCCTAACCCTAACCCTAACCCTCGCGGT CGCTCACCAGAGCCCCGAGCAGAGCAAGAGCGTAGACCTCGGCGGTCGCCGTAACCTTAAAAAAAACCATAACCATAACCATATCCCTGCCCTACCACAA . PASS END=10474;HOMLEN=0;SVLEN=-34;SVTYPE=RPL;NTLEN=99 GT:AD 0/0:18,1 -1 10440 . 
CCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCGCCCGCCCGGGTCTGACCT CCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCGCCCGCCCGGGTCTGACCTCCGCCAACTCTGCCGGCACCCCCGACCTCCCCCCCCTCCTTTTTTAATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCGCCCGCCCGGGTCTGACCT . PASS END=10508;HOMLEN=0;SVLEN=68;SVTYPE=DUP:TANDEM;NTLEN=115 GT:AD 0/0:18,2 -1 10440 . C CCCCTAACCCTAACTCTAGCACTCTAACCCTCTAACACTCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAA . PASS END=10440;HOMLEN=13;HOMSEQ=CCCTAACCCTAAC;SVLEN=80;SVTYPE=INS GT:AD 0/0:3,1 -1 10440 . CCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCGCCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCC CCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCGCCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCCCACCGCACCCGACCTTCCCCGTGCCCGCGTCCACCCCCTCCTTTTTTAATGATACTGCCACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCGCCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCC . PASS END=10528;HOMLEN=0;SVLEN=88;SVTYPE=DUP:TANDEM;NTLEN=116 GT:AD 0/0:18,1 -1 10442 . CCTAACCCTAACCCTAA CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAAACCCAACCCTAACACAAAACCTCACCCCTGAACTCACGCCAC . PASS END=10458;HOMLEN=0;SVLEN=-16;SVTYPE=RPL;NTLEN=92 GT:AD 0/0:10,1 -1 10442 . C CCCCTAACCCTAACCCTAA . PASS END=10442;HOMLEN=1;HOMSEQ=C;SVLEN=18;SVTYPE=INS GT:AD 0/0:4,1 -1 10442 . CCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCGCCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCC CTAACGGGAACCCTAACCCTAACGCTCCCCCCACACTCATATCTCGTACCACGGCTGCAGCACGAAGGTTGGCCTCAGGTGGGACTGTCACCTTGAGT . PASS END=10525;HOMLEN=0;SVLEN=-83;SVTYPE=RPL;NTLEN=97 GT:AD 0/0:18,1 -1 10447 . C CCCCTAACCCTAA . PASS END=10447;HOMLEN=2;HOMSEQ=CC;SVLEN=12;SVTYPE=INS GT:AD 0/0:5,2 -1 10452 . ACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCGCCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGT AGATCGGAAGACCACACTCCTGAACTCCAGTCACTATAATATCCCTTATCCCGTCTCCTCCTTGAAAAAAAAACCTCACACACAACCGCACCCACTTTGCAATCCAA . 
PASS END=10551;HOMLEN=0;SVLEN=-99;SVTYPE=RPL;NTLEN=106 GT:AD 0/0:18,1 -1 10459 . C . PASS END=249239947;HOMLEN=0;SVLEN=249229492;SVTYPE=INV;NTLEN=0,0 GT:AD 0/1:11,3 diff --git a/tests/data/pindel_events.vcf.gz b/tests/data/pindel_events.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..79b9c756b99ae71ea41b7f67bf3455cc0257cc2f GIT binary patch literal 1941 zcmV;G2Wt2qiwFp{-g#gE18`|>WMyn$Wp-t5baO6tV`c!&S?h1wND%+*{uNgHmE`;c zJ1Lb`hHN7dl7?-h`{=+XSS2=RJFRkm{q5}7#!qlka(5st-iK#qfAiSc;nSywV8b7x z_{mSlcizQM>s0&jNe)i@l#l&w9Bd3tABn*vdXATTybZz?--v$9A9&0|F|>ZvT`_Pu zzqlTMzMPDw^T}ZT{B+Obu|8lgTRy%FQ~t=~!GtHvIM}8^6plYm!5Op_|(n~Oe7nb=S$mjUC%f7)(% zqAvY7ekpIZv8ToKTTcEI$uH742p_A|7yPehfd6QlTHN&na}qpcst4_C5ZZg5{^C4T zZ>JZkFEmSLO!YS#F1B1}!~VRP*7&q-5GEvWY5crQpJTD7R>bD4B=_kax#xv00Ix&{ zpYS+E8{2Mtgi;tchp}a};*!tB_3ag5sPxFVTQH0qk!sL{1-^on5xSX-+tpXOJpeRcE{3PPn2VK!M%^fMH3M0hxJZu#v z25!h=p@`5m)-_#o&M~%(b^RPYa$H(}E?s4OA|}zA$CK$yq3n!I6zvb~Odsfyldb_L zPw_~d{wSxfWwZ)Pm-9s{B?l>G5@bK!au;;%o`kdQd8M}Hhcp%x?S6SiH5fDTM%n@K zbWM;XO>U)D0p7&R1*Bq(iVhPUc$O1l34>W^gubg%nr?{5kFo3D4aNVBW0LHtJ1tALtj z9nhiPiRaKP;i+XlXy`_7AAFs^{4Z>~uN_)eKQq)aGc--Bb|mL*QwD{3v`~E}rU<7n zBwLaVq5NK{NK3jGs&MxK7z0j zxkwIE7$@Fq#IJy7nOehA%uY)YaJ8Irrt6d<)|;Z~t`$3?6phS#51R0yI7bU>WZ5Lc zR(=TNETW}Ut_Q4%7(f!R2!F){1vTIVEI2+4RgF0fjX6fA#zbOWQkYX{%#|8DmZV1{ zC7#)CgaY)SBa&%_$#haE0<h>ddSk}tuowSnM4?y)!WLXjAiI*P&9;W;9xvn!(=!{66iSNItvI-zIr?MJOW$y`t zdYe~SntAYM)Kk>sP-%T3dzSBJF$_Do`n>FJ2D*{D6ay5MA(s@f?kF3rc~RpiA9knD zuHEKEmge@*f>O(vEUSpBMseHQN2ACzItmWPz(8?yd#^bA1R3=Zq_`g2U28HI8%Y3P)!=rl5l>NQ!b%J!QcO=)J7;N8b0MltJxo`=4GK@Fl*JwK4R z5lkYytdsi+^L8%Q;*wp3iZ>!q)t%e85RCT4w_$61Gjq?{Ba%WOK2SJ|A?U-GVf6>( zq7ZB#>JvQX;WZFkMti^$&I%Y}RE}eIg-gT%kwaN8)mX!3y~xyBt^n1*ZRZ=aw5*gT z1v*Y-7`EHI+0OHroOPp%wb8-4hXigzfSg-L4a^*v2f4kKr_oVDt$};kcNBS=f%{5( zlRlhTu57{Th+Hj~duWJ6dTaJ6u48#C{XTbexSG|^xJIgRVp>gGb=_7U?7Oq}s31C& zVRpujLj}X5jCff_jzJRV@B?=uyi<{mJ5?iQ%3PzI10KpUUc32vmN& zU2eq?7a0eOELQ%O`d#t9hksy!x)Bxf1h%7)|7s^WjHb{cC=&8`nYw#;<9H{^XgGK# 
z6EDkRDZ{rZ%S%fQt7CYOzsBm0YICh9RqfDd*rBt}4%?4u%V@hsD*ZN7YD;w|v{?*y z`F#QZ(ZK_{o^IHUeVpjSmKbN0H=tEF&}Kk&S12ys#eaguvE;N{zqKEwBWM+Y7AtBr zX&w;K?Bq$oJc_3Tg_OsWkpk|?BFCh7y%vu^3aES1*x{mFw;H>Fwqv%S-C7H>tkK~_ zr1*2x*hQQJ*@m5mMZ2~B{yR>Cg@Oz@z)Pl>Op#tqvCfd8T>9g&mU=S$q4IEHi0tl6 z$C3sXRx#Ab3KZl8K&%FkZ8uBMZp;6bU5T|V3qQ)|%lX}xGG)VZ4bye4#(&c=VYs5% bVSktL&<49Qw|=JUA`SZoA`BWhR3rcZJeR;i literal 0 HcmV?d00001 diff --git a/tests/end_to_end/test_convert.py b/tests/end_to_end/test_convert.py index 81f57c1b..0e23328d 100644 --- a/tests/end_to_end/test_convert.py +++ b/tests/end_to_end/test_convert.py @@ -96,7 +96,7 @@ def test_manta(self): assert somatic_event.data.get('SOMATIC', False) is True def test_pindel(self): - self.run_main(get_data('pindel_events.vcf'), SUPPORTED_TOOL.PINDEL, False) + self.run_main(get_data('pindel_events.vcf.gz'), SUPPORTED_TOOL.PINDEL, False) def test_transabyss(self): self.run_main(get_data('transabyss_indels_output.tab'), SUPPORTED_TOOL.TA, False) From 8e90dc7e282a03ec21d69b7e732aa9e7d95363a6 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 9 Feb 2022 12:23:00 -0800 Subject: [PATCH 123/137] Add requests to tools dependencies --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 371d7e35..38a05509 100644 --- a/setup.cfg +++ b/setup.cfg @@ -90,6 +90,7 @@ deploy = tools = pyensembl simplejson + requests [options.entry_points] console_scripts = From 777dc0a6ec7efe70717e55f29adb664d3825972e Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 9 Feb 2022 13:35:09 -0800 Subject: [PATCH 124/137] Rename tools ubmodule to convert to match step name --- src/mavis/{tools => convert}/__init__.py | 0 src/mavis/{tools => convert}/breakdancer.py | 0 src/mavis/{tools => convert}/chimerascan.py | 0 src/mavis/{tools => convert}/cnvnator.py | 0 src/mavis/{tools => convert}/constants.py | 0 src/mavis/{tools => convert}/starfusion.py | 0 src/mavis/{tools => convert}/transabyss.py | 0 src/mavis/{tools => convert}/vcf.py | 0 
src/mavis/main.py | 2 +- 9 files changed, 1 insertion(+), 1 deletion(-) rename src/mavis/{tools => convert}/__init__.py (100%) rename src/mavis/{tools => convert}/breakdancer.py (100%) rename src/mavis/{tools => convert}/chimerascan.py (100%) rename src/mavis/{tools => convert}/cnvnator.py (100%) rename src/mavis/{tools => convert}/constants.py (100%) rename src/mavis/{tools => convert}/starfusion.py (100%) rename src/mavis/{tools => convert}/transabyss.py (100%) rename src/mavis/{tools => convert}/vcf.py (100%) diff --git a/src/mavis/tools/__init__.py b/src/mavis/convert/__init__.py similarity index 100% rename from src/mavis/tools/__init__.py rename to src/mavis/convert/__init__.py diff --git a/src/mavis/tools/breakdancer.py b/src/mavis/convert/breakdancer.py similarity index 100% rename from src/mavis/tools/breakdancer.py rename to src/mavis/convert/breakdancer.py diff --git a/src/mavis/tools/chimerascan.py b/src/mavis/convert/chimerascan.py similarity index 100% rename from src/mavis/tools/chimerascan.py rename to src/mavis/convert/chimerascan.py diff --git a/src/mavis/tools/cnvnator.py b/src/mavis/convert/cnvnator.py similarity index 100% rename from src/mavis/tools/cnvnator.py rename to src/mavis/convert/cnvnator.py diff --git a/src/mavis/tools/constants.py b/src/mavis/convert/constants.py similarity index 100% rename from src/mavis/tools/constants.py rename to src/mavis/convert/constants.py diff --git a/src/mavis/tools/starfusion.py b/src/mavis/convert/starfusion.py similarity index 100% rename from src/mavis/tools/starfusion.py rename to src/mavis/convert/starfusion.py diff --git a/src/mavis/tools/transabyss.py b/src/mavis/convert/transabyss.py similarity index 100% rename from src/mavis/tools/transabyss.py rename to src/mavis/convert/transabyss.py diff --git a/src/mavis/tools/vcf.py b/src/mavis/convert/vcf.py similarity index 100% rename from src/mavis/tools/vcf.py rename to src/mavis/convert/vcf.py diff --git a/src/mavis/main.py b/src/mavis/main.py index 
c7e94bbe..fac18049 100644 --- a/src/mavis/main.py +++ b/src/mavis/main.py @@ -17,11 +17,11 @@ from .align import get_aligner_version from .annotate import main as annotate_main from .cluster import main as cluster_main +from .convert import SUPPORTED_TOOL, convert_tool_output from .overlay import check_overlay_args from .overlay import main as overlay_main from .pairing import main as pairing_main from .summary import main as summary_main -from .tools import SUPPORTED_TOOL, convert_tool_output from .util import filepath from .validate import main as validate_main From 428328b3fec59da426fee339009c67d956caf699 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 9 Feb 2022 13:35:26 -0800 Subject: [PATCH 125/137] Do not split tests by type - Make it easier to find tests for a particular part of MAVIS by mimicking the directory structure of the package itself. - To avoid a large number of changes and an untraceable diff, add 2 as suffix to all conflicting filenames --- .../{unit => }/data/calc_orf_test_sequence.fa | 0 tests/{unit => }/data/reference_sequences.fa | 0 .../data/test_assembly_sequences.txt | 0 tests/{end_to_end => test_mavis}/__init__.py | 0 .../{unit => test_mavis/annotate}/__init__.py | 0 .../annotate}/test_annotate.py | 5 +- .../annotate/test_annotate2.py} | 23 +++--- .../annotate}/test_annotate_examples.py | 4 +- .../annotate}/test_annotate_fileio.py | 0 .../annotate/test_annotate_fileio2.py} | 2 +- .../annotate}/test_call_indels.py | 2 +- .../annotate}/test_splicing.py | 5 +- .../config.py => test_mavis/bam/__init__.py} | 0 .../bam}/test_bam.py | 5 +- .../bam}/test_bam_cigar.py | 4 +- tests/test_mavis/cluster/__init__.py | 0 .../cluster}/test_cluster.py | 0 .../cluster/test_cluster2.py} | 4 +- tests/test_mavis/convert/__init__.py | 0 .../convert}/test_convert.py | 4 +- .../{unit => test_mavis/convert}/test_tool.py | 8 +-- .../convert}/test_tools_vcf.py | 6 +- tests/test_mavis/illustrate/__init__.py | 0 .../illustrate}/test_illustrate.py | 0 
.../illustrate/test_illustrate2.py} | 4 +- .../__init__.py => test_mavis/mock.py} | 71 +++++++++++++------ tests/test_mavis/pairing/__init__.py | 0 .../pairing}/test_pairing.py | 0 tests/test_mavis/summary/__init__.py | 0 .../summary}/test_summary.py | 2 +- .../{integration => test_mavis}/test_align.py | 3 +- .../{integration => test_mavis}/test_args.py | 0 tests/{unit => test_mavis}/test_assemble.py | 7 +- .../test_assemble2.py} | 2 +- tests/{unit => test_mavis}/test_bam.py | 0 tests/{unit => test_mavis}/test_blat.py | 0 .../test_blat.py => test_mavis/test_blat2.py} | 2 +- tests/{unit => test_mavis}/test_breakpoint.py | 0 .../test_breakpoint2.py} | 2 +- tests/{unit => test_mavis}/test_constants.py | 0 tests/{end_to_end => test_mavis}/test_help.py | 0 tests/{unit => test_mavis}/test_interval.py | 0 .../test_overlay.py | 0 tests/{unit => test_mavis}/test_util.py | 0 tests/test_mavis/validate/__init__.py | 0 .../validate/test_call.py} | 4 +- .../validate/test_evidence.py} | 2 +- .../validate}/test_validate.py | 2 +- .../validate/test_validate2.py} | 4 +- tests/test_tools/__init__.py | 0 .../data/Homo_sapiens.GRCh38.kras.gff3 | 0 .../data/Homo_sapiens.GRCh38.kras.gff3.json | 0 .../data/Homo_sapiens.GRCh38.kras.gtf | 0 .../data/Homo_sapiens.GRCh38.kras.gtf.json | 0 .../{tools => test_tools}/data/K02718.1.gff3 | 0 .../data/K02718.1.gff3.json | 0 tests/{tools => test_tools}/data/K02718.1.gtf | 0 .../data/K02718.1.gtf.json | 0 .../data/ensembl69_hg19_annotations.kras.tab | 0 .../ensembl69_hg19_annotations.kras.tab.json | 0 .../data/example_genes.v2.json | 0 .../data/example_genes.v3.json | 0 .../test_convert_annotations_format.py | 1 - .../test_ref_alt_count.py | 0 tests/unit/mock.py | 45 ------------ 65 files changed, 98 insertions(+), 125 deletions(-) rename tests/{unit => }/data/calc_orf_test_sequence.fa (100%) rename tests/{unit => }/data/reference_sequences.fa (100%) rename tests/{unit => }/data/test_assembly_sequences.txt (100%) rename tests/{end_to_end => 
test_mavis}/__init__.py (100%) rename tests/{unit => test_mavis/annotate}/__init__.py (100%) rename tests/{unit => test_mavis/annotate}/test_annotate.py (99%) rename tests/{integration/test_annotate.py => test_mavis/annotate/test_annotate2.py} (99%) rename tests/{integration => test_mavis/annotate}/test_annotate_examples.py (98%) rename tests/{unit => test_mavis/annotate}/test_annotate_fileio.py (100%) rename tests/{integration/test_annotate_fileio.py => test_mavis/annotate/test_annotate_fileio2.py} (89%) rename tests/{unit => test_mavis/annotate}/test_call_indels.py (99%) rename tests/{integration => test_mavis/annotate}/test_splicing.py (98%) rename tests/{integration/config.py => test_mavis/bam/__init__.py} (100%) rename tests/{integration => test_mavis/bam}/test_bam.py (99%) rename tests/{integration => test_mavis/bam}/test_bam_cigar.py (99%) create mode 100644 tests/test_mavis/cluster/__init__.py rename tests/{unit => test_mavis/cluster}/test_cluster.py (100%) rename tests/{integration/test_cluster.py => test_mavis/cluster/test_cluster2.py} (99%) create mode 100644 tests/test_mavis/convert/__init__.py rename tests/{end_to_end => test_mavis/convert}/test_convert.py (98%) rename tests/{unit => test_mavis/convert}/test_tool.py (99%) rename tests/{unit => test_mavis/convert}/test_tools_vcf.py (90%) create mode 100644 tests/test_mavis/illustrate/__init__.py rename tests/{unit => test_mavis/illustrate}/test_illustrate.py (100%) rename tests/{integration/test_illustrate.py => test_mavis/illustrate/test_illustrate2.py} (99%) rename tests/{integration/__init__.py => test_mavis/mock.py} (87%) create mode 100644 tests/test_mavis/pairing/__init__.py rename tests/{integration => test_mavis/pairing}/test_pairing.py (100%) create mode 100644 tests/test_mavis/summary/__init__.py rename tests/{unit => test_mavis/summary}/test_summary.py (99%) rename tests/{integration => test_mavis}/test_align.py (99%) rename tests/{integration => test_mavis}/test_args.py (100%) rename 
tests/{unit => test_mavis}/test_assemble.py (97%) rename tests/{integration/test_assemble.py => test_mavis/test_assemble2.py} (99%) rename tests/{unit => test_mavis}/test_bam.py (100%) rename tests/{unit => test_mavis}/test_blat.py (100%) rename tests/{integration/test_blat.py => test_mavis/test_blat2.py} (99%) rename tests/{unit => test_mavis}/test_breakpoint.py (100%) rename tests/{integration/test_breakpoint.py => test_mavis/test_breakpoint2.py} (99%) rename tests/{unit => test_mavis}/test_constants.py (100%) rename tests/{end_to_end => test_mavis}/test_help.py (100%) rename tests/{unit => test_mavis}/test_interval.py (100%) rename tests/{end_to_end => test_mavis}/test_overlay.py (100%) rename tests/{unit => test_mavis}/test_util.py (100%) create mode 100644 tests/test_mavis/validate/__init__.py rename tests/{integration/test_validate_call.py => test_mavis/validate/test_call.py} (99%) rename tests/{integration/test_validate_evidence.py => test_mavis/validate/test_evidence.py} (99%) rename tests/{unit => test_mavis/validate}/test_validate.py (98%) rename tests/{integration/test_validate.py => test_mavis/validate/test_validate2.py} (99%) create mode 100644 tests/test_tools/__init__.py rename tests/{tools => test_tools}/data/Homo_sapiens.GRCh38.kras.gff3 (100%) rename tests/{tools => test_tools}/data/Homo_sapiens.GRCh38.kras.gff3.json (100%) rename tests/{tools => test_tools}/data/Homo_sapiens.GRCh38.kras.gtf (100%) rename tests/{tools => test_tools}/data/Homo_sapiens.GRCh38.kras.gtf.json (100%) rename tests/{tools => test_tools}/data/K02718.1.gff3 (100%) rename tests/{tools => test_tools}/data/K02718.1.gff3.json (100%) rename tests/{tools => test_tools}/data/K02718.1.gtf (100%) rename tests/{tools => test_tools}/data/K02718.1.gtf.json (100%) rename tests/{tools => test_tools}/data/ensembl69_hg19_annotations.kras.tab (100%) rename tests/{tools => test_tools}/data/ensembl69_hg19_annotations.kras.tab.json (100%) rename tests/{tools => 
test_tools}/data/example_genes.v2.json (100%) rename tests/{tools => test_tools}/data/example_genes.v3.json (100%) rename tests/{tools => test_tools}/test_convert_annotations_format.py (99%) rename tests/{end_to_end => test_tools}/test_ref_alt_count.py (100%) delete mode 100644 tests/unit/mock.py diff --git a/tests/unit/data/calc_orf_test_sequence.fa b/tests/data/calc_orf_test_sequence.fa similarity index 100% rename from tests/unit/data/calc_orf_test_sequence.fa rename to tests/data/calc_orf_test_sequence.fa diff --git a/tests/unit/data/reference_sequences.fa b/tests/data/reference_sequences.fa similarity index 100% rename from tests/unit/data/reference_sequences.fa rename to tests/data/reference_sequences.fa diff --git a/tests/unit/data/test_assembly_sequences.txt b/tests/data/test_assembly_sequences.txt similarity index 100% rename from tests/unit/data/test_assembly_sequences.txt rename to tests/data/test_assembly_sequences.txt diff --git a/tests/end_to_end/__init__.py b/tests/test_mavis/__init__.py similarity index 100% rename from tests/end_to_end/__init__.py rename to tests/test_mavis/__init__.py diff --git a/tests/unit/__init__.py b/tests/test_mavis/annotate/__init__.py similarity index 100% rename from tests/unit/__init__.py rename to tests/test_mavis/annotate/__init__.py diff --git a/tests/unit/test_annotate.py b/tests/test_mavis/annotate/test_annotate.py similarity index 99% rename from tests/unit/test_annotate.py rename to tests/test_mavis/annotate/test_annotate.py index 339d86ed..950b1558 100644 --- a/tests/unit/test_annotate.py +++ b/tests/test_mavis/annotate/test_annotate.py @@ -1,5 +1,4 @@ import itertools -import os import pytest import timeout_decorator @@ -7,7 +6,7 @@ from mavis.annotate.protein import Domain, DomainRegion, calculate_orf from mavis.annotate.variant import IndelCall -DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') +from ...util import get_data class TestDomainAlignSeq: @@ -279,7 +278,7 @@ class TestCalculateORF: 
@timeout_decorator.timeout(20) def test_very_long(self): # load the sequence - with open(os.path.join(DATA_DIR, 'calc_orf_test_sequence.fa'), 'r') as fh: + with open(get_data('calc_orf_test_sequence.fa'), 'r') as fh: seq = fh.readlines()[0].strip() calculate_orf(seq, 300) diff --git a/tests/integration/test_annotate.py b/tests/test_mavis/annotate/test_annotate2.py similarity index 99% rename from tests/integration/test_annotate.py rename to tests/test_mavis/annotate/test_annotate2.py index d6e5e2ed..0ab63ddc 100644 --- a/tests/integration/test_annotate.py +++ b/tests/test_mavis/annotate/test_annotate2.py @@ -5,22 +5,21 @@ from mavis.annotate.base import BioInterval, ReferenceName from mavis.annotate.file_io import load_annotations, load_reference_genome from mavis.annotate.fusion import FusionTranscript, determine_prime -from mavis.annotate.genomic import Exon, Gene, PreTranscript, Template, Transcript -from mavis.annotate.protein import Domain, DomainRegion, Translation, calculate_orf, translate -from mavis.annotate.variant import ( - Annotation, - _gather_annotations, - _gather_breakpoint_annotations, - annotate_events, - overlapping_transcripts, -) +from mavis.annotate.genomic import (Exon, Gene, PreTranscript, Template, + Transcript) +from mavis.annotate.protein import (Domain, DomainRegion, Translation, + calculate_orf, translate) +from mavis.annotate.variant import (Annotation, _gather_annotations, + _gather_breakpoint_annotations, + annotate_events, overlapping_transcripts) from mavis.breakpoint import Breakpoint, BreakpointPair -from mavis.constants import ORIENT, PRIME, PROTOCOL, STRAND, SVTYPE, reverse_complement +from mavis.constants import (ORIENT, PRIME, PROTOCOL, STRAND, SVTYPE, + reverse_complement) from mavis.error import NotSpecifiedError from mavis.interval import Interval -from ..util import get_data -from . 
import MockObject, get_example_genes +from ...util import get_data +from ..mock import MockObject, get_example_genes REFERENCE_ANNOTATIONS = None REFERENCE_GENOME = None diff --git a/tests/integration/test_annotate_examples.py b/tests/test_mavis/annotate/test_annotate_examples.py similarity index 98% rename from tests/integration/test_annotate_examples.py rename to tests/test_mavis/annotate/test_annotate_examples.py index faf0297b..8e449dc0 100644 --- a/tests/integration/test_annotate_examples.py +++ b/tests/test_mavis/annotate/test_annotate_examples.py @@ -9,8 +9,8 @@ from mavis.breakpoint import Breakpoint, BreakpointPair from mavis.constants import ORIENT, PROTOCOL, SPLICE_TYPE, STRAND, SVTYPE -from ..util import long_running_test -from . import MockLongString, MockObject, get_example_genes +from ...util import long_running_test +from ..mock import MockLongString, MockObject, get_example_genes def get_best(gene): diff --git a/tests/unit/test_annotate_fileio.py b/tests/test_mavis/annotate/test_annotate_fileio.py similarity index 100% rename from tests/unit/test_annotate_fileio.py rename to tests/test_mavis/annotate/test_annotate_fileio.py diff --git a/tests/integration/test_annotate_fileio.py b/tests/test_mavis/annotate/test_annotate_fileio2.py similarity index 89% rename from tests/integration/test_annotate_fileio.py rename to tests/test_mavis/annotate/test_annotate_fileio2.py index 53572a15..86c5a780 100644 --- a/tests/integration/test_annotate_fileio.py +++ b/tests/test_mavis/annotate/test_annotate_fileio2.py @@ -1,6 +1,6 @@ from mavis.annotate.file_io import load_annotations -from ..util import get_data +from ...util import get_data JSON = get_data('annotations_subsample.json') diff --git a/tests/unit/test_call_indels.py b/tests/test_mavis/annotate/test_call_indels.py similarity index 99% rename from tests/unit/test_call_indels.py rename to tests/test_mavis/annotate/test_call_indels.py index 840947df..b809a2ab 100644 --- a/tests/unit/test_call_indels.py +++ 
b/tests/test_mavis/annotate/test_call_indels.py @@ -1,7 +1,7 @@ import pytest from mavis.annotate.variant import IndelCall, call_protein_indel -from .mock import Mock, MockFunction +from ..mock import Mock, MockFunction class TestIndelCall: diff --git a/tests/integration/test_splicing.py b/tests/test_mavis/annotate/test_splicing.py similarity index 98% rename from tests/integration/test_splicing.py rename to tests/test_mavis/annotate/test_splicing.py index 471c2bf1..734cbf70 100644 --- a/tests/integration/test_splicing.py +++ b/tests/test_mavis/annotate/test_splicing.py @@ -6,10 +6,11 @@ from mavis.annotate.splicing import predict_splice_sites from mavis.annotate.variant import annotate_events from mavis.breakpoint import Breakpoint, BreakpointPair -from mavis.constants import PROTOCOL, SPLICE_TYPE, STRAND, SVTYPE, reverse_complement +from mavis.constants import (PROTOCOL, SPLICE_TYPE, STRAND, SVTYPE, + reverse_complement) from mavis.interval import Interval -from . import MockLongString, MockObject, get_example_genes +from ..mock import MockLongString, MockObject, get_example_genes EXAMPLE_GENES = None diff --git a/tests/integration/config.py b/tests/test_mavis/bam/__init__.py similarity index 100% rename from tests/integration/config.py rename to tests/test_mavis/bam/__init__.py diff --git a/tests/integration/test_bam.py b/tests/test_mavis/bam/test_bam.py similarity index 99% rename from tests/integration/test_bam.py rename to tests/test_mavis/bam/test_bam.py index 9ccbc09d..4f2132e7 100644 --- a/tests/integration/test_bam.py +++ b/tests/test_mavis/bam/test_bam.py @@ -1,5 +1,4 @@ import argparse -import logging import warnings from unittest import mock @@ -19,8 +18,8 @@ from mavis.constants import CIGAR, DNA_ALPHABET, ORIENT, READ_PAIR_TYPE, STRAND, SVTYPE from mavis.interval import Interval -from ..util import get_data -from . 
import MockBamFileHandle, MockRead +from ...util import get_data +from ..mock import MockBamFileHandle, MockRead REFERENCE_GENOME = None diff --git a/tests/integration/test_bam_cigar.py b/tests/test_mavis/bam/test_bam_cigar.py similarity index 99% rename from tests/integration/test_bam_cigar.py rename to tests/test_mavis/bam/test_bam_cigar.py index 1d0c49d8..11fd7b6a 100644 --- a/tests/integration/test_bam_cigar.py +++ b/tests/test_mavis/bam/test_bam_cigar.py @@ -21,8 +21,8 @@ from mavis.bam.read import SamRead from mavis.constants import CIGAR -from ..util import get_data -from . import MockObject, MockRead +from ...util import get_data +from ..mock import MockObject, MockRead REFERENCE_GENOME = None diff --git a/tests/test_mavis/cluster/__init__.py b/tests/test_mavis/cluster/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/test_cluster.py b/tests/test_mavis/cluster/test_cluster.py similarity index 100% rename from tests/unit/test_cluster.py rename to tests/test_mavis/cluster/test_cluster.py diff --git a/tests/integration/test_cluster.py b/tests/test_mavis/cluster/test_cluster2.py similarity index 99% rename from tests/integration/test_cluster.py rename to tests/test_mavis/cluster/test_cluster2.py index 3434c62b..0e8aa0a0 100644 --- a/tests/integration/test_cluster.py +++ b/tests/test_mavis/cluster/test_cluster2.py @@ -6,9 +6,7 @@ from mavis.interval import Interval from mavis.util import read_bpp_from_input_file - -from ..util import get_data - +from ...util import get_data FULL_BASE_EVENTS = get_data('mock_sv_events.tsv') CLUSTERED_EVENTS = get_data('clustering_input.tab') diff --git a/tests/test_mavis/convert/__init__.py b/tests/test_mavis/convert/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/end_to_end/test_convert.py b/tests/test_mavis/convert/test_convert.py similarity index 98% rename from tests/end_to_end/test_convert.py rename to tests/test_mavis/convert/test_convert.py index 
0e23328d..4470bc66 100644 --- a/tests/end_to_end/test_convert.py +++ b/tests/test_mavis/convert/test_convert.py @@ -6,12 +6,12 @@ from unittest.mock import patch from mavis.constants import ORIENT, SVTYPE +from mavis.convert import SUPPORTED_TOOL from mavis.main import main -from mavis.tools import SUPPORTED_TOOL from mavis.util import read_bpp_from_input_file from mavis_config.constants import SUBCOMMAND -from ..util import get_data, glob_exists +from ...util import get_data, glob_exists TEMP_OUTPUT = None diff --git a/tests/unit/test_tool.py b/tests/test_mavis/convert/test_tool.py similarity index 99% rename from tests/unit/test_tool.py rename to tests/test_mavis/convert/test_tool.py index 96531588..a6371452 100644 --- a/tests/unit/test_tool.py +++ b/tests/test_mavis/convert/test_tool.py @@ -2,11 +2,11 @@ import pytest from mavis.constants import COLUMNS, ORIENT, STRAND, SVTYPE -from mavis.tools import SUPPORTED_TOOL, _convert_tool_row, _parse_transabyss -from mavis.tools.vcf import convert_record as _parse_vcf_record -from mavis.tools.vcf import parse_bnd_alt as _parse_bnd_alt +from mavis.convert import SUPPORTED_TOOL, _convert_tool_row, _parse_transabyss +from mavis.convert.vcf import convert_record as _parse_vcf_record +from mavis.convert.vcf import parse_bnd_alt as _parse_bnd_alt -from .mock import Mock +from ..mock import Mock class TestDelly: diff --git a/tests/unit/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py similarity index 90% rename from tests/unit/test_tools_vcf.py rename to tests/test_mavis/convert/test_tools_vcf.py index 2036e656..f846d41a 100644 --- a/tests/unit/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -1,7 +1,7 @@ -from mavis.tools import SUPPORTED_TOOL, _convert_tool_row -from mavis.tools.vcf import VcfInfoType, VcfRecordType, convert_record, pandas_vcf +from mavis.convert import SUPPORTED_TOOL, _convert_tool_row +from mavis.convert.vcf import VcfInfoType, VcfRecordType, convert_record, pandas_vcf 
-from ..util import get_data +from ...util import get_data def test_read_vcf(): diff --git a/tests/test_mavis/illustrate/__init__.py b/tests/test_mavis/illustrate/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/test_illustrate.py b/tests/test_mavis/illustrate/test_illustrate.py similarity index 100% rename from tests/unit/test_illustrate.py rename to tests/test_mavis/illustrate/test_illustrate.py diff --git a/tests/integration/test_illustrate.py b/tests/test_mavis/illustrate/test_illustrate2.py similarity index 99% rename from tests/integration/test_illustrate.py rename to tests/test_mavis/illustrate/test_illustrate2.py index 0ce2bdae..d6bd1bbf 100644 --- a/tests/integration/test_illustrate.py +++ b/tests/test_mavis/illustrate/test_illustrate2.py @@ -20,8 +20,8 @@ from mavis.interval import Interval from svgwrite import Drawing -from ..util import get_data -from . import OUTPUT_SVG, MockObject, MockString, build_transcript +from ...util import get_data +from ..mock import OUTPUT_SVG, MockObject, MockString, build_transcript TEMPLATE_METADATA = None DEFAULTS.domain_name_regex_filter = r'.*' diff --git a/tests/integration/__init__.py b/tests/test_mavis/mock.py similarity index 87% rename from tests/integration/__init__.py rename to tests/test_mavis/mock.py index b71134a1..6e91f96d 100644 --- a/tests/integration/__init__.py +++ b/tests/test_mavis/mock.py @@ -1,13 +1,13 @@ import os +import types from mavis.align import query_coverage_interval -from mavis.annotate.genomic import Transcript, PreTranscript from mavis.annotate.file_io import load_annotations, load_reference_genome +from mavis.annotate.genomic import PreTranscript, Transcript from mavis.annotate.protein import Translation from mavis.constants import CIGAR, NA_MAPPING_QUALITY -from ..util import DATA_DIR - +from ..util import get_data ARGUMENT_ERROR = 2 @@ -16,6 +16,50 @@ _EXAMPLE_GENES = None +class Mock: + def __init__(self, **kwargs): + for attr, val in kwargs.items(): + 
setattr(self, attr, val) + + def bind_method(self, **kwargs): + for attr, val in kwargs.items(): + val = types.MethodType(val, self) # bind the method to self + setattr(self, attr, val) + + def add_attr(self, attr, val): + setattr(self, attr, val) + + def __contains__(self, item): + if hasattr(self, item): + return True + return False + + +class MockFunction: + def __init__(self, return_value): + self.return_value = return_value + + def __call__(self, *pos, **kwargs): + return self.return_value + + +class MockLongString: + def __init__(self, string, offset): + self.string = string + self.offset = offset + + def __len__(self): + return len(self.string) + self.offset + + def __getitem__(self, index): + if not isinstance(index, slice): + index = slice(index, index + 1) + index = slice(index.start - self.offset, index.stop - self.offset, index.step) + if index.start < 0: + raise NotImplementedError('string portion not given') + return self.string[index] + + def get_example_genes(): global _EXAMPLE_GENES if _EXAMPLE_GENES is None: @@ -25,8 +69,8 @@ def get_example_genes(): def set_example_genes(): result = {} - genes = load_annotations(os.path.join(DATA_DIR, 'example_genes.json')) - seqs = load_reference_genome(os.path.join(DATA_DIR, 'example_genes.fa')) + genes = load_annotations(get_data('example_genes.json')) + seqs = load_reference_genome(get_data('example_genes.fa')) for chr_genes in genes.values(): for gene in chr_genes: if gene.name in seqs: @@ -223,23 +267,6 @@ def __getitem__(self, index): return self.char -class MockLongString(str): - def __new__(cls, *args, offset=0, **kw): - s = str.__new__(cls, *args, **kw) - setattr(s, 'offset', offset) - return s - - def __getitem__(self, index): - if isinstance(index, slice): - index = slice(index.start - self.offset, index.stop - self.offset, index.step) - else: - index -= self.offset - return str.__getitem__(self, index) - - def __len__(self): - return self.offset + str.__len__(self) - - def mock_read_pair(mock1, 
mock2): if mock1.reference_id != mock2.reference_id: mock1.template_length = 0 diff --git a/tests/test_mavis/pairing/__init__.py b/tests/test_mavis/pairing/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/test_pairing.py b/tests/test_mavis/pairing/test_pairing.py similarity index 100% rename from tests/integration/test_pairing.py rename to tests/test_mavis/pairing/test_pairing.py diff --git a/tests/test_mavis/summary/__init__.py b/tests/test_mavis/summary/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/test_summary.py b/tests/test_mavis/summary/test_summary.py similarity index 99% rename from tests/unit/test_summary.py rename to tests/test_mavis/summary/test_summary.py index 3e2a9efc..b66a8134 100644 --- a/tests/unit/test_summary.py +++ b/tests/test_mavis/summary/test_summary.py @@ -3,7 +3,7 @@ from mavis.constants import CALL_METHOD, COLUMNS, PROTOCOL, STRAND, SVTYPE from mavis.summary.summary import filter_by_annotations -from ..util import todo +from ...util import todo @pytest.fixture diff --git a/tests/integration/test_align.py b/tests/test_mavis/test_align.py similarity index 99% rename from tests/integration/test_align.py rename to tests/test_mavis/test_align.py index 0b3f556e..790a3d09 100644 --- a/tests/integration/test_align.py +++ b/tests/test_mavis/test_align.py @@ -1,4 +1,3 @@ -import shutil from unittest import mock import mavis.bam.cigar as _cigar @@ -14,7 +13,7 @@ from mavis_config import DEFAULTS from ..util import blat_only, bwa_only, get_data -from . 
import MockLongString, MockObject, MockRead +from .mock import MockLongString, MockObject, MockRead REFERENCE_GENOME = None diff --git a/tests/integration/test_args.py b/tests/test_mavis/test_args.py similarity index 100% rename from tests/integration/test_args.py rename to tests/test_mavis/test_args.py diff --git a/tests/unit/test_assemble.py b/tests/test_mavis/test_assemble.py similarity index 97% rename from tests/unit/test_assemble.py rename to tests/test_mavis/test_assemble.py index 73b3c6bf..52711a6c 100644 --- a/tests/unit/test_assemble.py +++ b/tests/test_mavis/test_assemble.py @@ -1,14 +1,11 @@ import itertools -import os import random import pytest from mavis.assemble import Contig, DeBruijnGraph, assemble, filter_contigs, kmers from mavis.constants import DNA_ALPHABET -from ..util import long_running_test - -DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') +from ..util import get_data, long_running_test class TestModule: @@ -184,7 +181,7 @@ def test_trim_noncutting_paths_by_freq_degree_stop(self): @pytest.fixture def assembly_sequences(): # load the sequences - with open(os.path.join(DATA_DIR, 'test_assembly_sequences.txt')) as fh: + with open(get_data('test_assembly_sequences.txt')) as fh: seq = [i.strip() for i in fh.readlines()] return seq diff --git a/tests/integration/test_assemble.py b/tests/test_mavis/test_assemble2.py similarity index 99% rename from tests/integration/test_assemble.py rename to tests/test_mavis/test_assemble2.py index 07e22b64..8baa718c 100644 --- a/tests/integration/test_assemble.py +++ b/tests/test_mavis/test_assemble2.py @@ -8,7 +8,7 @@ from mavis_config import DEFAULTS from ..util import get_data, long_running_test -from . 
import MockObject +from .mock import MockObject class TestFilterContigs: diff --git a/tests/unit/test_bam.py b/tests/test_mavis/test_bam.py similarity index 100% rename from tests/unit/test_bam.py rename to tests/test_mavis/test_bam.py diff --git a/tests/unit/test_blat.py b/tests/test_mavis/test_blat.py similarity index 100% rename from tests/unit/test_blat.py rename to tests/test_mavis/test_blat.py diff --git a/tests/integration/test_blat.py b/tests/test_mavis/test_blat2.py similarity index 99% rename from tests/integration/test_blat.py rename to tests/test_mavis/test_blat2.py index 10fe8320..254f6a5d 100644 --- a/tests/integration/test_blat.py +++ b/tests/test_mavis/test_blat2.py @@ -9,7 +9,7 @@ from mavis.interval import Interval from ..util import get_data -from . import MockBamFileHandle, MockLongString, MockObject +from .mock import MockBamFileHandle, MockLongString, MockObject REFERENCE_GENOME = None diff --git a/tests/unit/test_breakpoint.py b/tests/test_mavis/test_breakpoint.py similarity index 100% rename from tests/unit/test_breakpoint.py rename to tests/test_mavis/test_breakpoint.py diff --git a/tests/integration/test_breakpoint.py b/tests/test_mavis/test_breakpoint2.py similarity index 99% rename from tests/integration/test_breakpoint.py rename to tests/test_mavis/test_breakpoint2.py index f6e0b3bb..134a8455 100644 --- a/tests/integration/test_breakpoint.py +++ b/tests/test_mavis/test_breakpoint2.py @@ -8,7 +8,7 @@ from mavis.validate.evidence import TranscriptomeEvidence from ..util import get_data -from . 
import MockObject, get_example_genes +from .mock import MockObject, get_example_genes REFERENCE_GENOME = None REF_CHR = 'fake' diff --git a/tests/unit/test_constants.py b/tests/test_mavis/test_constants.py similarity index 100% rename from tests/unit/test_constants.py rename to tests/test_mavis/test_constants.py diff --git a/tests/end_to_end/test_help.py b/tests/test_mavis/test_help.py similarity index 100% rename from tests/end_to_end/test_help.py rename to tests/test_mavis/test_help.py diff --git a/tests/unit/test_interval.py b/tests/test_mavis/test_interval.py similarity index 100% rename from tests/unit/test_interval.py rename to tests/test_mavis/test_interval.py diff --git a/tests/end_to_end/test_overlay.py b/tests/test_mavis/test_overlay.py similarity index 100% rename from tests/end_to_end/test_overlay.py rename to tests/test_mavis/test_overlay.py diff --git a/tests/unit/test_util.py b/tests/test_mavis/test_util.py similarity index 100% rename from tests/unit/test_util.py rename to tests/test_mavis/test_util.py diff --git a/tests/test_mavis/validate/__init__.py b/tests/test_mavis/validate/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/test_validate_call.py b/tests/test_mavis/validate/test_call.py similarity index 99% rename from tests/integration/test_validate_call.py rename to tests/test_mavis/validate/test_call.py index 23d53e37..b959b6d5 100644 --- a/tests/integration/test_validate_call.py +++ b/tests/test_mavis/validate/test_call.py @@ -15,8 +15,8 @@ from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence, TranscriptomeEvidence -from ..util import get_data, todo -from . 
import MockBamFileHandle, MockLongString, MockRead, get_example_genes, mock_read_pair +from ...util import get_data, todo +from ..mock import MockBamFileHandle, MockLongString, MockRead, get_example_genes, mock_read_pair REFERENCE_GENOME = None diff --git a/tests/integration/test_validate_evidence.py b/tests/test_mavis/validate/test_evidence.py similarity index 99% rename from tests/integration/test_validate_evidence.py rename to tests/test_mavis/validate/test_evidence.py index 1cab995a..0c6c1420 100644 --- a/tests/integration/test_validate_evidence.py +++ b/tests/test_mavis/validate/test_evidence.py @@ -13,7 +13,7 @@ from mavis.validate.evidence import GenomeEvidence, TranscriptomeEvidence from mavis_config import DEFAULTS -from . import MockBamFileHandle, MockObject, MockRead, mock_read_pair +from ..mock import MockBamFileHandle, MockObject, MockRead, mock_read_pair REFERENCE_GENOME = None diff --git a/tests/unit/test_validate.py b/tests/test_mavis/validate/test_validate.py similarity index 98% rename from tests/unit/test_validate.py rename to tests/test_mavis/validate/test_validate.py index 560ca909..f26fde4a 100644 --- a/tests/unit/test_validate.py +++ b/tests/test_mavis/validate/test_validate.py @@ -3,7 +3,7 @@ from mavis.validate.base import Evidence from mavis.validate.call import _call_interval_by_flanking_coverage -from .mock import Mock +from ..mock import Mock class CallIntervalByFlankingCoverage: diff --git a/tests/integration/test_validate.py b/tests/test_mavis/validate/test_validate2.py similarity index 99% rename from tests/integration/test_validate.py rename to tests/test_mavis/validate/test_validate2.py index e29a063f..0ec4278d 100644 --- a/tests/integration/test_validate.py +++ b/tests/test_mavis/validate/test_validate2.py @@ -9,8 +9,8 @@ from mavis.validate.evidence import GenomeEvidence from mavis_config import DEFAULTS -from ..util import get_data, long_running_test -from . 
import MockLongString, MockObject, MockRead, mock_read_pair +from ...util import get_data, long_running_test +from ..mock import MockLongString, MockObject, MockRead, mock_read_pair REFERENCE_GENOME = None diff --git a/tests/test_tools/__init__.py b/tests/test_tools/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/tools/data/Homo_sapiens.GRCh38.kras.gff3 b/tests/test_tools/data/Homo_sapiens.GRCh38.kras.gff3 similarity index 100% rename from tests/tools/data/Homo_sapiens.GRCh38.kras.gff3 rename to tests/test_tools/data/Homo_sapiens.GRCh38.kras.gff3 diff --git a/tests/tools/data/Homo_sapiens.GRCh38.kras.gff3.json b/tests/test_tools/data/Homo_sapiens.GRCh38.kras.gff3.json similarity index 100% rename from tests/tools/data/Homo_sapiens.GRCh38.kras.gff3.json rename to tests/test_tools/data/Homo_sapiens.GRCh38.kras.gff3.json diff --git a/tests/tools/data/Homo_sapiens.GRCh38.kras.gtf b/tests/test_tools/data/Homo_sapiens.GRCh38.kras.gtf similarity index 100% rename from tests/tools/data/Homo_sapiens.GRCh38.kras.gtf rename to tests/test_tools/data/Homo_sapiens.GRCh38.kras.gtf diff --git a/tests/tools/data/Homo_sapiens.GRCh38.kras.gtf.json b/tests/test_tools/data/Homo_sapiens.GRCh38.kras.gtf.json similarity index 100% rename from tests/tools/data/Homo_sapiens.GRCh38.kras.gtf.json rename to tests/test_tools/data/Homo_sapiens.GRCh38.kras.gtf.json diff --git a/tests/tools/data/K02718.1.gff3 b/tests/test_tools/data/K02718.1.gff3 similarity index 100% rename from tests/tools/data/K02718.1.gff3 rename to tests/test_tools/data/K02718.1.gff3 diff --git a/tests/tools/data/K02718.1.gff3.json b/tests/test_tools/data/K02718.1.gff3.json similarity index 100% rename from tests/tools/data/K02718.1.gff3.json rename to tests/test_tools/data/K02718.1.gff3.json diff --git a/tests/tools/data/K02718.1.gtf b/tests/test_tools/data/K02718.1.gtf similarity index 100% rename from tests/tools/data/K02718.1.gtf rename to tests/test_tools/data/K02718.1.gtf diff --git 
a/tests/tools/data/K02718.1.gtf.json b/tests/test_tools/data/K02718.1.gtf.json similarity index 100% rename from tests/tools/data/K02718.1.gtf.json rename to tests/test_tools/data/K02718.1.gtf.json diff --git a/tests/tools/data/ensembl69_hg19_annotations.kras.tab b/tests/test_tools/data/ensembl69_hg19_annotations.kras.tab similarity index 100% rename from tests/tools/data/ensembl69_hg19_annotations.kras.tab rename to tests/test_tools/data/ensembl69_hg19_annotations.kras.tab diff --git a/tests/tools/data/ensembl69_hg19_annotations.kras.tab.json b/tests/test_tools/data/ensembl69_hg19_annotations.kras.tab.json similarity index 100% rename from tests/tools/data/ensembl69_hg19_annotations.kras.tab.json rename to tests/test_tools/data/ensembl69_hg19_annotations.kras.tab.json diff --git a/tests/tools/data/example_genes.v2.json b/tests/test_tools/data/example_genes.v2.json similarity index 100% rename from tests/tools/data/example_genes.v2.json rename to tests/test_tools/data/example_genes.v2.json diff --git a/tests/tools/data/example_genes.v3.json b/tests/test_tools/data/example_genes.v3.json similarity index 100% rename from tests/tools/data/example_genes.v3.json rename to tests/test_tools/data/example_genes.v3.json diff --git a/tests/tools/test_convert_annotations_format.py b/tests/test_tools/test_convert_annotations_format.py similarity index 99% rename from tests/tools/test_convert_annotations_format.py rename to tests/test_tools/test_convert_annotations_format.py index a42daadb..7e637ae9 100644 --- a/tests/tools/test_convert_annotations_format.py +++ b/tests/test_tools/test_convert_annotations_format.py @@ -2,7 +2,6 @@ import os import pytest - from tools.convert_annotations_format import ( convert_gff2_to_mavis, convert_gff3_to_mavis, diff --git a/tests/end_to_end/test_ref_alt_count.py b/tests/test_tools/test_ref_alt_count.py similarity index 100% rename from tests/end_to_end/test_ref_alt_count.py rename to tests/test_tools/test_ref_alt_count.py diff --git 
a/tests/unit/mock.py b/tests/unit/mock.py deleted file mode 100644 index a1311acb..00000000 --- a/tests/unit/mock.py +++ /dev/null @@ -1,45 +0,0 @@ -import types - - -class Mock: - def __init__(self, **kwargs): - for attr, val in kwargs.items(): - setattr(self, attr, val) - - def bind_method(self, **kwargs): - for attr, val in kwargs.items(): - val = types.MethodType(val, self) # bind the method to self - setattr(self, attr, val) - - def add_attr(self, attr, val): - setattr(self, attr, val) - - def __contains__(self, item): - if hasattr(self, item): - return True - return False - - -class MockFunction: - def __init__(self, return_value): - self.return_value = return_value - - def __call__(self, *pos, **kwargs): - return self.return_value - - -class MockLongString: - def __init__(self, string, offset): - self.string = string - self.offset = offset - - def __len__(self): - return len(self.string) + self.offset - - def __getitem__(self, index): - if not isinstance(index, slice): - index = slice(index, index + 1) - index = slice(index.start - self.offset, index.stop - self.offset, index.step) - if index.start < 0: - raise NotImplementedError('string portion not given') - return self.string[index] From fc0a236e2204461ee023804ba99598c1b59f473b Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 9 Feb 2022 20:30:09 -0800 Subject: [PATCH 126/137] Specify page order for docs --- docs/background/.pages | 3 +++ docs/configuration/general.md | 2 +- docs/tutorials/.pages | 4 ++++ mkdocs.yml | 14 ++++++++++++++ setup.cfg | 1 + 5 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 docs/background/.pages create mode 100644 docs/tutorials/.pages diff --git a/docs/background/.pages b/docs/background/.pages new file mode 100644 index 00000000..278d1410 --- /dev/null +++ b/docs/background/.pages @@ -0,0 +1,3 @@ +nav: + - theory.md + - citations.md diff --git a/docs/configuration/general.md b/docs/configuration/general.md index 176aef2e..dea8801f 100644 --- 
a/docs/configuration/general.md +++ b/docs/configuration/general.md @@ -1,6 +1,6 @@ # Getting Started -An exhaustive list of the various configurable settings can be found [here](../settings) +An exhaustive list of the various configurable settings can be found [here](../settings). Alternatively you can view them through the [online schema explorer](https://json-schema.app/view?url=https://raw.githubusercontent.com/bcgsc/mavis_config/master/src/mavis_config/config.json) ## Pipeline Configuration File diff --git a/docs/tutorials/.pages b/docs/tutorials/.pages new file mode 100644 index 00000000..b9f03d9d --- /dev/null +++ b/docs/tutorials/.pages @@ -0,0 +1,4 @@ +nav: + - mini.md + - full.md + - ... diff --git a/mkdocs.yml b/mkdocs.yml index 736658d8..79900dff 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -12,7 +12,21 @@ markdown_extensions: - markdown_include.include: base_path: docs extra_css: [extra.css] +nav: + - index.md + - install.md + - migrating.md + - ... | background/**.md + - ... | inputs/**.md + - ... | outputs/**.md + - ... | configuration/**.md + - ... | tutorials/**.md + - development.md + - ... 
+ - glossary.md + plugins: + - awesome-pages - mkdocs-simple-hooks: hooks: on_pre_build: "docs.hooks:build_package_docs" diff --git a/setup.cfg b/setup.cfg index 38a05509..c6adbdf0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -61,6 +61,7 @@ doc = mkdocs-material==5.4.0 markdown-include mkdocs-simple-hooks==0.1.2 + mkdocs-awesome-pages-plugin==22.0.3 test = timeout-decorator>=0.3.3 coverage>=4.2 From 192c267e9cca6dbf8caabdf4bdbd45ccf104c689 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 10 Feb 2022 11:45:06 -0800 Subject: [PATCH 127/137] Fix weird import linting --- tests/test_mavis/annotate/test_annotate2.py | 19 ++++++++++--------- tests/test_mavis/annotate/test_splicing.py | 3 +-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/test_mavis/annotate/test_annotate2.py b/tests/test_mavis/annotate/test_annotate2.py index 0ab63ddc..df45e307 100644 --- a/tests/test_mavis/annotate/test_annotate2.py +++ b/tests/test_mavis/annotate/test_annotate2.py @@ -5,16 +5,17 @@ from mavis.annotate.base import BioInterval, ReferenceName from mavis.annotate.file_io import load_annotations, load_reference_genome from mavis.annotate.fusion import FusionTranscript, determine_prime -from mavis.annotate.genomic import (Exon, Gene, PreTranscript, Template, - Transcript) -from mavis.annotate.protein import (Domain, DomainRegion, Translation, - calculate_orf, translate) -from mavis.annotate.variant import (Annotation, _gather_annotations, - _gather_breakpoint_annotations, - annotate_events, overlapping_transcripts) +from mavis.annotate.genomic import Exon, Gene, PreTranscript, Template, Transcript +from mavis.annotate.protein import Domain, DomainRegion, Translation, calculate_orf, translate +from mavis.annotate.variant import ( + Annotation, + _gather_annotations, + _gather_breakpoint_annotations, + annotate_events, + overlapping_transcripts, +) from mavis.breakpoint import Breakpoint, BreakpointPair -from mavis.constants import (ORIENT, PRIME, PROTOCOL, STRAND, 
SVTYPE, - reverse_complement) +from mavis.constants import ORIENT, PRIME, PROTOCOL, STRAND, SVTYPE, reverse_complement from mavis.error import NotSpecifiedError from mavis.interval import Interval diff --git a/tests/test_mavis/annotate/test_splicing.py b/tests/test_mavis/annotate/test_splicing.py index 734cbf70..79978d6c 100644 --- a/tests/test_mavis/annotate/test_splicing.py +++ b/tests/test_mavis/annotate/test_splicing.py @@ -6,8 +6,7 @@ from mavis.annotate.splicing import predict_splice_sites from mavis.annotate.variant import annotate_events from mavis.breakpoint import Breakpoint, BreakpointPair -from mavis.constants import (PROTOCOL, SPLICE_TYPE, STRAND, SVTYPE, - reverse_complement) +from mavis.constants import PROTOCOL, SPLICE_TYPE, STRAND, SVTYPE, reverse_complement from mavis.interval import Interval from ..mock import MockLongString, MockObject, get_example_genes From 08aed1068abe379fbda70e13a716148985f357e0 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 10 Feb 2022 13:21:36 -0800 Subject: [PATCH 128/137] Fix name of extras in rtd config --- .readthedocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index aa35ae55..12ffe7f3 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -20,4 +20,4 @@ python: - method: pip path: . 
extra_requirements: - - docs + - doc From 8c252b2ea2f06782216ee33e0647d9f818af2c0d Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 10 Feb 2022 13:24:29 -0800 Subject: [PATCH 129/137] Relax version requirement --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index c6adbdf0..905b1bae 100644 --- a/setup.cfg +++ b/setup.cfg @@ -61,7 +61,7 @@ doc = mkdocs-material==5.4.0 markdown-include mkdocs-simple-hooks==0.1.2 - mkdocs-awesome-pages-plugin==22.0.3 + mkdocs-awesome-pages-plugin test = timeout-decorator>=0.3.3 coverage>=4.2 From addc8470ec8b000b5aa58a1964953a397f67c05d Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 10 Feb 2022 13:26:52 -0800 Subject: [PATCH 130/137] Relax more versions --- setup.cfg | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/setup.cfg b/setup.cfg index 905b1bae..ed47bfd5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -56,11 +56,11 @@ where = src [options.extras_require] doc = - mkdocs==1.1.2 + mkdocs>=1.1.2 markdown-refdocs - mkdocs-material==5.4.0 + mkdocs-material>=5.4.0 markdown-include - mkdocs-simple-hooks==0.1.2 + mkdocs-simple-hooks>=0.1.2 mkdocs-awesome-pages-plugin test = timeout-decorator>=0.3.3 @@ -79,11 +79,11 @@ dev = pytest pytest-cov pytest-xdist - mkdocs==1.1.2 + mkdocs>=1.1.2,<2 markdown-refdocs - mkdocs-material==5.4.0 + mkdocs-material>=5.4.0 markdown-include - mkdocs-simple-hooks==0.1.2 + mkdocs-simple-hooks>=0.1.2 types-setuptools>=57.4.7, <58 deploy = twine From e5c6a59ae7e42b57dd73b90f72783ec62cb609f9 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 16 Feb 2022 15:11:39 -0800 Subject: [PATCH 131/137] Make template metadata file optional resolves: #310 --- src/mavis/annotate/main.py | 11 +++++++++-- tests/mini-tutorial.annotate_only.config.json | 3 --- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/mavis/annotate/main.py b/src/mavis/annotate/main.py index ed426a04..1d6fd503 100644 --- 
a/src/mavis/annotate/main.py +++ b/src/mavis/annotate/main.py @@ -10,11 +10,12 @@ from ..error import DrawingFitError, NotSpecifiedError from ..illustrate.constants import DiagramSettings from ..illustrate.diagram import draw_sv_summary_diagram +from ..types import ReferenceGenome from ..util import generate_complete_stamp, logger, mkdirp, read_inputs from .constants import PASS_FILENAME from .file_io import ReferenceFile from .fusion import determine_prime -from .genomic import PreTranscript +from .genomic import PreTranscript, Template from .variant import ( annotate_events, call_protein_indel, @@ -30,7 +31,13 @@ } -def draw(drawing_config, ann, reference_genome, template_metadata, drawings_directory): +def draw( + drawing_config: DiagramSettings, + ann, + reference_genome: ReferenceGenome, + template_metadata: Dict[str, Template], + drawings_directory: str, +): """ produces the svg diagram and json legend for a given annotation """ diff --git a/tests/mini-tutorial.annotate_only.config.json b/tests/mini-tutorial.annotate_only.config.json index b270c7dc..545b8b78 100644 --- a/tests/mini-tutorial.annotate_only.config.json +++ b/tests/mini-tutorial.annotate_only.config.json @@ -45,8 +45,5 @@ ], "reference.reference_genome": [ "tests/data/mock_reference_genome.fa" - ], - "reference.template_metadata": [ - "tests/data/cytoBand.txt" ] } From 05bc7723c3483698f2e26c926d26ef4289c15c76 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 16 Feb 2022 15:45:17 -0800 Subject: [PATCH 132/137] set default arrays for reference files in config --- src/mavis/annotate/file_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mavis/annotate/file_io.py b/src/mavis/annotate/file_io.py index 7220fc9c..b5dcc9e5 100644 --- a/src/mavis/annotate/file_io.py +++ b/src/mavis/annotate/file_io.py @@ -428,4 +428,4 @@ def load(self, ignore_cache=False, verbose=True): @classmethod def load_from_config(cls, config, file_type: str, **kwargs): - return 
ReferenceFile(file_type, *config[f'reference.{file_type}'], **kwargs) + return ReferenceFile(file_type, *config.get(f'reference.{file_type}', []), **kwargs) From 0ccd6ef671d1109101170ee90da319597159c610 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Wed, 16 Feb 2022 21:13:08 -0800 Subject: [PATCH 133/137] add wheel upgrade to document --- docs/install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/install.md b/docs/install.md index b3468f7a..b9343c48 100644 --- a/docs/install.md +++ b/docs/install.md @@ -16,7 +16,7 @@ The simplest way to use MAVIS is via Singularity. The MAVIS docker container use by singularity will take care of installing the aligner as well. ```bash -pip install -U setuptools pip +pip install -U setuptools pip wheel pip install mavis_config # also installs snakemake ``` From 6a3d15dc34340fcc29699a8bd5c2787d95ac2be7 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Sat, 19 Feb 2022 12:16:27 -0800 Subject: [PATCH 134/137] Enforce unique transcript IDs --- src/tools/convert_annotations_format.py | 37 +- tests/data/mock_reference_annotations2.json | 155 +- tests/test_tools/data/viral.gtf | 447 +++ tests/test_tools/data/viral.gtf.json | 2683 +++++++++++++++++ .../test_convert_annotations_format.py | 1 + 5 files changed, 3317 insertions(+), 6 deletions(-) create mode 100644 tests/test_tools/data/viral.gtf create mode 100644 tests/test_tools/data/viral.gtf.json diff --git a/src/tools/convert_annotations_format.py b/src/tools/convert_annotations_format.py index 775b6d1d..eceba67a 100644 --- a/src/tools/convert_annotations_format.py +++ b/src/tools/convert_annotations_format.py @@ -464,6 +464,26 @@ def validate_gff_coordinates(nodes_df, links_df): raise ValueError(f'{errors.shape[0]} entries with impossible coordinates') +def enforce_uniq_transcript_ids(input_df) -> pd.DataFrame: + df = input_df.copy() + duplicates = df[df.type == 'transcript'].drop_duplicates(['seqid', 'parent_id', 'feature_id']) + + if duplicates.shape[0] 
== duplicates.feature_id.nunique(): + return df + + # there are some non-unique transcript IDs, make them all pre-pend the seqid + df.loc[df.type == 'transcript', 'feature_id'] = df.seqid + GFF_ID_DELIMITER + df.feature_id + df.loc[df.parent_type == 'transcript', 'parent_id'] = df.seqid + GFF_ID_DELIMITER + df.parent_id + duplicates = df[df.type == 'transcript'].drop_duplicates(['seqid', 'parent_id', 'feature_id']) + + if duplicates.shape[0] == duplicates.feature_id.nunique(): + return df.copy() + + raise ValueError( + f'Unable to enforce unique transcript IDs: ({duplicates.shape[0]},{duplicates.feature_id.nunique()})' + ) + + def convert_pandas_gff_to_mavis(df) -> Dict: df['error'] = '' df.loc[~df.type.isin(GFF_ALL_FEATURES), 'error'] = 'unrecognized type ' + df.type @@ -530,8 +550,13 @@ def simplify_type(t): nodes_df, links_df = fix_orphan_elements(nodes_df, links_df) nodes_df, links_df = insert_missing_transcripts(nodes_df, links_df) validate_gff_coordinates(nodes_df, links_df) + df = nodes_df.merge( + links_df[GFF_KEY_COLS + ['parent_type', 'parent_id']].drop_duplicates(), + how='outer', + on=GFF_KEY_COLS, + ).fillna('') - df = nodes_df.merge(links_df, how='outer', on=GFF_KEY_COLS).fillna('') + df = enforce_uniq_transcript_ids(df) def feature_key(row, parent=False): if not parent: @@ -783,6 +808,8 @@ def split_attributes(row): df.loc[df.type == 'gene', 'Name'] = df.gene_name df.loc[df.type == 'transcript', 'Name'] = df.transcript_name df['strand'] = df.strand.fillna('') + df['gene_id'] = df.gene_id.astype(str) + df.loc[df.gene_id.str.startswith('unassigned_gene_'), 'gene_id'] = '' df['Parent'] = '' df.loc[(df.type == 'transcript') & (df.gene_id != ''), 'Parent'] = 'gene:' + df.gene_id @@ -868,8 +895,8 @@ def main(): parser.add_argument('--input_type', default='v2', choices=['v2-tab', 'v2-json', 'gff3', 'gtf']) parser.add_argument('output', help='path to the JSON output file') parser.add_argument( - '--keep_alt', - help='do not filter out chromosome/seqid names 
starting with GL or KI', + '--filter_alt', + help='filter out chromosome/seqid names starting with GL or KI', action='store_true', default=False, ) @@ -889,9 +916,9 @@ def main(): elif args.input_type == 'v2-json': annotations = convert_mavis_json_2to3(args.input) elif args.input_type == 'gtf': - annotations = convert_gff2_to_mavis(args.input, not args.keep_alt) + annotations = convert_gff2_to_mavis(args.input, args.filter_alt) else: - annotations = convert_gff3_to_mavis(args.input, not args.keep_alt) + annotations = convert_gff3_to_mavis(args.input, args.filter_alt) logging.info(f'writing: {args.output}') with open(args.output, 'w') as fh: diff --git a/tests/data/mock_reference_annotations2.json b/tests/data/mock_reference_annotations2.json index f1ef1c50..065b1221 100644 --- a/tests/data/mock_reference_annotations2.json +++ b/tests/data/mock_reference_annotations2.json @@ -1 +1,154 @@ -{"genes": [{"aliases": [], "chr": "fake", "end": 200, "name": "GENE-A", "start": 100, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": null, "cdna_coding_start": null, "domains": [], "end": 200, "exons": [], "is_best_transcript": true, "name": "TRANSCRIPT-A", "start": 100}]}, {"aliases": [], "chr": "fake", "end": 350, "name": "GENE-B", "start": 250, "strand": "-", "transcripts": [{"aliases": [], "cdna_coding_end": null, "cdna_coding_start": null, "domains": [], "end": 350, "exons": [], "is_best_transcript": true, "name": "TRANSCRIPT-B", "start": 250}]}, {"aliases": [], "chr": "fake", "end": 400, "name": "GENE-C", "start": 300, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": null, "cdna_coding_start": null, "domains": [], "end": 400, "exons": [], "is_best_transcript": true, "name": "TRANSCRIPT-C", "start": 300}]}, {"aliases": [], "chr": "fake", "end": 550, "name": "GENE-D", "start": 450, "strand": "-", "transcripts": [{"aliases": [], "cdna_coding_end": null, "cdna_coding_start": null, "domains": [], "end": 550, "exons": [], "is_best_transcript": 
true, "name": "TRANSCRIPT-D", "start": 450}]}, {"aliases": [], "chr": "fake", "end": 600, "name": "GENE-E", "start": 500, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": null, "cdna_coding_start": null, "domains": [], "end": 600, "exons": [], "is_best_transcript": true, "name": "TRANSCRIPT-E", "start": 500}]}, {"aliases": [], "chr": "fake", "end": 650, "name": "GENE-F", "start": 550, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": null, "cdna_coding_start": null, "domains": [], "end": 650, "exons": [], "is_best_transcript": true, "name": "TRANSCRIPT-E", "start": 550}]}]} \ No newline at end of file +{ + "genes": [ + { + "aliases": [ + ], + "chr": "fake", + "end": 200, + "name": "GENE-A", + "start": 100, + "strand": "+", + "transcripts": [ + { + "aliases": [ + ], + "cdna_coding_end": null, + "cdna_coding_start": null, + "domains": [ + ], + "end": 200, + "exons": [ + ], + "is_best_transcript": true, + "name": "TRANSCRIPT-A", + "start": 100 + } + ] + }, + { + "aliases": [ + ], + "chr": "fake", + "end": 350, + "name": "GENE-B", + "start": 250, + "strand": "-", + "transcripts": [ + { + "aliases": [ + ], + "cdna_coding_end": null, + "cdna_coding_start": null, + "domains": [ + ], + "end": 350, + "exons": [ + ], + "is_best_transcript": true, + "name": "TRANSCRIPT-B", + "start": 250 + } + ] + }, + { + "aliases": [ + ], + "chr": "fake", + "end": 400, + "name": "GENE-C", + "start": 300, + "strand": "+", + "transcripts": [ + { + "aliases": [ + ], + "cdna_coding_end": null, + "cdna_coding_start": null, + "domains": [ + ], + "end": 400, + "exons": [ + ], + "is_best_transcript": true, + "name": "TRANSCRIPT-C", + "start": 300 + } + ] + }, + { + "aliases": [ + ], + "chr": "fake", + "end": 550, + "name": "GENE-D", + "start": 450, + "strand": "-", + "transcripts": [ + { + "aliases": [ + ], + "cdna_coding_end": null, + "cdna_coding_start": null, + "domains": [ + ], + "end": 550, + "exons": [ + ], + "is_best_transcript": true, + "name": 
"TRANSCRIPT-D", + "start": 450 + } + ] + }, + { + "aliases": [ + ], + "chr": "fake", + "end": 600, + "name": "GENE-E", + "start": 500, + "strand": "+", + "transcripts": [ + { + "aliases": [ + ], + "cdna_coding_end": null, + "cdna_coding_start": null, + "domains": [ + ], + "end": 600, + "exons": [ + ], + "is_best_transcript": true, + "name": "TRANSCRIPT-E", + "start": 500 + } + ] + }, + { + "aliases": [ + ], + "chr": "fake", + "end": 650, + "name": "GENE-F", + "start": 550, + "strand": "+", + "transcripts": [ + { + "aliases": [ + ], + "cdna_coding_end": null, + "cdna_coding_start": null, + "domains": [ + ], + "end": 650, + "exons": [ + ], + "is_best_transcript": true, + "name": "TRANSCRIPT-F", + "start": 550 + } + ] + } + ] +} diff --git a/tests/test_tools/data/viral.gtf b/tests/test_tools/data/viral.gtf new file mode 100644 index 00000000..57c45450 --- /dev/null +++ b/tests/test_tools/data/viral.gtf @@ -0,0 +1,447 @@ +X74464.1 EMBL gene 200 646 . + . gene_id "E6"; transcript_id ""; gbkey "Gene"; gene "E6"; gene_biotype "protein_coding"; +X74464.1 EMBL CDS 200 643 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P36801"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P36801"; gbkey "CDS"; gene "E6"; note "alternative"; product "early protein"; protein_id "CAA52482.1"; exon_number "1"; +X74464.1 EMBL start_codon 200 202 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P36801"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P36801"; gbkey "CDS"; gene "E6"; note "alternative"; product "early protein"; protein_id "CAA52482.1"; exon_number "1"; +X74464.1 EMBL stop_codon 644 646 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P36801"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P36801"; gbkey "CDS"; gene "E6"; note "alternative"; product "early protein"; protein_id "CAA52482.1"; exon_number "1"; +X74464.1 EMBL CDS 221 643 . 
+ 0 gene_id "E6"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P36801"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P36801"; gbkey "CDS"; gene "E6"; product "early protein"; protein_id "CAA52483.1"; exon_number "1"; +X74464.1 EMBL start_codon 221 223 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P36801"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P36801"; gbkey "CDS"; gene "E6"; product "early protein"; protein_id "CAA52483.1"; exon_number "1"; +X74464.1 EMBL stop_codon 644 646 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P36801"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P36801"; gbkey "CDS"; gene "E6"; product "early protein"; protein_id "CAA52483.1"; exon_number "1"; +X74464.1 EMBL gene 643 924 . + . gene_id "E7"; transcript_id ""; gbkey "Gene"; gene "E7"; gene_biotype "protein_coding"; +X74464.1 EMBL CDS 643 921 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_3"; db_xref "GOA:P36817"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P36817"; gbkey "CDS"; gene "E7"; product "early protein"; protein_id "CAA52484.1"; exon_number "1"; +X74464.1 EMBL start_codon 643 645 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_3"; db_xref "GOA:P36817"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P36817"; gbkey "CDS"; gene "E7"; product "early protein"; protein_id "CAA52484.1"; exon_number "1"; +X74464.1 EMBL stop_codon 922 924 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_3"; db_xref "GOA:P36817"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P36817"; gbkey "CDS"; gene "E7"; product "early protein"; protein_id "CAA52484.1"; exon_number "1"; +X74464.1 EMBL gene 917 2734 . + . gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "protein_coding"; +X74464.1 EMBL CDS 917 2731 . 
+ 0 gene_id "E1"; transcript_id "unassigned_transcript_4"; db_xref "GOA:Q05111"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:Q05111"; gbkey "CDS"; gene "E1"; product "early protein"; protein_id "CAA52485.1"; exon_number "1"; +X74464.1 EMBL start_codon 917 919 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_4"; db_xref "GOA:Q05111"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:Q05111"; gbkey "CDS"; gene "E1"; product "early protein"; protein_id "CAA52485.1"; exon_number "1"; +X74464.1 EMBL stop_codon 2732 2734 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_4"; db_xref "GOA:Q05111"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:Q05111"; gbkey "CDS"; gene "E1"; product "early protein"; protein_id "CAA52485.1"; exon_number "1"; +X74464.1 EMBL gene 2676 4061 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "protein_coding"; +X74464.1 EMBL CDS 2676 4058 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_5"; db_xref "GOA:P36780"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/Swiss-Prot:P36780"; gbkey "CDS"; gene "E2"; product "early protein"; protein_id "CAA52486.1"; exon_number "1"; +X74464.1 EMBL start_codon 2676 2678 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_5"; db_xref "GOA:P36780"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/Swiss-Prot:P36780"; gbkey "CDS"; gene "E2"; product "early protein"; protein_id "CAA52486.1"; exon_number "1"; +X74464.1 EMBL stop_codon 4059 4061 . 
+ 0 gene_id "E2"; transcript_id "unassigned_transcript_5"; db_xref "GOA:P36780"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/Swiss-Prot:P36780"; gbkey "CDS"; gene "E2"; product "early protein"; protein_id "CAA52486.1"; exon_number "1"; +X74464.1 EMBL gene 4129 5730 . + . gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "protein_coding"; +X74464.1 EMBL CDS 4129 5727 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_6"; db_xref "GOA:P36746"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P36746"; gbkey "CDS"; gene "L2"; product "late protein"; protein_id "CAA52487.1"; exon_number "1"; +X74464.1 EMBL start_codon 4129 4131 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_6"; db_xref "GOA:P36746"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P36746"; gbkey "CDS"; gene "L2"; product "late protein"; protein_id "CAA52487.1"; exon_number "1"; +X74464.1 EMBL stop_codon 5728 5730 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_6"; db_xref "GOA:P36746"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P36746"; gbkey "CDS"; gene "L2"; product "late protein"; protein_id "CAA52487.1"; exon_number "1"; +X74464.1 EMBL gene 5745 7268 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; gene "L1"; gene_biotype "protein_coding"; +X74464.1 EMBL CDS 5745 7265 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_7"; db_xref "GOA:Q02480"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/Swiss-Prot:Q02480"; gbkey "CDS"; gene "L1"; product "late protein"; protein_id "CAA52488.1"; exon_number "1"; +X74464.1 EMBL start_codon 5745 5747 . 
+ 0 gene_id "L1"; transcript_id "unassigned_transcript_7"; db_xref "GOA:Q02480"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/Swiss-Prot:Q02480"; gbkey "CDS"; gene "L1"; product "late protein"; protein_id "CAA52488.1"; exon_number "1"; +X74464.1 EMBL stop_codon 7266 7268 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_7"; db_xref "GOA:Q02480"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/Swiss-Prot:Q02480"; gbkey "CDS"; gene "L1"; product "late protein"; protein_id "CAA52488.1"; exon_number "1"; +K02718.1 Genbank gene 83 559 . + . gene_id "E6"; transcript_id ""; gbkey "Gene"; gene "E6"; gene_biotype "protein_coding"; +K02718.1 Genbank CDS 83 556 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; note "E6 ORF from 65 to 559; putative"; product "transforming protein"; protein_id "AAA46939.1"; exon_number "1"; +K02718.1 Genbank start_codon 83 85 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; note "E6 ORF from 65 to 559; putative"; product "transforming protein"; protein_id "AAA46939.1"; exon_number "1"; +K02718.1 Genbank stop_codon 557 559 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; note "E6 ORF from 65 to 559; putative"; product "transforming protein"; protein_id "AAA46939.1"; exon_number "1"; +K02718.1 Genbank gene 562 858 . + . gene_id "E7"; transcript_id ""; gbkey "Gene"; gene "E7"; gene_biotype "protein_coding"; +K02718.1 Genbank CDS 562 855 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; note "E7 ORF from 544 to 858; putative"; product "transforming protein"; protein_id "AAA46940.1"; exon_number "1"; +K02718.1 Genbank start_codon 562 564 . 
+ 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; note "E7 ORF from 544 to 858; putative"; product "transforming protein"; protein_id "AAA46940.1"; exon_number "1"; +K02718.1 Genbank stop_codon 856 858 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; note "E7 ORF from 544 to 858; putative"; product "transforming protein"; protein_id "AAA46940.1"; exon_number "1"; +K02718.1 Genbank gene 865 1140 . + . gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "protein_coding"; part "1"; +K02718.1 Genbank gene 1140 2813 . + . gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "protein_coding"; part "2"; +K02718.1 Genbank CDS 865 1140 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "E1 interrupted ORF from 859 to 2813; putative"; product "replication protein"; protein_id "AAA46936.1"; exon_number "1"; +K02718.1 Genbank CDS 1140 2810 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "E1 interrupted ORF from 859 to 2813; putative"; product "replication protein"; protein_id "AAA46936.1"; exon_number "2"; +K02718.1 Genbank start_codon 865 867 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "E1 interrupted ORF from 859 to 2813; putative"; product "replication protein"; protein_id "AAA46936.1"; exon_number "1"; +K02718.1 Genbank stop_codon 2811 2813 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "E1 interrupted ORF from 859 to 2813; putative"; product "replication protein"; protein_id "AAA46936.1"; exon_number "2"; +K02718.1 Genbank gene 2755 3852 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "protein_coding"; +K02718.1 Genbank CDS 2755 3849 . 
+ 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; note "E2 ORF from 2725 to 3852; putative"; product "regulatory protein"; protein_id "AAA46941.1"; exon_number "1"; +K02718.1 Genbank start_codon 2755 2757 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; note "E2 ORF from 2725 to 3852; putative"; product "regulatory protein"; protein_id "AAA46941.1"; exon_number "1"; +K02718.1 Genbank stop_codon 3850 3852 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; note "E2 ORF from 2725 to 3852; putative"; product "regulatory protein"; protein_id "AAA46941.1"; exon_number "1"; +K02718.1 Genbank gene 3332 3619 . + . gene_id "E4"; transcript_id ""; gbkey "Gene"; gene "E4"; gene_biotype "protein_coding"; +K02718.1 Genbank CDS 3332 3616 . + 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; gene "E4"; partial "true"; product "AAA46937.1"; protein_id "AAA46937.1"; exon_number "1"; +K02718.1 Genbank stop_codon 3617 3619 . + 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; gene "E4"; partial "true"; product "AAA46937.1"; protein_id "AAA46937.1"; exon_number "1"; +K02718.1 Genbank gene 3863 4099 . + . gene_id "E5"; transcript_id ""; gbkey "Gene"; gene "E5"; gene_biotype "protein_coding"; +K02718.1 Genbank CDS 3863 4096 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; partial "true"; product "AAA46938.1"; protein_id "AAA46938.1"; exon_number "1"; +K02718.1 Genbank stop_codon 4097 4099 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; partial "true"; product "AAA46938.1"; protein_id "AAA46938.1"; exon_number "1"; +K02718.1 Genbank gene 4235 5656 . + . gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "protein_coding"; +K02718.1 Genbank CDS 4235 5653 . 
+ 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; note "L2 ORF from 4133 to 5656; putative"; product "minor capsid protein"; protein_id "AAA46942.1"; exon_number "1"; +K02718.1 Genbank start_codon 4235 4237 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; note "L2 ORF from 4133 to 5656; putative"; product "minor capsid protein"; protein_id "AAA46942.1"; exon_number "1"; +K02718.1 Genbank stop_codon 5654 5656 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; note "L2 ORF from 4133 to 5656; putative"; product "minor capsid protein"; protein_id "AAA46942.1"; exon_number "1"; +K02718.1 Genbank gene 5559 7154 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; gene "L1"; gene_biotype "protein_coding"; +K02718.1 Genbank CDS 5559 7151 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; note "L1 ORF from 5526 to 7154; putative"; product "major capsid protein"; protein_id "AAA46943.1"; exon_number "1"; +K02718.1 Genbank start_codon 5559 5561 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; note "L1 ORF from 5526 to 7154; putative"; product "major capsid protein"; protein_id "AAA46943.1"; exon_number "1"; +K02718.1 Genbank stop_codon 7152 7154 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; note "L1 ORF from 5526 to 7154; putative"; product "major capsid protein"; protein_id "AAA46943.1"; exon_number "1"; +X05015.1 EMBL gene 105 581 . + . gene_id "E6"; transcript_id ""; gbkey "Gene"; gene "E6"; gene_biotype "protein_coding"; +X05015.1 EMBL CDS 105 578 . 
+ 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P06463"; db_xref "InterPro:IPR001334"; db_xref "PDB:2I04"; db_xref "PDB:2I0I"; db_xref "PDB:2I0L"; db_xref "UniProtKB/Swiss-Prot:P06463"; gbkey "CDS"; gene "E6"; product "E6 protein"; protein_id "CAA28664.1"; exon_number "1"; +X05015.1 EMBL start_codon 105 107 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P06463"; db_xref "InterPro:IPR001334"; db_xref "PDB:2I04"; db_xref "PDB:2I0I"; db_xref "PDB:2I0L"; db_xref "UniProtKB/Swiss-Prot:P06463"; gbkey "CDS"; gene "E6"; product "E6 protein"; protein_id "CAA28664.1"; exon_number "1"; +X05015.1 EMBL stop_codon 579 581 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P06463"; db_xref "InterPro:IPR001334"; db_xref "PDB:2I04"; db_xref "PDB:2I0I"; db_xref "PDB:2I0L"; db_xref "UniProtKB/Swiss-Prot:P06463"; gbkey "CDS"; gene "E6"; product "E6 protein"; protein_id "CAA28664.1"; exon_number "1"; +X05015.1 EMBL gene 590 907 . + . gene_id "E7"; transcript_id ""; gbkey "Gene"; gene "E7"; gene_biotype "protein_coding"; +X05015.1 EMBL CDS 590 904 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P06788"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P06788"; gbkey "CDS"; gene "E7"; product "E7 protein"; protein_id "CAA28665.1"; exon_number "1"; +X05015.1 EMBL start_codon 590 592 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P06788"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P06788"; gbkey "CDS"; gene "E7"; product "E7 protein"; protein_id "CAA28665.1"; exon_number "1"; +X05015.1 EMBL stop_codon 905 907 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P06788"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P06788"; gbkey "CDS"; gene "E7"; product "E7 protein"; protein_id "CAA28665.1"; exon_number "1"; +X05015.1 EMBL gene 914 2887 . + . 
gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "protein_coding"; +X05015.1 EMBL CDS 914 2884 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:P06789"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "PDB:1R9W"; db_xref "PDB:1TUE"; db_xref "UniProtKB/Swiss-Prot:P06789"; gbkey "CDS"; gene "E1"; product "E1 protein"; protein_id "CAA28666.1"; exon_number "1"; +X05015.1 EMBL start_codon 914 916 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:P06789"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "PDB:1R9W"; db_xref "PDB:1TUE"; db_xref "UniProtKB/Swiss-Prot:P06789"; gbkey "CDS"; gene "E1"; product "E1 protein"; protein_id "CAA28666.1"; exon_number "1"; +X05015.1 EMBL stop_codon 2885 2887 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:P06789"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "PDB:1R9W"; db_xref "PDB:1TUE"; db_xref "UniProtKB/Swiss-Prot:P06789"; gbkey "CDS"; gene "E1"; product "E1 protein"; protein_id "CAA28666.1"; exon_number "1"; +X05015.1 EMBL gene 2817 3914 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "protein_coding"; +X05015.1 EMBL CDS 2817 3911 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:P06790"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "PDB:1F9F"; db_xref "PDB:1JJ4"; db_xref "PDB:1QQH"; db_xref "PDB:1TUE"; db_xref "UniProtKB/Swiss-Prot:P06790"; gbkey "CDS"; gene "E2"; product "E2 protein"; protein_id "CAA28667.1"; exon_number "1"; +X05015.1 EMBL start_codon 2817 2819 . 
+ 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:P06790"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "PDB:1F9F"; db_xref "PDB:1JJ4"; db_xref "PDB:1QQH"; db_xref "PDB:1TUE"; db_xref "UniProtKB/Swiss-Prot:P06790"; gbkey "CDS"; gene "E2"; product "E2 protein"; protein_id "CAA28667.1"; exon_number "1"; +X05015.1 EMBL stop_codon 3912 3914 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:P06790"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "PDB:1F9F"; db_xref "PDB:1JJ4"; db_xref "PDB:1QQH"; db_xref "PDB:1TUE"; db_xref "UniProtKB/Swiss-Prot:P06790"; gbkey "CDS"; gene "E2"; product "E2 protein"; protein_id "CAA28667.1"; exon_number "1"; +X05015.1 EMBL gene 3418 3684 . + . gene_id "E4"; transcript_id ""; gbkey "Gene"; gene "E4"; gene_biotype "protein_coding"; +X05015.1 EMBL CDS 3418 3681 . + 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; db_xref "InterPro:IPR003861"; db_xref "UniProtKB/Swiss-Prot:P06791"; gbkey "CDS"; gene "E4"; product "E4 protein"; protein_id "CAA28668.1"; exon_number "1"; +X05015.1 EMBL start_codon 3418 3420 . + 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; db_xref "InterPro:IPR003861"; db_xref "UniProtKB/Swiss-Prot:P06791"; gbkey "CDS"; gene "E4"; product "E4 protein"; protein_id "CAA28668.1"; exon_number "1"; +X05015.1 EMBL stop_codon 3682 3684 . + 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; db_xref "InterPro:IPR003861"; db_xref "UniProtKB/Swiss-Prot:P06791"; gbkey "CDS"; gene "E4"; product "E4 protein"; protein_id "CAA28668.1"; exon_number "1"; +X05015.1 EMBL gene 3936 4157 . + . gene_id "E5"; transcript_id ""; gbkey "Gene"; gene "E5"; gene_biotype "protein_coding"; +X05015.1 EMBL CDS 3936 4154 . 
+ 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; db_xref "InterPro:IPR004270"; db_xref "UniProtKB/Swiss-Prot:P06792"; gbkey "CDS"; gene "E5"; product "E5 protein"; protein_id "CAA28669.1"; exon_number "1"; +X05015.1 EMBL start_codon 3936 3938 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; db_xref "InterPro:IPR004270"; db_xref "UniProtKB/Swiss-Prot:P06792"; gbkey "CDS"; gene "E5"; product "E5 protein"; protein_id "CAA28669.1"; exon_number "1"; +X05015.1 EMBL stop_codon 4155 4157 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; db_xref "InterPro:IPR004270"; db_xref "UniProtKB/Swiss-Prot:P06792"; gbkey "CDS"; gene "E5"; product "E5 protein"; protein_id "CAA28669.1"; exon_number "1"; +X05015.1 EMBL gene 4244 5632 . + . gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "protein_coding"; +X05015.1 EMBL CDS 4244 5629 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; db_xref "GOA:P06793"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P06793"; gbkey "CDS"; gene "L2"; product "L2 protein"; protein_id "CAA28670.1"; exon_number "1"; +X05015.1 EMBL start_codon 4244 4246 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; db_xref "GOA:P06793"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P06793"; gbkey "CDS"; gene "L2"; product "L2 protein"; protein_id "CAA28670.1"; exon_number "1"; +X05015.1 EMBL stop_codon 5630 5632 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; db_xref "GOA:P06793"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P06793"; gbkey "CDS"; gene "L2"; product "L2 protein"; protein_id "CAA28670.1"; exon_number "1"; +X05015.1 EMBL gene 5430 7136 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; gene "L1"; gene_biotype "protein_coding"; +X05015.1 EMBL CDS 5430 7133 . 
+ 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; db_xref "GOA:P06794"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "PDB:2R5I"; db_xref "UniProtKB/Swiss-Prot:P06794"; gbkey "CDS"; gene "L1"; product "L1 protein"; protein_id "CAA28671.1"; exon_number "1"; +X05015.1 EMBL start_codon 5430 5432 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; db_xref "GOA:P06794"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "PDB:2R5I"; db_xref "UniProtKB/Swiss-Prot:P06794"; gbkey "CDS"; gene "L1"; product "L1 protein"; protein_id "CAA28671.1"; exon_number "1"; +X05015.1 EMBL stop_codon 7134 7136 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; db_xref "GOA:P06794"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "PDB:2R5I"; db_xref "UniProtKB/Swiss-Prot:P06794"; gbkey "CDS"; gene "L1"; product "L1 protein"; protein_id "CAA28671.1"; exon_number "1"; +X74474.1 EMBL gene 102 563 . + . gene_id "E6"; transcript_id ""; gbkey "Gene"; gene "E6"; gene_biotype "protein_coding"; +X74474.1 EMBL CDS 102 560 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P36809"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P36809"; gbkey "CDS"; gene "E6"; product "early protein"; protein_id "CAA52543.1"; exon_number "1"; +X74474.1 EMBL start_codon 102 104 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P36809"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P36809"; gbkey "CDS"; gene "E6"; product "early protein"; protein_id "CAA52543.1"; exon_number "1"; +X74474.1 EMBL stop_codon 561 563 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P36809"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P36809"; gbkey "CDS"; gene "E6"; product "early protein"; protein_id "CAA52543.1"; exon_number "1"; +X74474.1 EMBL gene 566 883 . + . 
gene_id "E7"; transcript_id ""; gbkey "Gene"; gene "E7"; gene_biotype "protein_coding"; +X74474.1 EMBL CDS 566 880 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P36826"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P36826"; gbkey "CDS"; gene "E7"; product "early protein"; protein_id "CAA52544.1"; exon_number "1"; +X74474.1 EMBL start_codon 566 568 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P36826"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P36826"; gbkey "CDS"; gene "E7"; product "early protein"; protein_id "CAA52544.1"; exon_number "1"; +X74474.1 EMBL stop_codon 881 883 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P36826"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P36826"; gbkey "CDS"; gene "E7"; product "early protein"; protein_id "CAA52544.1"; exon_number "1"; +X74474.1 EMBL gene 890 2785 . + . gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "protein_coding"; +X74474.1 EMBL CDS 890 2782 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:Q05112"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:Q05112"; gbkey "CDS"; gene "E1"; product "early protein"; protein_id "CAA52545.1"; exon_number "1"; +X74474.1 EMBL start_codon 890 892 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:Q05112"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:Q05112"; gbkey "CDS"; gene "E1"; product "early protein"; protein_id "CAA52545.1"; exon_number "1"; +X74474.1 EMBL stop_codon 2783 2785 . 
+ 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:Q05112"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:Q05112"; gbkey "CDS"; gene "E1"; product "early protein"; protein_id "CAA52545.1"; exon_number "1"; +X74474.1 EMBL gene 2727 3863 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "protein_coding"; +X74474.1 EMBL CDS 2727 3860 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:P36790"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/Swiss-Prot:P36790"; gbkey "CDS"; gene "E2"; product "early protein"; protein_id "CAA52546.1"; exon_number "1"; +X74474.1 EMBL start_codon 2727 2729 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:P36790"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/Swiss-Prot:P36790"; gbkey "CDS"; gene "E2"; product "early protein"; protein_id "CAA52546.1"; exon_number "1"; +X74474.1 EMBL stop_codon 3861 3863 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:P36790"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/Swiss-Prot:P36790"; gbkey "CDS"; gene "E2"; product "early protein"; protein_id "CAA52546.1"; exon_number "1"; +X74474.1 EMBL gene 4280 5671 . + . gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "protein_coding"; +X74474.1 EMBL CDS 4280 5668 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_5"; db_xref "GOA:P36756"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P36756"; gbkey "CDS"; gene "L2"; product "late protein"; protein_id "CAA52547.1"; exon_number "1"; +X74474.1 EMBL start_codon 4280 4282 . 
+ 0 gene_id "L2"; transcript_id "unassigned_transcript_5"; db_xref "GOA:P36756"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P36756"; gbkey "CDS"; gene "L2"; product "late protein"; protein_id "CAA52547.1"; exon_number "1"; +X74474.1 EMBL stop_codon 5669 5671 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_5"; db_xref "GOA:P36756"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P36756"; gbkey "CDS"; gene "L2"; product "late protein"; protein_id "CAA52547.1"; exon_number "1"; +X74474.1 EMBL gene 5631 7157 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; gene "L1"; gene_biotype "protein_coding"; +X74474.1 EMBL CDS 5631 7154 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_6"; db_xref "GOA:Q02515"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/Swiss-Prot:Q02515"; gbkey "CDS"; gene "L1"; product "late protein"; protein_id "CAA52548.1"; exon_number "1"; +X74474.1 EMBL start_codon 5631 5633 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_6"; db_xref "GOA:Q02515"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/Swiss-Prot:Q02515"; gbkey "CDS"; gene "L1"; product "late protein"; protein_id "CAA52548.1"; exon_number "1"; +X74474.1 EMBL stop_codon 7155 7157 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_6"; db_xref "GOA:Q02515"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/Swiss-Prot:Q02515"; gbkey "CDS"; gene "L1"; product "late protein"; protein_id "CAA52548.1"; exon_number "1"; +AB027020.1 DDBJ gene 102 557 . + . gene_id "E6"; transcript_id ""; gbkey "Gene"; gene "E6"; gene_biotype "protein_coding"; +AB027020.1 DDBJ CDS 102 554 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; product "BAA90727.1"; protein_id "BAA90727.1"; exon_number "1"; +AB027020.1 DDBJ start_codon 102 104 . 
+ 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; product "BAA90727.1"; protein_id "BAA90727.1"; exon_number "1"; +AB027020.1 DDBJ stop_codon 555 557 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; product "BAA90727.1"; protein_id "BAA90727.1"; exon_number "1"; +AB027020.1 DDBJ gene 564 878 . + . gene_id "E7"; transcript_id ""; gbkey "Gene"; gene "E7"; gene_biotype "protein_coding"; +AB027020.1 DDBJ CDS 564 875 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; product "BAA90728.1"; protein_id "BAA90728.1"; exon_number "1"; +AB027020.1 DDBJ start_codon 564 566 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; product "BAA90728.1"; protein_id "BAA90728.1"; exon_number "1"; +AB027020.1 DDBJ stop_codon 876 878 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; product "BAA90728.1"; protein_id "BAA90728.1"; exon_number "1"; +AB027020.1 DDBJ gene 886 2790 . + . gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "protein_coding"; +AB027020.1 DDBJ CDS 886 2787 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; product "BAA90729.1"; protein_id "BAA90729.1"; exon_number "1"; +AB027020.1 DDBJ start_codon 886 888 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; product "BAA90729.1"; protein_id "BAA90729.1"; exon_number "1"; +AB027020.1 DDBJ stop_codon 2788 2790 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; product "BAA90729.1"; protein_id "BAA90729.1"; exon_number "1"; +AB027020.1 DDBJ gene 2732 3838 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "protein_coding"; +AB027020.1 DDBJ CDS 2732 3835 . 
+ 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; product "BAA90730.1"; protein_id "BAA90730.1"; exon_number "1"; +AB027020.1 DDBJ start_codon 2732 2734 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; product "BAA90730.1"; protein_id "BAA90730.1"; exon_number "1"; +AB027020.1 DDBJ stop_codon 3836 3838 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; product "BAA90730.1"; protein_id "BAA90730.1"; exon_number "1"; +AB027020.1 DDBJ gene 3309 3614 . + . gene_id "E4"; transcript_id ""; gbkey "Gene"; gene "E4"; gene_biotype "protein_coding"; partial "true"; +AB027020.1 DDBJ CDS 3309 3611 . + 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; gene "E4"; note "start codon is not identified"; partial "true"; product "BAA90731.1"; protein_id "BAA90731.1"; exon_number "1"; +AB027020.1 DDBJ stop_codon 3612 3614 . + 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; gene "E4"; note "start codon is not identified"; partial "true"; product "BAA90731.1"; protein_id "BAA90731.1"; exon_number "1"; +AB027020.1 DDBJ gene 3846 4142 . + . gene_id "E5"; transcript_id ""; gbkey "Gene"; gene "E5"; gene_biotype "protein_coding"; partial "true"; +AB027020.1 DDBJ CDS 3846 4139 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; note "start codon is not identified"; partial "true"; product "BAA90732.1"; protein_id "BAA90732.1"; exon_number "1"; +AB027020.1 DDBJ stop_codon 4140 4142 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; note "start codon is not identified"; partial "true"; product "BAA90732.1"; protein_id "BAA90732.1"; exon_number "1"; +AB027020.1 DDBJ gene 4157 5560 . + . gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "protein_coding"; +AB027020.1 DDBJ CDS 4157 5557 . 
+ 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; product "BAA90733.1"; protein_id "BAA90733.1"; exon_number "1"; +AB027020.1 DDBJ start_codon 4157 4159 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; product "BAA90733.1"; protein_id "BAA90733.1"; exon_number "1"; +AB027020.1 DDBJ stop_codon 5558 5560 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; product "BAA90733.1"; protein_id "BAA90733.1"; exon_number "1"; +AB027020.1 DDBJ gene 5541 7064 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; gene "L1"; gene_biotype "protein_coding"; +AB027020.1 DDBJ CDS 5541 7061 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; product "BAA90734.1"; protein_id "BAA90734.1"; exon_number "1"; +AB027020.1 DDBJ start_codon 5541 5543 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; product "BAA90734.1"; protein_id "BAA90734.1"; exon_number "1"; +AB027020.1 DDBJ stop_codon 7062 7064 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; product "BAA90734.1"; protein_id "BAA90734.1"; exon_number "1"; +D90400.1 DDBJ CDS 110 556 . + 0 gene_id "unassigned_gene_1"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; note "open reading frame E6"; product "BAA31845.1"; protein_id "BAA31845.1"; exon_number "1"; +D90400.1 DDBJ start_codon 110 112 . + 0 gene_id "unassigned_gene_1"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; note "open reading frame E6"; product "BAA31845.1"; protein_id "BAA31845.1"; exon_number "1"; +D90400.1 DDBJ stop_codon 557 559 . + 0 gene_id "unassigned_gene_1"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; note "open reading frame E6"; product "BAA31845.1"; protein_id "BAA31845.1"; exon_number "1"; +D90400.1 DDBJ CDS 574 867 . 
+ 0 gene_id "unassigned_gene_2"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; note "open reading frame E7"; product "BAA31846.1"; protein_id "BAA31846.1"; exon_number "1"; +D90400.1 DDBJ start_codon 574 576 . + 0 gene_id "unassigned_gene_2"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; note "open reading frame E7"; product "BAA31846.1"; protein_id "BAA31846.1"; exon_number "1"; +D90400.1 DDBJ stop_codon 868 870 . + 0 gene_id "unassigned_gene_2"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; note "open reading frame E7"; product "BAA31846.1"; protein_id "BAA31846.1"; exon_number "1"; +D90400.1 DDBJ CDS 883 2814 . + 0 gene_id "unassigned_gene_3"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; note "open reading frame E1"; product "BAA31847.1"; protein_id "BAA31847.1"; exon_number "1"; +D90400.1 DDBJ start_codon 883 885 . + 0 gene_id "unassigned_gene_3"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; note "open reading frame E1"; product "BAA31847.1"; protein_id "BAA31847.1"; exon_number "1"; +D90400.1 DDBJ stop_codon 2815 2817 . + 0 gene_id "unassigned_gene_3"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; note "open reading frame E1"; product "BAA31847.1"; protein_id "BAA31847.1"; exon_number "1"; +D90400.1 DDBJ CDS 2753 3826 . + 0 gene_id "unassigned_gene_4"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; note "open reading frame E2"; product "BAA31848.1"; protein_id "BAA31848.1"; exon_number "1"; +D90400.1 DDBJ start_codon 2753 2755 . + 0 gene_id "unassigned_gene_4"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; note "open reading frame E2"; product "BAA31848.1"; protein_id "BAA31848.1"; exon_number "1"; +D90400.1 DDBJ stop_codon 3827 3829 . + 0 gene_id "unassigned_gene_4"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; note "open reading frame E2"; product "BAA31848.1"; protein_id "BAA31848.1"; exon_number "1"; +D90400.1 DDBJ CDS 3330 3602 . 
+ 0 gene_id "unassigned_gene_5"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; note "no ATG start codon~open reading frame E4"; partial "true"; product "BAA14396.1"; protein_id "BAA14396.1"; exon_number "1"; +D90400.1 DDBJ stop_codon 3603 3605 . + 0 gene_id "unassigned_gene_5"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; note "no ATG start codon~open reading frame E4"; partial "true"; product "BAA14396.1"; protein_id "BAA14396.1"; exon_number "1"; +D90400.1 DDBJ CDS 3892 4119 . + 0 gene_id "unassigned_gene_6"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; note "open reading frame E5"; product "BAA31849.1"; protein_id "BAA31849.1"; exon_number "1"; +D90400.1 DDBJ start_codon 3892 3894 . + 0 gene_id "unassigned_gene_6"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; note "open reading frame E5"; product "BAA31849.1"; protein_id "BAA31849.1"; exon_number "1"; +D90400.1 DDBJ stop_codon 4120 4122 . + 0 gene_id "unassigned_gene_6"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; note "open reading frame E5"; product "BAA31849.1"; protein_id "BAA31849.1"; exon_number "1"; +D90400.1 DDBJ CDS 4244 5659 . + 0 gene_id "unassigned_gene_7"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; note "open reading frame L2"; product "BAA31850.1"; protein_id "BAA31850.1"; exon_number "1"; +D90400.1 DDBJ start_codon 4244 4246 . + 0 gene_id "unassigned_gene_7"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; note "open reading frame L2"; product "BAA31850.1"; protein_id "BAA31850.1"; exon_number "1"; +D90400.1 DDBJ stop_codon 5660 5662 . + 0 gene_id "unassigned_gene_7"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; note "open reading frame L2"; product "BAA31850.1"; protein_id "BAA31850.1"; exon_number "1"; +D90400.1 DDBJ CDS 5565 7136 . 
+ 0 gene_id "unassigned_gene_8"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; note "open reading frame L1"; product "BAA31851.1"; protein_id "BAA31851.1"; exon_number "1"; +D90400.1 DDBJ start_codon 5565 5567 . + 0 gene_id "unassigned_gene_8"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; note "open reading frame L1"; product "BAA31851.1"; protein_id "BAA31851.1"; exon_number "1"; +D90400.1 DDBJ stop_codon 7137 7139 . + 0 gene_id "unassigned_gene_8"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; note "open reading frame L1"; product "BAA31851.1"; protein_id "BAA31851.1"; exon_number "1"; +DQ080079.1 Genbank CDS 1 474 . + 0 gene_id "unassigned_gene_1"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; product "E6 protein"; protein_id "AAZ39491.1"; exon_number "1"; +DQ080079.1 Genbank start_codon 1 3 . + 0 gene_id "unassigned_gene_1"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; product "E6 protein"; protein_id "AAZ39491.1"; exon_number "1"; +DQ080079.1 Genbank stop_codon 475 477 . + 0 gene_id "unassigned_gene_1"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; product "E6 protein"; protein_id "AAZ39491.1"; exon_number "1"; +DQ080079.1 Genbank CDS 484 813 . + 0 gene_id "unassigned_gene_2"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; product "E7 protein"; protein_id "AAZ39492.1"; exon_number "1"; +DQ080079.1 Genbank start_codon 484 486 . + 0 gene_id "unassigned_gene_2"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; product "E7 protein"; protein_id "AAZ39492.1"; exon_number "1"; +DQ080079.1 Genbank stop_codon 814 816 . + 0 gene_id "unassigned_gene_2"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; product "E7 protein"; protein_id "AAZ39492.1"; exon_number "1"; +DQ080079.1 Genbank CDS 823 2742 . + 0 gene_id "unassigned_gene_3"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; product "E1 protein"; protein_id "AAZ39493.1"; exon_number "1"; +DQ080079.1 Genbank start_codon 823 825 . 
+ 0 gene_id "unassigned_gene_3"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; product "E1 protein"; protein_id "AAZ39493.1"; exon_number "1"; +DQ080079.1 Genbank stop_codon 2743 2745 . + 0 gene_id "unassigned_gene_3"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; product "E1 protein"; protein_id "AAZ39493.1"; exon_number "1"; +DQ080079.1 Genbank CDS 2672 3781 . + 0 gene_id "unassigned_gene_4"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; product "E2 protein"; protein_id "AAZ39494.1"; exon_number "1"; +DQ080079.1 Genbank start_codon 2672 2674 . + 0 gene_id "unassigned_gene_4"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; product "E2 protein"; protein_id "AAZ39494.1"; exon_number "1"; +DQ080079.1 Genbank stop_codon 3782 3784 . + 0 gene_id "unassigned_gene_4"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; product "E2 protein"; protein_id "AAZ39494.1"; exon_number "1"; +DQ080079.1 Genbank CDS 3267 3548 . + 0 gene_id "unassigned_gene_5"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; note "lacks traditional start codon"; partial "true"; product "E4 protein"; protein_id "AAZ39495.1"; exon_number "1"; +DQ080079.1 Genbank stop_codon 3549 3551 . + 0 gene_id "unassigned_gene_5"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; note "lacks traditional start codon"; partial "true"; product "E4 protein"; protein_id "AAZ39495.1"; exon_number "1"; +DQ080079.1 Genbank CDS 3830 4048 . + 0 gene_id "unassigned_gene_6"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; product "E5 protein"; protein_id "AAZ39496.1"; exon_number "1"; +DQ080079.1 Genbank start_codon 3830 3832 . + 0 gene_id "unassigned_gene_6"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; product "E5 protein"; protein_id "AAZ39496.1"; exon_number "1"; +DQ080079.1 Genbank stop_codon 4049 4051 . 
+ 0 gene_id "unassigned_gene_6"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; product "E5 protein"; protein_id "AAZ39496.1"; exon_number "1"; +DQ080079.1 Genbank CDS 4098 5504 . + 0 gene_id "unassigned_gene_7"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; product "L2 protein"; protein_id "AAZ39497.1"; exon_number "1"; +DQ080079.1 Genbank start_codon 4098 4100 . + 0 gene_id "unassigned_gene_7"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; product "L2 protein"; protein_id "AAZ39497.1"; exon_number "1"; +DQ080079.1 Genbank stop_codon 5505 5507 . + 0 gene_id "unassigned_gene_7"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; product "L2 protein"; protein_id "AAZ39497.1"; exon_number "1"; +DQ080079.1 Genbank CDS 5488 7002 . + 0 gene_id "unassigned_gene_8"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; product "L1 protein"; protein_id "AAZ39498.1"; exon_number "1"; +DQ080079.1 Genbank start_codon 5488 5490 . + 0 gene_id "unassigned_gene_8"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; product "L1 protein"; protein_id "AAZ39498.1"; exon_number "1"; +DQ080079.1 Genbank stop_codon 7003 7005 . + 0 gene_id "unassigned_gene_8"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; product "L1 protein"; protein_id "AAZ39498.1"; exon_number "1"; +J04353.1 Genbank gene 108 557 . + . gene_id "E6"; transcript_id ""; gbkey "Gene"; gene "E6"; gene_biotype "protein_coding"; +J04353.1 Genbank CDS 108 554 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; note "ORF E6 from bp 39 to 557"; product "transforming protein"; protein_id "AAA46950.1"; exon_number "1"; +J04353.1 Genbank start_codon 108 110 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; note "ORF E6 from bp 39 to 557"; product "transforming protein"; protein_id "AAA46950.1"; exon_number "1"; +J04353.1 Genbank stop_codon 555 557 . 
+ 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; note "ORF E6 from bp 39 to 557"; product "transforming protein"; protein_id "AAA46950.1"; exon_number "1"; +J04353.1 Genbank gene 560 856 . + . gene_id "E7"; transcript_id ""; gbkey "Gene"; gene "E7"; gene_biotype "protein_coding"; +J04353.1 Genbank CDS 560 853 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; note "ORF E7 from bp 545 to 856"; product "transforming protein"; protein_id "AAA46951.1"; exon_number "1"; +J04353.1 Genbank start_codon 560 562 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; note "ORF E7 from bp 545 to 856"; product "transforming protein"; protein_id "AAA46951.1"; exon_number "1"; +J04353.1 Genbank stop_codon 854 856 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; note "ORF E7 from bp 545 to 856"; product "transforming protein"; protein_id "AAA46951.1"; exon_number "1"; +J04353.1 Genbank gene 862 2751 . + . gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "protein_coding"; +J04353.1 Genbank CDS 862 2748 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "ORF E1 from bp 850 to bp 2751"; product "replication protein"; protein_id "AAA46952.1"; exon_number "1"; +J04353.1 Genbank start_codon 862 864 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "ORF E1 from bp 850 to bp 2751"; product "replication protein"; protein_id "AAA46952.1"; exon_number "1"; +J04353.1 Genbank stop_codon 2749 2751 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "ORF E1 from bp 850 to bp 2751"; product "replication protein"; protein_id "AAA46952.1"; exon_number "1"; +J04353.1 Genbank gene 2693 3811 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "protein_coding"; +J04353.1 Genbank CDS 2693 3808 . 
+ 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; note "ORF E2 from bp 2663 to 3811"; product "regulatory protein"; protein_id "AAA46953.1"; exon_number "1"; +J04353.1 Genbank start_codon 2693 2695 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; note "ORF E2 from bp 2663 to 3811"; product "regulatory protein"; protein_id "AAA46953.1"; exon_number "1"; +J04353.1 Genbank stop_codon 3809 3811 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; note "ORF E2 from bp 2663 to 3811"; product "regulatory protein"; protein_id "AAA46953.1"; exon_number "1"; +J04353.1 Genbank gene 3270 3578 . + . gene_id "E4"; transcript_id ""; gbkey "Gene"; gene "E4"; gene_biotype "protein_coding"; +J04353.1 Genbank CDS 3270 3575 . + 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; gene "E4"; partial "true"; product "AAA46949.1"; protein_id "AAA46949.1"; exon_number "1"; +J04353.1 Genbank stop_codon 3576 3578 . + 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; gene "E4"; partial "true"; product "AAA46949.1"; protein_id "AAA46949.1"; exon_number "1"; +J04353.1 Genbank gene 3816 4070 . + . gene_id "E5"; transcript_id ""; gbkey "Gene"; gene "E5"; gene_biotype "protein_coding"; +J04353.1 Genbank CDS 3816 4067 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; product "AAA46954.1"; protein_id "AAA46954.1"; exon_number "1"; +J04353.1 Genbank start_codon 3816 3818 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; product "AAA46954.1"; protein_id "AAA46954.1"; exon_number "1"; +J04353.1 Genbank stop_codon 4068 4070 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; product "AAA46954.1"; protein_id "AAA46954.1"; exon_number "1"; +J04353.1 Genbank gene 4171 5571 . + . 
gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "protein_coding"; +J04353.1 Genbank CDS 4171 5568 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; note "ORF L2 from bp 4060 to bp 5571"; product "minor capsid protein"; protein_id "AAA46955.1"; exon_number "1"; +J04353.1 Genbank start_codon 4171 4173 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; note "ORF L2 from bp 4060 to bp 5571"; product "minor capsid protein"; protein_id "AAA46955.1"; exon_number "1"; +J04353.1 Genbank stop_codon 5569 5571 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; note "ORF L2 from bp 4060 to bp 5571"; product "minor capsid protein"; protein_id "AAA46955.1"; exon_number "1"; +J04353.1 Genbank gene 5552 7066 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; gene "L1"; gene_biotype "protein_coding"; +J04353.1 Genbank CDS 5552 7063 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; note "ORF L1 from bp 5516 to 7066"; product "major capsid protein"; protein_id "AAA46956.1"; exon_number "1"; +J04353.1 Genbank start_codon 5552 5554 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; note "ORF L1 from bp 5516 to 7066"; product "major capsid protein"; protein_id "AAA46956.1"; exon_number "1"; +J04353.1 Genbank stop_codon 7064 7066 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; note "ORF L1 from bp 5516 to 7066"; product "major capsid protein"; protein_id "AAA46956.1"; exon_number "1"; +M12732.1 Genbank gene 109 558 . + . gene_id "E6"; transcript_id ""; gbkey "Gene"; gene "E6"; gene_biotype "protein_coding"; +M12732.1 Genbank CDS 109 555 . 
+ 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; note "E6 ORF from 76 to 558; putative"; product "transforming protein"; protein_id "AAA46958.1"; exon_number "1"; +M12732.1 Genbank start_codon 109 111 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; note "E6 ORF from 76 to 558; putative"; product "transforming protein"; protein_id "AAA46958.1"; exon_number "1"; +M12732.1 Genbank stop_codon 556 558 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; note "E6 ORF from 76 to 558; putative"; product "transforming protein"; protein_id "AAA46958.1"; exon_number "1"; +M12732.1 Genbank gene 573 866 . + . gene_id "E7"; transcript_id ""; gbkey "Gene"; gene "E7"; gene_biotype "protein_coding"; +M12732.1 Genbank CDS 573 863 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; note "E7 ORF from bp 543 to 866; putative"; product "transforming protein"; protein_id "AAA46959.1"; exon_number "1"; +M12732.1 Genbank start_codon 573 575 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; note "E7 ORF from bp 543 to 866; putative"; product "transforming protein"; protein_id "AAA46959.1"; exon_number "1"; +M12732.1 Genbank stop_codon 864 866 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; note "E7 ORF from bp 543 to 866; putative"; product "transforming protein"; protein_id "AAA46959.1"; exon_number "1"; +M12732.1 Genbank gene 879 2813 . + . gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "protein_coding"; +M12732.1 Genbank CDS 879 2810 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "E1 ORF from bp 867 to 2813; putative"; product "replication protein"; protein_id "AAA46960.1"; exon_number "1"; +M12732.1 Genbank start_codon 879 881 . 
+ 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "E1 ORF from bp 867 to 2813; putative"; product "replication protein"; protein_id "AAA46960.1"; exon_number "1"; +M12732.1 Genbank stop_codon 2811 2813 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "E1 ORF from bp 867 to 2813; putative"; product "replication protein"; protein_id "AAA46960.1"; exon_number "1"; +M12732.1 Genbank gene 2749 3810 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "protein_coding"; +M12732.1 Genbank CDS 2749 3807 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; note "E2 ORF from bp 2728 to 3810; putative"; product "regulatory protein"; protein_id "AAA46961.1"; exon_number "1"; +M12732.1 Genbank start_codon 2749 2751 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; note "E2 ORF from bp 2728 to 3810; putative"; product "regulatory protein"; protein_id "AAA46961.1"; exon_number "1"; +M12732.1 Genbank stop_codon 3808 3810 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; note "E2 ORF from bp 2728 to 3810; putative"; product "regulatory protein"; protein_id "AAA46961.1"; exon_number "1"; +M12732.1 Genbank gene 3326 3577 . + . gene_id "E4"; transcript_id ""; gbkey "Gene"; gene "E4"; gene_biotype "protein_coding"; +M12732.1 Genbank CDS 3326 3574 . + 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; gene "E4"; partial "true"; product "AAA46957.1"; protein_id "AAA46957.1"; exon_number "1"; +M12732.1 Genbank stop_codon 3575 3577 . + 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; gene "E4"; partial "true"; product "AAA46957.1"; protein_id "AAA46957.1"; exon_number "1"; +M12732.1 Genbank gene 3854 4081 . + . gene_id "E5"; transcript_id ""; gbkey "Gene"; gene "E5"; gene_biotype "protein_coding"; +M12732.1 Genbank CDS 3854 4078 . 
+ 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; product "AAA46962.1"; protein_id "AAA46962.1"; exon_number "1"; +M12732.1 Genbank start_codon 3854 3856 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; product "AAA46962.1"; protein_id "AAA46962.1"; exon_number "1"; +M12732.1 Genbank stop_codon 4079 4081 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; product "AAA46962.1"; protein_id "AAA46962.1"; exon_number "1"; +M12732.1 Genbank gene 4210 5613 . + . gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "protein_coding"; +M12732.1 Genbank CDS 4210 5610 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; note "L2 ORF from bp 4198 to 5613; putative"; product "minor capsid protein"; protein_id "AAA46963.1"; exon_number "1"; +M12732.1 Genbank start_codon 4210 4212 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; note "L2 ORF from bp 4198 to 5613; putative"; product "minor capsid protein"; protein_id "AAA46963.1"; exon_number "1"; +M12732.1 Genbank stop_codon 5611 5613 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; note "L2 ORF from bp 4198 to 5613; putative"; product "minor capsid protein"; protein_id "AAA46963.1"; exon_number "1"; +M12732.1 Genbank gene 5594 7093 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; gene "L1"; gene_biotype "protein_coding"; +M12732.1 Genbank CDS 5594 7090 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; note "L1 ORF from bp 5516 to 7093; putative"; product "major capsid protein"; protein_id "AAA46964.1"; exon_number "1"; +M12732.1 Genbank start_codon 5594 5596 . 
+ 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; note "L1 ORF from bp 5516 to 7093; putative"; product "major capsid protein"; protein_id "AAA46964.1"; exon_number "1"; +M12732.1 Genbank stop_codon 7091 7093 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; note "L1 ORF from bp 5516 to 7093; putative"; product "major capsid protein"; protein_id "AAA46964.1"; exon_number "1"; +M62849.1 Genbank gene 107 583 . + . gene_id "E6"; transcript_id ""; gbkey "Gene"; gene "E6"; gene_biotype "protein_coding"; +M62849.1 Genbank CDS 107 580 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; note "putative"; product "transforming protein"; protein_id "AAA47050.1"; exon_number "1"; +M62849.1 Genbank start_codon 107 109 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; note "putative"; product "transforming protein"; protein_id "AAA47050.1"; exon_number "1"; +M62849.1 Genbank stop_codon 581 583 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; note "putative"; product "transforming protein"; protein_id "AAA47050.1"; exon_number "1"; +M62849.1 Genbank gene 592 921 . + . gene_id "E7"; transcript_id ""; gbkey "Gene"; gene "E7"; gene_biotype "protein_coding"; +M62849.1 Genbank CDS 592 918 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; note "putative"; product "transforming protein"; protein_id "AAA47051.1"; exon_number "1"; +M62849.1 Genbank start_codon 592 594 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; note "putative"; product "transforming protein"; protein_id "AAA47051.1"; exon_number "1"; +M62849.1 Genbank stop_codon 919 921 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; note "putative"; product "transforming protein"; protein_id "AAA47051.1"; exon_number "1"; +M62849.1 Genbank gene 928 2871 . 
+ . gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "protein_coding"; +M62849.1 Genbank CDS 928 2868 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "putative"; product "replication protein"; protein_id "AAA47052.1"; exon_number "1"; +M62849.1 Genbank start_codon 928 930 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "putative"; product "replication protein"; protein_id "AAA47052.1"; exon_number "1"; +M62849.1 Genbank stop_codon 2869 2871 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; gene "E1"; note "putative"; product "replication protein"; protein_id "AAA47052.1"; exon_number "1"; +M62849.1 Genbank gene 2798 3910 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "protein_coding"; +M62849.1 Genbank CDS 2798 3907 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; note "putative"; product "regulatory protein"; protein_id "AAA47053.1"; exon_number "1"; +M62849.1 Genbank start_codon 2798 2800 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; note "putative"; product "regulatory protein"; protein_id "AAA47053.1"; exon_number "1"; +M62849.1 Genbank stop_codon 3908 3910 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; gene "E2"; note "putative"; product "regulatory protein"; protein_id "AAA47053.1"; exon_number "1"; +M62849.1 Genbank gene 3393 3677 . + . gene_id "E4"; transcript_id ""; gbkey "Gene"; gene "E4"; gene_biotype "protein_coding"; +M62849.1 Genbank CDS 3393 3674 . + 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; gene "E4"; partial "true"; product "E4 ORF"; protein_id "AAA47049.1"; exon_number "1"; +M62849.1 Genbank stop_codon 3675 3677 . 
+ 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; gene "E4"; partial "true"; product "E4 ORF"; protein_id "AAA47049.1"; exon_number "1"; +M62849.1 Genbank gene 3958 4176 . + . gene_id "E5"; transcript_id ""; gbkey "Gene"; gene "E5"; gene_biotype "protein_coding"; +M62849.1 Genbank CDS 3958 4173 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; product "E5 ORF"; protein_id "AAA47054.1"; exon_number "1"; +M62849.1 Genbank start_codon 3958 3960 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; product "E5 ORF"; protein_id "AAA47054.1"; exon_number "1"; +M62849.1 Genbank stop_codon 4174 4176 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; product "E5 ORF"; protein_id "AAA47054.1"; exon_number "1"; +M62849.1 Genbank gene 4250 5662 . + . gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "protein_coding"; +M62849.1 Genbank CDS 4250 5659 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; note "putative"; product "minor capsid protein"; protein_id "AAA47055.1"; exon_number "1"; +M62849.1 Genbank start_codon 4250 4252 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; note "putative"; product "minor capsid protein"; protein_id "AAA47055.1"; exon_number "1"; +M62849.1 Genbank stop_codon 5660 5662 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; gene "L2"; note "putative"; product "minor capsid protein"; protein_id "AAA47055.1"; exon_number "1"; +M62849.1 Genbank gene 5643 7160 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; gene "L1"; gene_biotype "protein_coding"; +M62849.1 Genbank CDS 5643 7157 . 
+ 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; note "L1 ORF from bp 5610 to 7160; putative"; product "major capsid protein"; protein_id "AAA47056.1"; exon_number "1"; +M62849.1 Genbank start_codon 5643 5645 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; note "L1 ORF from bp 5610 to 7160; putative"; product "major capsid protein"; protein_id "AAA47056.1"; exon_number "1"; +M62849.1 Genbank stop_codon 7158 7160 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; gene "L1"; note "L1 ORF from bp 5610 to 7160; putative"; product "major capsid protein"; protein_id "AAA47056.1"; exon_number "1"; +U21941.1 Genbank gene 107 583 . + . gene_id "E6"; transcript_id ""; gbkey "Gene"; gene "E6"; gene_biotype "protein_coding"; +U21941.1 Genbank CDS 107 580 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; product "AAC54850.1"; protein_id "AAC54850.1"; exon_number "1"; +U21941.1 Genbank start_codon 107 109 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; product "AAC54850.1"; protein_id "AAC54850.1"; exon_number "1"; +U21941.1 Genbank stop_codon 581 583 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "E6"; product "AAC54850.1"; protein_id "AAC54850.1"; exon_number "1"; +U21941.1 Genbank gene 592 921 . + . gene_id "E7"; transcript_id ""; gbkey "Gene"; gene "E7"; gene_biotype "protein_coding"; +U21941.1 Genbank CDS 592 918 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; product "AAC54851.1"; protein_id "AAC54851.1"; exon_number "1"; +U21941.1 Genbank start_codon 592 594 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; product "AAC54851.1"; protein_id "AAC54851.1"; exon_number "1"; +U21941.1 Genbank stop_codon 919 921 . 
+ 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; gbkey "CDS"; gene "E7"; product "AAC54851.1"; protein_id "AAC54851.1"; exon_number "1"; +U21941.1 Genbank CDS 928 2883 . + 0 gene_id "unassigned_gene_1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; product "AAC54852.1"; protein_id "AAC54852.1"; exon_number "1"; +U21941.1 Genbank start_codon 928 930 . + 0 gene_id "unassigned_gene_1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; product "AAC54852.1"; protein_id "AAC54852.1"; exon_number "1"; +U21941.1 Genbank stop_codon 2884 2886 . + 0 gene_id "unassigned_gene_1"; transcript_id "unassigned_transcript_3"; gbkey "CDS"; product "AAC54852.1"; protein_id "AAC54852.1"; exon_number "1"; +U21941.1 Genbank gene 928 2746 . + . gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "other"; part "1"; +U21941.1 Genbank gene 2748 2886 . + . gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "other"; part "2"; +U21941.1 Genbank CDS 2813 3892 . + 0 gene_id "unassigned_gene_2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; product "AAC54853.1"; protein_id "AAC54853.1"; exon_number "1"; +U21941.1 Genbank start_codon 2813 2815 . + 0 gene_id "unassigned_gene_2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; product "AAC54853.1"; protein_id "AAC54853.1"; exon_number "1"; +U21941.1 Genbank stop_codon 3893 3895 . + 0 gene_id "unassigned_gene_2"; transcript_id "unassigned_transcript_4"; gbkey "CDS"; product "AAC54853.1"; protein_id "AAC54853.1"; exon_number "1"; +U21941.1 Genbank gene 2813 3079 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "other"; part "1"; +U21941.1 Genbank gene 3081 3260 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "other"; part "2"; +U21941.1 Genbank gene 3262 3895 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "other"; part "3"; +U21941.1 Genbank gene 3408 3662 . + . 
gene_id "E4"; transcript_id ""; gbkey "Gene"; gene "E4"; gene_biotype "protein_coding"; +U21941.1 Genbank CDS 3408 3659 . + 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; gene "E4"; partial "true"; product "AAC54854.1"; protein_id "AAC54854.1"; exon_number "1"; +U21941.1 Genbank stop_codon 3660 3662 . + 0 gene_id "E4"; transcript_id "unassigned_transcript_5"; gbkey "CDS"; gene "E4"; partial "true"; product "AAC54854.1"; protein_id "AAC54854.1"; exon_number "1"; +U21941.1 Genbank gene 3909 4145 . + . gene_id "E5"; transcript_id ""; gbkey "Gene"; gene "E5"; gene_biotype "protein_coding"; +U21941.1 Genbank CDS 3909 4142 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; product "AAC54855.1"; protein_id "AAC54855.1"; exon_number "1"; +U21941.1 Genbank start_codon 3909 3911 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; product "AAC54855.1"; protein_id "AAC54855.1"; exon_number "1"; +U21941.1 Genbank stop_codon 4143 4145 . + 0 gene_id "E5"; transcript_id "unassigned_transcript_6"; gbkey "CDS"; gene "E5"; product "AAC54855.1"; protein_id "AAC54855.1"; exon_number "1"; +U21941.1 Genbank CDS 4209 5606 . + 0 gene_id "unassigned_gene_3"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; product "AAC54856.1"; protein_id "AAC54856.1"; exon_number "1"; +U21941.1 Genbank start_codon 4209 4211 . + 0 gene_id "unassigned_gene_3"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; product "AAC54856.1"; protein_id "AAC54856.1"; exon_number "1"; +U21941.1 Genbank stop_codon 5607 5609 . + 0 gene_id "unassigned_gene_3"; transcript_id "unassigned_transcript_7"; gbkey "CDS"; product "AAC54856.1"; protein_id "AAC54856.1"; exon_number "1"; +U21941.1 Genbank gene 4209 4752 . + . gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "other"; part "1"; +U21941.1 Genbank gene 4754 5174 . + . 
gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "other"; part "2"; +U21941.1 Genbank gene 5176 5560 . + . gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "other"; part "3"; +U21941.1 Genbank gene 5562 5609 . + . gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "other"; part "4"; +U21941.1 Genbank CDS 5590 7101 . + 0 gene_id "unassigned_gene_4"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; product "AAC54857.1"; protein_id "AAC54857.1"; exon_number "1"; +U21941.1 Genbank start_codon 5590 5592 . + 0 gene_id "unassigned_gene_4"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; product "AAC54857.1"; protein_id "AAC54857.1"; exon_number "1"; +U21941.1 Genbank stop_codon 7102 7104 . + 0 gene_id "unassigned_gene_4"; transcript_id "unassigned_transcript_8"; gbkey "CDS"; product "AAC54857.1"; protein_id "AAC54857.1"; exon_number "1"; +U21941.1 Genbank gene 5590 6784 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; gene "L1"; gene_biotype "other"; part "1"; +U21941.1 Genbank gene 6786 7104 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; gene "L1"; gene_biotype "other"; part "2"; +X74477.1 EMBL gene 110 559 . + . gene_id "E6"; transcript_id ""; gbkey "Gene"; gene "E6"; gene_biotype "protein_coding"; +X74477.1 EMBL CDS 110 556 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P27228"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P27228"; gbkey "CDS"; gene "E6"; product "early protein"; protein_id "CAA52561.1"; exon_number "1"; +X74477.1 EMBL start_codon 110 112 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P27228"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P27228"; gbkey "CDS"; gene "E6"; product "early protein"; protein_id "CAA52561.1"; exon_number "1"; +X74477.1 EMBL stop_codon 557 559 . 
+ 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P27228"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P27228"; gbkey "CDS"; gene "E6"; product "early protein"; protein_id "CAA52561.1"; exon_number "1"; +X74477.1 EMBL gene 562 861 . + . gene_id "E7"; transcript_id ""; gbkey "Gene"; gene "E7"; gene_biotype "protein_coding"; +X74477.1 EMBL CDS 562 858 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P27230"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P27230"; gbkey "CDS"; gene "E7"; product "early protein"; protein_id "CAA52562.1"; exon_number "1"; +X74477.1 EMBL start_codon 562 564 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P27230"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P27230"; gbkey "CDS"; gene "E7"; product "early protein"; protein_id "CAA52562.1"; exon_number "1"; +X74477.1 EMBL stop_codon 859 861 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P27230"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P27230"; gbkey "CDS"; gene "E7"; product "early protein"; protein_id "CAA52562.1"; exon_number "1"; +X74477.1 EMBL gene 868 2781 . + . gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "protein_coding"; +X74477.1 EMBL CDS 868 2778 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:P27220"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:P27220"; gbkey "CDS"; gene "E1"; product "early protein"; protein_id "CAA52563.1"; exon_number "1"; +X74477.1 EMBL start_codon 868 870 . 
+ 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:P27220"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:P27220"; gbkey "CDS"; gene "E1"; product "early protein"; protein_id "CAA52563.1"; exon_number "1"; +X74477.1 EMBL stop_codon 2779 2781 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:P27220"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:P27220"; gbkey "CDS"; gene "E1"; product "early protein"; protein_id "CAA52563.1"; exon_number "1"; +X74477.1 EMBL gene 2714 2717 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "protein_coding"; partial "true"; +X74477.1 EMBL CDS 2714 2714 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:P27222"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/Swiss-Prot:P27222"; gbkey "CDS"; gene "E2"; partial "true"; product "early protein"; protein_id "CAA52564.1"; exon_number "1"; +X74477.1 EMBL start_codon 2714 2716 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:P27222"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/Swiss-Prot:P27222"; gbkey "CDS"; gene "E2"; partial "true"; product "early protein"; protein_id "CAA52564.1"; exon_number "1"; +X74477.1 EMBL gene 4211 5620 . + . gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "protein_coding"; +X74477.1 EMBL CDS 4211 5617 . 
+ 0 gene_id "L2"; transcript_id "unassigned_transcript_5"; db_xref "GOA:P27234"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P27234"; gbkey "CDS"; gene "L2"; product "late protein"; protein_id "CAA52565.1"; exon_number "1"; +X74477.1 EMBL start_codon 4211 4213 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_5"; db_xref "GOA:P27234"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P27234"; gbkey "CDS"; gene "L2"; product "late protein"; protein_id "CAA52565.1"; exon_number "1"; +X74477.1 EMBL stop_codon 5618 5620 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_5"; db_xref "GOA:P27234"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P27234"; gbkey "CDS"; gene "L2"; product "late protein"; protein_id "CAA52565.1"; exon_number "1"; +X74477.1 EMBL gene 5601 7109 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; gene "L1"; gene_biotype "protein_coding"; +X74477.1 EMBL CDS 5601 7106 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_6"; db_xref "GOA:P27232"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "PDB:2R5J"; db_xref "UniProtKB/Swiss-Prot:P27232"; gbkey "CDS"; gene "L1"; product "late protein"; protein_id "CAA52566.1"; exon_number "1"; +X74477.1 EMBL start_codon 5601 5603 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_6"; db_xref "GOA:P27232"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "PDB:2R5J"; db_xref "UniProtKB/Swiss-Prot:P27232"; gbkey "CDS"; gene "L1"; product "late protein"; protein_id "CAA52566.1"; exon_number "1"; +X74477.1 EMBL stop_codon 7107 7109 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_6"; db_xref "GOA:P27232"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "PDB:2R5J"; db_xref "UniProtKB/Swiss-Prot:P27232"; gbkey "CDS"; gene "L1"; product "late protein"; protein_id "CAA52566.1"; exon_number "1"; +X74481.1 EMBL gene 102 548 . + . 
gene_id "E6"; transcript_id ""; gbkey "Gene"; gene "E6"; gene_biotype "protein_coding"; +X74481.1 EMBL CDS 102 545 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P36814"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P36814"; gbkey "CDS"; gene "E6"; product "early protein"; protein_id "CAA52585.1"; exon_number "1"; +X74481.1 EMBL start_codon 102 104 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P36814"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P36814"; gbkey "CDS"; gene "E6"; product "early protein"; protein_id "CAA52585.1"; exon_number "1"; +X74481.1 EMBL stop_codon 546 548 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P36814"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P36814"; gbkey "CDS"; gene "E6"; product "early protein"; protein_id "CAA52585.1"; exon_number "1"; +X74481.1 EMBL gene 553 852 . + . gene_id "E7"; transcript_id ""; gbkey "Gene"; gene "E7"; gene_biotype "protein_coding"; +X74481.1 EMBL CDS 553 849 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P36831"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P36831"; gbkey "CDS"; gene "E7"; product "early protein"; protein_id "CAA52586.1"; exon_number "1"; +X74481.1 EMBL start_codon 553 555 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P36831"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P36831"; gbkey "CDS"; gene "E7"; product "early protein"; protein_id "CAA52586.1"; exon_number "1"; +X74481.1 EMBL stop_codon 850 852 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P36831"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P36831"; gbkey "CDS"; gene "E7"; product "early protein"; protein_id "CAA52586.1"; exon_number "1"; +X74481.1 EMBL gene 864 2807 . + . 
gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "protein_coding"; +X74481.1 EMBL CDS 864 2804 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:P36730"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:P36730"; gbkey "CDS"; gene "E1"; product "early protein"; protein_id "CAA52587.1"; exon_number "1"; +X74481.1 EMBL start_codon 864 866 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:P36730"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:P36730"; gbkey "CDS"; gene "E1"; product "early protein"; protein_id "CAA52587.1"; exon_number "1"; +X74481.1 EMBL stop_codon 2805 2807 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:P36730"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:P36730"; gbkey "CDS"; gene "E1"; product "early protein"; protein_id "CAA52587.1"; exon_number "1"; +X74481.1 EMBL gene 2743 3849 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "protein_coding"; +X74481.1 EMBL CDS 2743 3846 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:P36796"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/Swiss-Prot:P36796"; gbkey "CDS"; gene "E2"; product "early protein"; protein_id "CAA52588.1"; exon_number "1"; +X74481.1 EMBL start_codon 2743 2745 . 
+ 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:P36796"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/Swiss-Prot:P36796"; gbkey "CDS"; gene "E2"; product "early protein"; protein_id "CAA52588.1"; exon_number "1"; +X74481.1 EMBL stop_codon 3847 3849 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:P36796"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/Swiss-Prot:P36796"; gbkey "CDS"; gene "E2"; product "early protein"; protein_id "CAA52588.1"; exon_number "1"; +X74481.1 EMBL gene 4262 5662 . + . gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "protein_coding"; +X74481.1 EMBL CDS 4262 5659 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_5"; db_xref "GOA:P36763"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P36763"; gbkey "CDS"; gene "L2"; product "late protein"; protein_id "CAA52589.1"; exon_number "1"; +X74481.1 EMBL start_codon 4262 4264 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_5"; db_xref "GOA:P36763"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P36763"; gbkey "CDS"; gene "L2"; product "late protein"; protein_id "CAA52589.1"; exon_number "1"; +X74481.1 EMBL stop_codon 5660 5662 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_5"; db_xref "GOA:P36763"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P36763"; gbkey "CDS"; gene "L2"; product "late protein"; protein_id "CAA52589.1"; exon_number "1"; +X74481.1 EMBL gene 5565 7154 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; gene "L1"; gene_biotype "protein_coding"; +X74481.1 EMBL CDS 5565 7151 . 
+ 0 gene_id "L1"; transcript_id "unassigned_transcript_6"; db_xref "GOA:Q05138"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/Swiss-Prot:Q05138"; gbkey "CDS"; gene "L1"; product "late protein"; protein_id "CAA52590.1"; exon_number "1"; +X74481.1 EMBL start_codon 5565 5567 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_6"; db_xref "GOA:Q05138"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/Swiss-Prot:Q05138"; gbkey "CDS"; gene "L1"; product "late protein"; protein_id "CAA52590.1"; exon_number "1"; +X74481.1 EMBL stop_codon 7152 7154 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_6"; db_xref "GOA:Q05138"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/Swiss-Prot:Q05138"; gbkey "CDS"; gene "L1"; product "late protein"; protein_id "CAA52590.1"; exon_number "1"; +X74483.1 EMBL gene 102 566 . + . gene_id "E6"; transcript_id ""; gbkey "Gene"; gene "E6"; gene_biotype "protein_coding"; partial "true"; +X74483.1 EMBL CDS 102 563 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P24836"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P24836"; gbkey "CDS"; gene "E6"; partial "true"; product "envelope protein"; protein_id "CAA52596.1"; exon_number "1"; +X74483.1 EMBL start_codon 102 104 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:P24836"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/Swiss-Prot:P24836"; gbkey "CDS"; gene "E6"; partial "true"; product "envelope protein"; protein_id "CAA52596.1"; exon_number "1"; +X74483.1 EMBL gene 572 889 . + . gene_id "E7"; transcript_id ""; gbkey "Gene"; gene "E7"; gene_biotype "protein_coding"; +X74483.1 EMBL CDS 572 886 . 
+ 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P36833"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P36833"; gbkey "CDS"; gene "E7"; product "envelope protein"; protein_id "CAA52597.1"; exon_number "1"; +X74483.1 EMBL start_codon 572 574 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P36833"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P36833"; gbkey "CDS"; gene "E7"; product "envelope protein"; protein_id "CAA52597.1"; exon_number "1"; +X74483.1 EMBL stop_codon 887 889 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:P36833"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/Swiss-Prot:P36833"; gbkey "CDS"; gene "E7"; product "envelope protein"; protein_id "CAA52597.1"; exon_number "1"; +X74483.1 EMBL gene 2918 3850 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "protein_coding"; +X74483.1 EMBL CDS 2918 3847 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_3"; db_xref "GOA:P36798"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/Swiss-Prot:P36798"; gbkey "CDS"; gene "E2"; product "envelope protein"; protein_id "CAA52598.1"; exon_number "1"; +X74483.1 EMBL start_codon 2918 2920 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_3"; db_xref "GOA:P36798"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/Swiss-Prot:P36798"; gbkey "CDS"; gene "E2"; product "envelope protein"; protein_id "CAA52598.1"; exon_number "1"; +X74483.1 EMBL stop_codon 3848 3850 . 
+ 0 gene_id "E2"; transcript_id "unassigned_transcript_3"; db_xref "GOA:P36798"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/Swiss-Prot:P36798"; gbkey "CDS"; gene "E2"; product "envelope protein"; protein_id "CAA52598.1"; exon_number "1"; +X74483.1 EMBL gene 4222 5616 . + . gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "protein_coding"; +X74483.1 EMBL CDS 4222 5613 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:P36765"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P36765"; gbkey "CDS"; gene "L2"; product "late protein"; protein_id "CAA52599.1"; exon_number "1"; +X74483.1 EMBL start_codon 4222 4224 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:P36765"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P36765"; gbkey "CDS"; gene "L2"; product "late protein"; protein_id "CAA52599.1"; exon_number "1"; +X74483.1 EMBL stop_codon 5614 5616 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:P36765"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/Swiss-Prot:P36765"; gbkey "CDS"; gene "L2"; product "late protein"; protein_id "CAA52599.1"; exon_number "1"; +X74483.1 EMBL gene 5492 7096 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; gene "L1"; gene_biotype "protein_coding"; +X74483.1 EMBL CDS 5492 7093 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_5"; db_xref "GOA:P36743"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/Swiss-Prot:P36743"; gbkey "CDS"; gene "L1"; product "late protein"; protein_id "CAA52600.1"; exon_number "1"; +X74483.1 EMBL start_codon 5492 5494 . 
+ 0 gene_id "L1"; transcript_id "unassigned_transcript_5"; db_xref "GOA:P36743"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/Swiss-Prot:P36743"; gbkey "CDS"; gene "L1"; product "late protein"; protein_id "CAA52600.1"; exon_number "1"; +X74483.1 EMBL stop_codon 7094 7096 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_5"; db_xref "GOA:P36743"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/Swiss-Prot:P36743"; gbkey "CDS"; gene "L1"; product "late protein"; protein_id "CAA52600.1"; exon_number "1"; +X77858.1 EMBL gene 55 537 . + . gene_id "ORF putative E6"; transcript_id ""; gbkey "Gene"; gene "ORF putative E6"; gene_biotype "protein_coding"; +X77858.1 EMBL CDS 55 534 . + 0 gene_id "ORF putative E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:Q81964"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/TrEMBL:Q81964"; gbkey "CDS"; gene "ORF putative E6"; product "CAA54849.1"; protein_id "CAA54849.1"; exon_number "1"; +X77858.1 EMBL start_codon 55 57 . + 0 gene_id "ORF putative E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:Q81964"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/TrEMBL:Q81964"; gbkey "CDS"; gene "ORF putative E6"; product "CAA54849.1"; protein_id "CAA54849.1"; exon_number "1"; +X77858.1 EMBL stop_codon 535 537 . + 0 gene_id "ORF putative E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:Q81964"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/TrEMBL:Q81964"; gbkey "CDS"; gene "ORF putative E6"; product "CAA54849.1"; protein_id "CAA54849.1"; exon_number "1"; +X77858.1 EMBL gene 542 865 . + . gene_id "ORF putative E7"; transcript_id ""; gbkey "Gene"; gene "ORF putative E7"; gene_biotype "protein_coding"; +X77858.1 EMBL CDS 542 862 . 
+ 0 gene_id "ORF putative E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:Q81965"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/TrEMBL:Q81965"; gbkey "CDS"; gene "ORF putative E7"; product "CAA54850.1"; protein_id "CAA54850.1"; exon_number "1"; +X77858.1 EMBL start_codon 542 544 . + 0 gene_id "ORF putative E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:Q81965"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/TrEMBL:Q81965"; gbkey "CDS"; gene "ORF putative E7"; product "CAA54850.1"; protein_id "CAA54850.1"; exon_number "1"; +X77858.1 EMBL stop_codon 863 865 . + 0 gene_id "ORF putative E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:Q81965"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/TrEMBL:Q81965"; gbkey "CDS"; gene "ORF putative E7"; product "CAA54850.1"; protein_id "CAA54850.1"; exon_number "1"; +X77858.1 EMBL gene 872 2806 . + . gene_id "ORF putative E1"; transcript_id ""; gbkey "Gene"; gene "ORF putative E1"; gene_biotype "protein_coding"; +X77858.1 EMBL CDS 872 2803 . + 0 gene_id "ORF putative E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:Q81966"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:Q81966"; gbkey "CDS"; gene "ORF putative E1"; product "CAA54851.1"; protein_id "CAA54851.1"; exon_number "1"; +X77858.1 EMBL start_codon 872 874 . + 0 gene_id "ORF putative E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:Q81966"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:Q81966"; gbkey "CDS"; gene "ORF putative E1"; product "CAA54851.1"; protein_id "CAA54851.1"; exon_number "1"; +X77858.1 EMBL stop_codon 2804 2806 . 
+ 0 gene_id "ORF putative E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:Q81966"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:Q81966"; gbkey "CDS"; gene "ORF putative E1"; product "CAA54851.1"; protein_id "CAA54851.1"; exon_number "1"; +X77858.1 EMBL gene 2736 3848 . + . gene_id "ORF putative E2"; transcript_id ""; gbkey "Gene"; gene "ORF putative E2"; gene_biotype "protein_coding"; +X77858.1 EMBL CDS 2736 3845 . + 0 gene_id "ORF putative E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:Q81967"; db_xref "HSSP:1JJ4"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/TrEMBL:Q81967"; gbkey "CDS"; gene "ORF putative E2"; product "CAA54852.1"; protein_id "CAA54852.1"; exon_number "1"; +X77858.1 EMBL start_codon 2736 2738 . + 0 gene_id "ORF putative E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:Q81967"; db_xref "HSSP:1JJ4"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/TrEMBL:Q81967"; gbkey "CDS"; gene "ORF putative E2"; product "CAA54852.1"; protein_id "CAA54852.1"; exon_number "1"; +X77858.1 EMBL stop_codon 3846 3848 . + 0 gene_id "ORF putative E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:Q81967"; db_xref "HSSP:1JJ4"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/TrEMBL:Q81967"; gbkey "CDS"; gene "ORF putative E2"; product "CAA54852.1"; protein_id "CAA54852.1"; exon_number "1"; +X77858.1 EMBL gene 3268 3615 . + . gene_id "ORF putative E4"; transcript_id ""; gbkey "Gene"; gene "ORF putative E4"; gene_biotype "protein_coding"; +X77858.1 EMBL CDS 3268 3612 . 
+ 0 gene_id "ORF putative E4"; transcript_id "unassigned_transcript_5"; db_xref "InterPro:IPR003861"; db_xref "UniProtKB/TrEMBL:Q76QV7"; gbkey "CDS"; gene "ORF putative E4"; product "CAA54853.1"; protein_id "CAA54853.1"; exon_number "1"; +X77858.1 EMBL start_codon 3268 3270 . + 0 gene_id "ORF putative E4"; transcript_id "unassigned_transcript_5"; db_xref "InterPro:IPR003861"; db_xref "UniProtKB/TrEMBL:Q76QV7"; gbkey "CDS"; gene "ORF putative E4"; product "CAA54853.1"; protein_id "CAA54853.1"; exon_number "1"; +X77858.1 EMBL stop_codon 3613 3615 . + 0 gene_id "ORF putative E4"; transcript_id "unassigned_transcript_5"; db_xref "InterPro:IPR003861"; db_xref "UniProtKB/TrEMBL:Q76QV7"; gbkey "CDS"; gene "ORF putative E4"; product "CAA54853.1"; protein_id "CAA54853.1"; exon_number "1"; +X77858.1 EMBL gene 3908 4129 . + . gene_id "ORF putative E5"; transcript_id ""; gbkey "Gene"; gene "ORF putative E5"; gene_biotype "protein_coding"; +X77858.1 EMBL CDS 3908 4126 . + 0 gene_id "ORF putative E5"; transcript_id "unassigned_transcript_6"; db_xref "InterPro:IPR004270"; db_xref "UniProtKB/TrEMBL:Q81969"; gbkey "CDS"; gene "ORF putative E5"; product "CAA54854.1"; protein_id "CAA54854.1"; exon_number "1"; +X77858.1 EMBL start_codon 3908 3910 . + 0 gene_id "ORF putative E5"; transcript_id "unassigned_transcript_6"; db_xref "InterPro:IPR004270"; db_xref "UniProtKB/TrEMBL:Q81969"; gbkey "CDS"; gene "ORF putative E5"; product "CAA54854.1"; protein_id "CAA54854.1"; exon_number "1"; +X77858.1 EMBL stop_codon 4127 4129 . + 0 gene_id "ORF putative E5"; transcript_id "unassigned_transcript_6"; db_xref "InterPro:IPR004270"; db_xref "UniProtKB/TrEMBL:Q81969"; gbkey "CDS"; gene "ORF putative E5"; product "CAA54854.1"; protein_id "CAA54854.1"; exon_number "1"; +X77858.1 EMBL gene 4231 5625 . + . gene_id "ORF putative L2"; transcript_id ""; gbkey "Gene"; gene "ORF putative L2"; gene_biotype "protein_coding"; +X77858.1 EMBL CDS 4231 5622 . 
+ 0 gene_id "ORF putative L2"; transcript_id "unassigned_transcript_7"; db_xref "GOA:Q81970"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/TrEMBL:Q81970"; gbkey "CDS"; gene "ORF putative L2"; product "CAA54855.1"; protein_id "CAA54855.1"; exon_number "1"; +X77858.1 EMBL start_codon 4231 4233 . + 0 gene_id "ORF putative L2"; transcript_id "unassigned_transcript_7"; db_xref "GOA:Q81970"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/TrEMBL:Q81970"; gbkey "CDS"; gene "ORF putative L2"; product "CAA54855.1"; protein_id "CAA54855.1"; exon_number "1"; +X77858.1 EMBL stop_codon 5623 5625 . + 0 gene_id "ORF putative L2"; transcript_id "unassigned_transcript_7"; db_xref "GOA:Q81970"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/TrEMBL:Q81970"; gbkey "CDS"; gene "ORF putative L2"; product "CAA54855.1"; protein_id "CAA54855.1"; exon_number "1"; +X77858.1 EMBL gene 5606 7132 . + . gene_id "ORF putative L1"; transcript_id ""; gbkey "Gene"; gene "ORF putative L1"; gene_biotype "protein_coding"; +X77858.1 EMBL CDS 5606 7129 . + 0 gene_id "ORF putative L1"; transcript_id "unassigned_transcript_8"; db_xref "GOA:Q81971"; db_xref "HSSP:1DZL"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/TrEMBL:Q81971"; gbkey "CDS"; gene "ORF putative L1"; product "CAA54856.1"; protein_id "CAA54856.1"; exon_number "1"; +X77858.1 EMBL start_codon 5606 5608 . + 0 gene_id "ORF putative L1"; transcript_id "unassigned_transcript_8"; db_xref "GOA:Q81971"; db_xref "HSSP:1DZL"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/TrEMBL:Q81971"; gbkey "CDS"; gene "ORF putative L1"; product "CAA54856.1"; protein_id "CAA54856.1"; exon_number "1"; +X77858.1 EMBL stop_codon 7130 7132 . 
+ 0 gene_id "ORF putative L1"; transcript_id "unassigned_transcript_8"; db_xref "GOA:Q81971"; db_xref "HSSP:1DZL"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/TrEMBL:Q81971"; gbkey "CDS"; gene "ORF putative L1"; product "CAA54856.1"; protein_id "CAA54856.1"; exon_number "1"; +X94165.1 EMBL gene 102 548 . + . gene_id "E6"; transcript_id ""; gbkey "Gene"; gene "E6"; gene_biotype "protein_coding"; +X94165.1 EMBL CDS 102 545 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:Q82005"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/TrEMBL:Q82005"; gbkey "CDS"; gene "E6"; note "early gene, putative"; product "CAA63882.1"; protein_id "CAA63882.1"; exon_number "1"; +X94165.1 EMBL start_codon 102 104 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:Q82005"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/TrEMBL:Q82005"; gbkey "CDS"; gene "E6"; note "early gene, putative"; product "CAA63882.1"; protein_id "CAA63882.1"; exon_number "1"; +X94165.1 EMBL stop_codon 546 548 . + 0 gene_id "E6"; transcript_id "unassigned_transcript_1"; db_xref "GOA:Q82005"; db_xref "InterPro:IPR001334"; db_xref "UniProtKB/TrEMBL:Q82005"; gbkey "CDS"; gene "E6"; note "early gene, putative"; product "CAA63882.1"; protein_id "CAA63882.1"; exon_number "1"; +X94165.1 EMBL gene 550 843 . + . gene_id "E7"; transcript_id ""; gbkey "Gene"; gene "E7"; gene_biotype "protein_coding"; +X94165.1 EMBL CDS 550 840 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:Q82006"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/TrEMBL:Q82006"; gbkey "CDS"; gene "E7"; note "putative"; product "CAA63883.1"; protein_id "CAA63883.1"; exon_number "1"; +X94165.1 EMBL start_codon 550 552 . 
+ 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:Q82006"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/TrEMBL:Q82006"; gbkey "CDS"; gene "E7"; note "putative"; product "CAA63883.1"; protein_id "CAA63883.1"; exon_number "1"; +X94165.1 EMBL stop_codon 841 843 . + 0 gene_id "E7"; transcript_id "unassigned_transcript_2"; db_xref "GOA:Q82006"; db_xref "InterPro:IPR000148"; db_xref "UniProtKB/TrEMBL:Q82006"; gbkey "CDS"; gene "E7"; note "putative"; product "CAA63883.1"; protein_id "CAA63883.1"; exon_number "1"; +X94165.1 EMBL gene 850 2802 . + . gene_id "E1"; transcript_id ""; gbkey "Gene"; gene "E1"; gene_biotype "protein_coding"; +X94165.1 EMBL CDS 850 2799 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:Q82007"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:Q82007"; gbkey "CDS"; gene "E1"; note "putative"; product "CAA63884.1"; protein_id "CAA63884.1"; exon_number "1"; +X94165.1 EMBL start_codon 850 852 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:Q82007"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:Q82007"; gbkey "CDS"; gene "E1"; note "putative"; product "CAA63884.1"; protein_id "CAA63884.1"; exon_number "1"; +X94165.1 EMBL stop_codon 2800 2802 . + 0 gene_id "E1"; transcript_id "unassigned_transcript_3"; db_xref "GOA:Q82007"; db_xref "InterPro:IPR001177"; db_xref "InterPro:IPR014000"; db_xref "InterPro:IPR014015"; db_xref "InterPro:IPR016393"; db_xref "UniProtKB/Swiss-Prot:Q82007"; gbkey "CDS"; gene "E1"; note "putative"; product "CAA63884.1"; protein_id "CAA63884.1"; exon_number "1"; +X94165.1 EMBL gene 2741 3793 . + . gene_id "E2"; transcript_id ""; gbkey "Gene"; gene "E2"; gene_biotype "protein_coding"; +X94165.1 EMBL CDS 2741 3790 . 
+ 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:Q82008"; db_xref "HSSP:1JJ4"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/TrEMBL:Q82008"; gbkey "CDS"; gene "E2"; note "putative"; product "CAA63885.1"; protein_id "CAA63885.1"; exon_number "1"; +X94165.1 EMBL start_codon 2741 2743 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:Q82008"; db_xref "HSSP:1JJ4"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/TrEMBL:Q82008"; gbkey "CDS"; gene "E2"; note "putative"; product "CAA63885.1"; protein_id "CAA63885.1"; exon_number "1"; +X94165.1 EMBL stop_codon 3791 3793 . + 0 gene_id "E2"; transcript_id "unassigned_transcript_4"; db_xref "GOA:Q82008"; db_xref "HSSP:1JJ4"; db_xref "InterPro:IPR000427"; db_xref "InterPro:IPR001866"; db_xref "InterPro:IPR009021"; db_xref "InterPro:IPR012677"; db_xref "UniProtKB/TrEMBL:Q82008"; gbkey "CDS"; gene "E2"; note "putative"; product "CAA63885.1"; protein_id "CAA63885.1"; exon_number "1"; +X94165.1 EMBL gene 3324 3560 . + . gene_id "E4"; transcript_id ""; gbkey "Gene"; gene "E4"; gene_biotype "other"; +X94165.1 EMBL transcript 3324 3560 . + . gene_id "E4"; transcript_id "unassigned_transcript_5"; gbkey "mRNA"; gene "E4"; note "putative"; transcript_biotype "mRNA"; +X94165.1 EMBL exon 3324 3560 . + . gene_id "E4"; transcript_id "unassigned_transcript_5"; gene "E4"; note "putative"; transcript_biotype "mRNA"; exon_number "1"; +X94165.1 EMBL gene 4083 5510 . + . gene_id "L2"; transcript_id ""; gbkey "Gene"; gene "L2"; gene_biotype "protein_coding"; +X94165.1 EMBL CDS 4083 5507 . 
+ 0 gene_id "L2"; transcript_id "unassigned_transcript_6"; db_xref "GOA:Q82009"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/TrEMBL:Q82009"; gbkey "CDS"; gene "L2"; note "late gene, putative"; product "CAA63886.1"; protein_id "CAA63886.1"; exon_number "1"; +X94165.1 EMBL start_codon 4083 4085 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_6"; db_xref "GOA:Q82009"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/TrEMBL:Q82009"; gbkey "CDS"; gene "L2"; note "late gene, putative"; product "CAA63886.1"; protein_id "CAA63886.1"; exon_number "1"; +X94165.1 EMBL stop_codon 5508 5510 . + 0 gene_id "L2"; transcript_id "unassigned_transcript_6"; db_xref "GOA:Q82009"; db_xref "InterPro:IPR000784"; db_xref "UniProtKB/TrEMBL:Q82009"; gbkey "CDS"; gene "L2"; note "late gene, putative"; product "CAA63886.1"; protein_id "CAA63886.1"; exon_number "1"; +X94165.1 EMBL gene 5494 7005 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; gene "L1"; gene_biotype "protein_coding"; +X94165.1 EMBL CDS 5494 7002 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_7"; db_xref "GOA:Q82010"; db_xref "HSSP:1DZL"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/TrEMBL:Q82010"; gbkey "CDS"; gene "L1"; note "putative"; product "CAA63887.1"; protein_id "CAA63887.1"; exon_number "1"; +X94165.1 EMBL start_codon 5494 5496 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_7"; db_xref "GOA:Q82010"; db_xref "HSSP:1DZL"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/TrEMBL:Q82010"; gbkey "CDS"; gene "L1"; note "putative"; product "CAA63887.1"; protein_id "CAA63887.1"; exon_number "1"; +X94165.1 EMBL stop_codon 7003 7005 . 
+ 0 gene_id "L1"; transcript_id "unassigned_transcript_7"; db_xref "GOA:Q82010"; db_xref "HSSP:1DZL"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/TrEMBL:Q82010"; gbkey "CDS"; gene "L1"; note "putative"; product "CAA63887.1"; protein_id "CAA63887.1"; exon_number "1"; diff --git a/tests/test_tools/data/viral.gtf.json b/tests/test_tools/data/viral.gtf.json new file mode 100644 index 00000000..710f2e0d --- /dev/null +++ b/tests/test_tools/data/viral.gtf.json @@ -0,0 +1,2683 @@ +{ + "genes": [ + { + "biotype": "gene", + "chr": "AB027020.1", + "end": 2790, + "name": "E1", + "start": 886, + "strand": "+", + "transcripts": [ + { + "end": 2787, + "name": "AB027020.1_E1_T", + "start": 886, + "translations": [ + { + "biotype": "CDS", + "end": 2787, + "name": "BAA90729.1", + "start": 886 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "J04353.1", + "end": 2751, + "name": "E1", + "start": 862, + "strand": "+", + "transcripts": [ + { + "end": 2748, + "name": "J04353.1_E1_T", + "start": 862, + "translations": [ + { + "biotype": "CDS", + "end": 2748, + "name": "AAA46952.1", + "start": 862 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 2813, + "name": "E1", + "start": 865, + "strand": "+", + "transcripts": [ + { + "end": 2810, + "name": "K02718.1_E1_T", + "start": 865, + "translations": [ + { + "biotype": "CDS", + "end": 2810, + "name": "AAA46936.1", + "start": 865 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M12732.1", + "end": 2813, + "name": "E1", + "start": 879, + "strand": "+", + "transcripts": [ + { + "end": 2810, + "name": "M12732.1_E1_T", + "start": 879, + "translations": [ + { + "biotype": "CDS", + "end": 2810, + "name": "AAA46960.1", + "start": 879 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M62849.1", + "end": 2871, + "name": "E1", + "start": 928, + "strand": "+", + "transcripts": [ + { + "end": 2868, + "name": "M62849.1_E1_T", + "start": 928, + "translations": [ + { + 
"biotype": "CDS", + "end": 2868, + "name": "AAA47052.1", + "start": 928 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "U21941.1", + "end": 2886, + "name": "E1", + "start": 928, + "strand": "+" + }, + { + "biotype": "gene", + "chr": "X05015.1", + "end": 2887, + "name": "E1", + "start": 914, + "strand": "+", + "transcripts": [ + { + "end": 2884, + "name": "X05015.1_E1_T", + "start": 914, + "translations": [ + { + "biotype": "CDS", + "end": 2884, + "name": "CAA28666.1", + "start": 914 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74464.1", + "end": 2734, + "name": "E1", + "start": 917, + "strand": "+", + "transcripts": [ + { + "end": 2731, + "name": "X74464.1_E1_T", + "start": 917, + "translations": [ + { + "biotype": "CDS", + "end": 2731, + "name": "CAA52485.1", + "start": 917 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74474.1", + "end": 2785, + "name": "E1", + "start": 890, + "strand": "+", + "transcripts": [ + { + "end": 2782, + "name": "X74474.1_E1_T", + "start": 890, + "translations": [ + { + "biotype": "CDS", + "end": 2782, + "name": "CAA52545.1", + "start": 890 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74477.1", + "end": 2781, + "name": "E1", + "start": 868, + "strand": "+", + "transcripts": [ + { + "end": 2778, + "name": "X74477.1_E1_T", + "start": 868, + "translations": [ + { + "biotype": "CDS", + "end": 2778, + "name": "CAA52563.1", + "start": 868 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74481.1", + "end": 2807, + "name": "E1", + "start": 864, + "strand": "+", + "transcripts": [ + { + "end": 2804, + "name": "X74481.1_E1_T", + "start": 864, + "translations": [ + { + "biotype": "CDS", + "end": 2804, + "name": "CAA52587.1", + "start": 864 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X94165.1", + "end": 2802, + "name": "E1", + "start": 850, + "strand": "+", + "transcripts": [ + { + "end": 2799, + "name": "X94165.1_E1_T", + "start": 850, + "translations": [ + { + "biotype": "CDS", + 
"end": 2799, + "name": "CAA63884.1", + "start": 850 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "AB027020.1", + "end": 3838, + "name": "E2", + "start": 2732, + "strand": "+", + "transcripts": [ + { + "end": 3835, + "name": "AB027020.1_E2_T", + "start": 2732, + "translations": [ + { + "biotype": "CDS", + "end": 3835, + "name": "BAA90730.1", + "start": 2732 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "J04353.1", + "end": 3811, + "name": "E2", + "start": 2693, + "strand": "+", + "transcripts": [ + { + "end": 3808, + "name": "J04353.1_E2_T", + "start": 2693, + "translations": [ + { + "biotype": "CDS", + "end": 3808, + "name": "AAA46953.1", + "start": 2693 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 3852, + "name": "E2", + "start": 2755, + "strand": "+", + "transcripts": [ + { + "end": 3849, + "name": "K02718.1_E2_T", + "start": 2755, + "translations": [ + { + "biotype": "CDS", + "end": 3849, + "name": "AAA46941.1", + "start": 2755 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M12732.1", + "end": 3810, + "name": "E2", + "start": 2749, + "strand": "+", + "transcripts": [ + { + "end": 3807, + "name": "M12732.1_E2_T", + "start": 2749, + "translations": [ + { + "biotype": "CDS", + "end": 3807, + "name": "AAA46961.1", + "start": 2749 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M62849.1", + "end": 3910, + "name": "E2", + "start": 2798, + "strand": "+", + "transcripts": [ + { + "end": 3907, + "name": "M62849.1_E2_T", + "start": 2798, + "translations": [ + { + "biotype": "CDS", + "end": 3907, + "name": "AAA47053.1", + "start": 2798 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "U21941.1", + "end": 3895, + "name": "E2", + "start": 2813, + "strand": "+" + }, + { + "biotype": "gene", + "chr": "X05015.1", + "end": 3914, + "name": "E2", + "start": 2817, + "strand": "+", + "transcripts": [ + { + "end": 3911, + "name": "X05015.1_E2_T", + "start": 2817, + "translations": [ + { + "biotype": "CDS", + 
"end": 3911, + "name": "CAA28667.1", + "start": 2817 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74464.1", + "end": 4061, + "name": "E2", + "start": 2676, + "strand": "+", + "transcripts": [ + { + "end": 4058, + "name": "X74464.1_E2_T", + "start": 2676, + "translations": [ + { + "biotype": "CDS", + "end": 4058, + "name": "CAA52486.1", + "start": 2676 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74474.1", + "end": 3863, + "name": "E2", + "start": 2727, + "strand": "+", + "transcripts": [ + { + "end": 3860, + "name": "X74474.1_E2_T", + "start": 2727, + "translations": [ + { + "biotype": "CDS", + "end": 3860, + "name": "CAA52546.1", + "start": 2727 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74477.1", + "end": 2717, + "name": "E2", + "start": 2714, + "strand": "+", + "transcripts": [ + { + "end": 2714, + "name": "X74477.1_E2_T", + "start": 2714, + "translations": [ + { + "biotype": "CDS", + "end": 2714, + "name": "CAA52564.1", + "start": 2714 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74481.1", + "end": 3849, + "name": "E2", + "start": 2743, + "strand": "+", + "transcripts": [ + { + "end": 3846, + "name": "X74481.1_E2_T", + "start": 2743, + "translations": [ + { + "biotype": "CDS", + "end": 3846, + "name": "CAA52588.1", + "start": 2743 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74483.1", + "end": 3850, + "name": "E2", + "start": 2918, + "strand": "+", + "transcripts": [ + { + "end": 3847, + "name": "X74483.1_E2_T", + "start": 2918, + "translations": [ + { + "biotype": "CDS", + "end": 3847, + "name": "CAA52598.1", + "start": 2918 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X94165.1", + "end": 3793, + "name": "E2", + "start": 2741, + "strand": "+", + "transcripts": [ + { + "end": 3790, + "name": "X94165.1_E2_T", + "start": 2741, + "translations": [ + { + "biotype": "CDS", + "end": 3790, + "name": "CAA63885.1", + "start": 2741 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "AB027020.1", 
+ "end": 3614, + "name": "E4", + "start": 3309, + "strand": "+", + "transcripts": [ + { + "end": 3611, + "name": "AB027020.1_E4_T", + "start": 3309, + "translations": [ + { + "biotype": "CDS", + "end": 3611, + "name": "BAA90731.1", + "start": 3309 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "J04353.1", + "end": 3578, + "name": "E4", + "start": 3270, + "strand": "+", + "transcripts": [ + { + "end": 3575, + "name": "J04353.1_E4_T", + "start": 3270, + "translations": [ + { + "biotype": "CDS", + "end": 3575, + "name": "AAA46949.1", + "start": 3270 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 3619, + "name": "E4", + "start": 3332, + "strand": "+", + "transcripts": [ + { + "end": 3616, + "name": "K02718.1_E4_T", + "start": 3332, + "translations": [ + { + "biotype": "CDS", + "end": 3616, + "name": "AAA46937.1", + "start": 3332 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M12732.1", + "end": 3577, + "name": "E4", + "start": 3326, + "strand": "+", + "transcripts": [ + { + "end": 3574, + "name": "M12732.1_E4_T", + "start": 3326, + "translations": [ + { + "biotype": "CDS", + "end": 3574, + "name": "AAA46957.1", + "start": 3326 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M62849.1", + "end": 3677, + "name": "E4", + "start": 3393, + "strand": "+", + "transcripts": [ + { + "end": 3674, + "name": "M62849.1_E4_T", + "start": 3393, + "translations": [ + { + "biotype": "CDS", + "end": 3674, + "name": "AAA47049.1", + "start": 3393 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "U21941.1", + "end": 3662, + "name": "E4", + "start": 3408, + "strand": "+", + "transcripts": [ + { + "end": 3659, + "name": "U21941.1_E4_T", + "start": 3408, + "translations": [ + { + "biotype": "CDS", + "end": 3659, + "name": "AAC54854.1", + "start": 3408 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X05015.1", + "end": 3684, + "name": "E4", + "start": 3418, + "strand": "+", + "transcripts": [ + { + "end": 3681, + "name": 
"X05015.1_E4_T", + "start": 3418, + "translations": [ + { + "biotype": "CDS", + "end": 3681, + "name": "CAA28668.1", + "start": 3418 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X94165.1", + "end": 3560, + "name": "E4", + "start": 3324, + "strand": "+", + "transcripts": [ + { + "biotype": "transcript", + "end": 3560, + "name": "X94165.1_unassigned_transcript_5", + "start": 3324 + } + ] + }, + { + "biotype": "gene", + "chr": "AB027020.1", + "end": 4142, + "name": "E5", + "start": 3846, + "strand": "+", + "transcripts": [ + { + "end": 4139, + "name": "AB027020.1_E5_T", + "start": 3846, + "translations": [ + { + "biotype": "CDS", + "end": 4139, + "name": "BAA90732.1", + "start": 3846 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "J04353.1", + "end": 4070, + "name": "E5", + "start": 3816, + "strand": "+", + "transcripts": [ + { + "end": 4067, + "name": "J04353.1_E5_T", + "start": 3816, + "translations": [ + { + "biotype": "CDS", + "end": 4067, + "name": "AAA46954.1", + "start": 3816 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 4099, + "name": "E5", + "start": 3863, + "strand": "+", + "transcripts": [ + { + "end": 4096, + "name": "K02718.1_E5_T", + "start": 3863, + "translations": [ + { + "biotype": "CDS", + "end": 4096, + "name": "AAA46938.1", + "start": 3863 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M12732.1", + "end": 4081, + "name": "E5", + "start": 3854, + "strand": "+", + "transcripts": [ + { + "end": 4078, + "name": "M12732.1_E5_T", + "start": 3854, + "translations": [ + { + "biotype": "CDS", + "end": 4078, + "name": "AAA46962.1", + "start": 3854 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M62849.1", + "end": 4176, + "name": "E5", + "start": 3958, + "strand": "+", + "transcripts": [ + { + "end": 4173, + "name": "M62849.1_E5_T", + "start": 3958, + "translations": [ + { + "biotype": "CDS", + "end": 4173, + "name": "AAA47054.1", + "start": 3958 + } + ] + } + ] + }, + { + "biotype": "gene", 
+ "chr": "U21941.1", + "end": 4145, + "name": "E5", + "start": 3909, + "strand": "+", + "transcripts": [ + { + "end": 4142, + "name": "U21941.1_E5_T", + "start": 3909, + "translations": [ + { + "biotype": "CDS", + "end": 4142, + "name": "AAC54855.1", + "start": 3909 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X05015.1", + "end": 4157, + "name": "E5", + "start": 3936, + "strand": "+", + "transcripts": [ + { + "end": 4154, + "name": "X05015.1_E5_T", + "start": 3936, + "translations": [ + { + "biotype": "CDS", + "end": 4154, + "name": "CAA28669.1", + "start": 3936 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "AB027020.1", + "end": 557, + "name": "E6", + "start": 102, + "strand": "+", + "transcripts": [ + { + "end": 554, + "name": "AB027020.1_E6_T", + "start": 102, + "translations": [ + { + "biotype": "CDS", + "end": 554, + "name": "BAA90727.1", + "start": 102 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "J04353.1", + "end": 557, + "name": "E6", + "start": 108, + "strand": "+", + "transcripts": [ + { + "end": 554, + "name": "J04353.1_E6_T", + "start": 108, + "translations": [ + { + "biotype": "CDS", + "end": 554, + "name": "AAA46950.1", + "start": 108 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 559, + "name": "E6", + "start": 83, + "strand": "+", + "transcripts": [ + { + "end": 556, + "name": "K02718.1_E6_T", + "start": 83, + "translations": [ + { + "biotype": "CDS", + "end": 556, + "name": "AAA46939.1", + "start": 83 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M12732.1", + "end": 558, + "name": "E6", + "start": 109, + "strand": "+", + "transcripts": [ + { + "end": 555, + "name": "M12732.1_E6_T", + "start": 109, + "translations": [ + { + "biotype": "CDS", + "end": 555, + "name": "AAA46958.1", + "start": 109 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M62849.1", + "end": 583, + "name": "E6", + "start": 107, + "strand": "+", + "transcripts": [ + { + "end": 580, + "name": 
"M62849.1_E6_T", + "start": 107, + "translations": [ + { + "biotype": "CDS", + "end": 580, + "name": "AAA47050.1", + "start": 107 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "U21941.1", + "end": 583, + "name": "E6", + "start": 107, + "strand": "+", + "transcripts": [ + { + "end": 580, + "name": "U21941.1_E6_T", + "start": 107, + "translations": [ + { + "biotype": "CDS", + "end": 580, + "name": "AAC54850.1", + "start": 107 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X05015.1", + "end": 581, + "name": "E6", + "start": 105, + "strand": "+", + "transcripts": [ + { + "end": 578, + "name": "X05015.1_E6_T", + "start": 105, + "translations": [ + { + "biotype": "CDS", + "end": 578, + "name": "CAA28664.1", + "start": 105 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74464.1", + "end": 646, + "name": "E6", + "start": 200, + "strand": "+", + "transcripts": [ + { + "end": 643, + "name": "X74464.1_E6_T", + "start": 200, + "translations": [ + { + "biotype": "CDS", + "end": 643, + "name": "CAA52482.1", + "start": 200 + }, + { + "biotype": "CDS", + "end": 643, + "name": "CAA52483.1", + "start": 221 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74474.1", + "end": 563, + "name": "E6", + "start": 102, + "strand": "+", + "transcripts": [ + { + "end": 560, + "name": "X74474.1_E6_T", + "start": 102, + "translations": [ + { + "biotype": "CDS", + "end": 560, + "name": "CAA52543.1", + "start": 102 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74477.1", + "end": 559, + "name": "E6", + "start": 110, + "strand": "+", + "transcripts": [ + { + "end": 556, + "name": "X74477.1_E6_T", + "start": 110, + "translations": [ + { + "biotype": "CDS", + "end": 556, + "name": "CAA52561.1", + "start": 110 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74481.1", + "end": 548, + "name": "E6", + "start": 102, + "strand": "+", + "transcripts": [ + { + "end": 545, + "name": "X74481.1_E6_T", + "start": 102, + "translations": [ + { + "biotype": 
"CDS", + "end": 545, + "name": "CAA52585.1", + "start": 102 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74483.1", + "end": 566, + "name": "E6", + "start": 102, + "strand": "+", + "transcripts": [ + { + "end": 563, + "name": "X74483.1_E6_T", + "start": 102, + "translations": [ + { + "biotype": "CDS", + "end": 563, + "name": "CAA52596.1", + "start": 102 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X94165.1", + "end": 548, + "name": "E6", + "start": 102, + "strand": "+", + "transcripts": [ + { + "end": 545, + "name": "X94165.1_E6_T", + "start": 102, + "translations": [ + { + "biotype": "CDS", + "end": 545, + "name": "CAA63882.1", + "start": 102 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "AB027020.1", + "end": 878, + "name": "E7", + "start": 564, + "strand": "+", + "transcripts": [ + { + "end": 875, + "name": "AB027020.1_E7_T", + "start": 564, + "translations": [ + { + "biotype": "CDS", + "end": 875, + "name": "BAA90728.1", + "start": 564 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "J04353.1", + "end": 856, + "name": "E7", + "start": 560, + "strand": "+", + "transcripts": [ + { + "end": 853, + "name": "J04353.1_E7_T", + "start": 560, + "translations": [ + { + "biotype": "CDS", + "end": 853, + "name": "AAA46951.1", + "start": 560 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 858, + "name": "E7", + "start": 562, + "strand": "+", + "transcripts": [ + { + "end": 855, + "name": "K02718.1_E7_T", + "start": 562, + "translations": [ + { + "biotype": "CDS", + "end": 855, + "name": "AAA46940.1", + "start": 562 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M12732.1", + "end": 866, + "name": "E7", + "start": 573, + "strand": "+", + "transcripts": [ + { + "end": 863, + "name": "M12732.1_E7_T", + "start": 573, + "translations": [ + { + "biotype": "CDS", + "end": 863, + "name": "AAA46959.1", + "start": 573 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M62849.1", + "end": 921, + "name": 
"E7", + "start": 592, + "strand": "+", + "transcripts": [ + { + "end": 918, + "name": "M62849.1_E7_T", + "start": 592, + "translations": [ + { + "biotype": "CDS", + "end": 918, + "name": "AAA47051.1", + "start": 592 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "U21941.1", + "end": 921, + "name": "E7", + "start": 592, + "strand": "+", + "transcripts": [ + { + "end": 918, + "name": "U21941.1_E7_T", + "start": 592, + "translations": [ + { + "biotype": "CDS", + "end": 918, + "name": "AAC54851.1", + "start": 592 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X05015.1", + "end": 907, + "name": "E7", + "start": 590, + "strand": "+", + "transcripts": [ + { + "end": 904, + "name": "X05015.1_E7_T", + "start": 590, + "translations": [ + { + "biotype": "CDS", + "end": 904, + "name": "CAA28665.1", + "start": 590 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74464.1", + "end": 924, + "name": "E7", + "start": 643, + "strand": "+", + "transcripts": [ + { + "end": 921, + "name": "X74464.1_E7_T", + "start": 643, + "translations": [ + { + "biotype": "CDS", + "end": 921, + "name": "CAA52484.1", + "start": 643 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74474.1", + "end": 883, + "name": "E7", + "start": 566, + "strand": "+", + "transcripts": [ + { + "end": 880, + "name": "X74474.1_E7_T", + "start": 566, + "translations": [ + { + "biotype": "CDS", + "end": 880, + "name": "CAA52544.1", + "start": 566 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74477.1", + "end": 861, + "name": "E7", + "start": 562, + "strand": "+", + "transcripts": [ + { + "end": 858, + "name": "X74477.1_E7_T", + "start": 562, + "translations": [ + { + "biotype": "CDS", + "end": 858, + "name": "CAA52562.1", + "start": 562 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74481.1", + "end": 852, + "name": "E7", + "start": 553, + "strand": "+", + "transcripts": [ + { + "end": 849, + "name": "X74481.1_E7_T", + "start": 553, + "translations": [ + { + "biotype": 
"CDS", + "end": 849, + "name": "CAA52586.1", + "start": 553 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74483.1", + "end": 889, + "name": "E7", + "start": 572, + "strand": "+", + "transcripts": [ + { + "end": 886, + "name": "X74483.1_E7_T", + "start": 572, + "translations": [ + { + "biotype": "CDS", + "end": 886, + "name": "CAA52597.1", + "start": 572 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X94165.1", + "end": 843, + "name": "E7", + "start": 550, + "strand": "+", + "transcripts": [ + { + "end": 840, + "name": "X94165.1_E7_T", + "start": 550, + "translations": [ + { + "biotype": "CDS", + "end": 840, + "name": "CAA63883.1", + "start": 550 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "AB027020.1", + "end": 7064, + "name": "L1", + "start": 5541, + "strand": "+", + "transcripts": [ + { + "end": 7061, + "name": "AB027020.1_L1_T", + "start": 5541, + "translations": [ + { + "biotype": "CDS", + "end": 7061, + "name": "BAA90734.1", + "start": 5541 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "J04353.1", + "end": 7066, + "name": "L1", + "start": 5552, + "strand": "+", + "transcripts": [ + { + "end": 7063, + "name": "J04353.1_L1_T", + "start": 5552, + "translations": [ + { + "biotype": "CDS", + "end": 7063, + "name": "AAA46956.1", + "start": 5552 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 7154, + "name": "L1", + "start": 5559, + "strand": "+", + "transcripts": [ + { + "end": 7151, + "name": "K02718.1_L1_T", + "start": 5559, + "translations": [ + { + "biotype": "CDS", + "end": 7151, + "name": "AAA46943.1", + "start": 5559 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M12732.1", + "end": 7093, + "name": "L1", + "start": 5594, + "strand": "+", + "transcripts": [ + { + "end": 7090, + "name": "M12732.1_L1_T", + "start": 5594, + "translations": [ + { + "biotype": "CDS", + "end": 7090, + "name": "AAA46964.1", + "start": 5594 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M62849.1", + 
"end": 7160, + "name": "L1", + "start": 5643, + "strand": "+", + "transcripts": [ + { + "end": 7157, + "name": "M62849.1_L1_T", + "start": 5643, + "translations": [ + { + "biotype": "CDS", + "end": 7157, + "name": "AAA47056.1", + "start": 5643 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "U21941.1", + "end": 7104, + "name": "L1", + "start": 5590, + "strand": "+" + }, + { + "biotype": "gene", + "chr": "X05015.1", + "end": 7136, + "name": "L1", + "start": 5430, + "strand": "+", + "transcripts": [ + { + "end": 7133, + "name": "X05015.1_L1_T", + "start": 5430, + "translations": [ + { + "biotype": "CDS", + "end": 7133, + "name": "CAA28671.1", + "start": 5430 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74464.1", + "end": 7268, + "name": "L1", + "start": 5745, + "strand": "+", + "transcripts": [ + { + "end": 7265, + "name": "X74464.1_L1_T", + "start": 5745, + "translations": [ + { + "biotype": "CDS", + "end": 7265, + "name": "CAA52488.1", + "start": 5745 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74474.1", + "end": 7157, + "name": "L1", + "start": 5631, + "strand": "+", + "transcripts": [ + { + "end": 7154, + "name": "X74474.1_L1_T", + "start": 5631, + "translations": [ + { + "biotype": "CDS", + "end": 7154, + "name": "CAA52548.1", + "start": 5631 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74477.1", + "end": 7109, + "name": "L1", + "start": 5601, + "strand": "+", + "transcripts": [ + { + "end": 7106, + "name": "X74477.1_L1_T", + "start": 5601, + "translations": [ + { + "biotype": "CDS", + "end": 7106, + "name": "CAA52566.1", + "start": 5601 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74481.1", + "end": 7154, + "name": "L1", + "start": 5565, + "strand": "+", + "transcripts": [ + { + "end": 7151, + "name": "X74481.1_L1_T", + "start": 5565, + "translations": [ + { + "biotype": "CDS", + "end": 7151, + "name": "CAA52590.1", + "start": 5565 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74483.1", + "end": 
7096, + "name": "L1", + "start": 5492, + "strand": "+", + "transcripts": [ + { + "end": 7093, + "name": "X74483.1_L1_T", + "start": 5492, + "translations": [ + { + "biotype": "CDS", + "end": 7093, + "name": "CAA52600.1", + "start": 5492 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X94165.1", + "end": 7005, + "name": "L1", + "start": 5494, + "strand": "+", + "transcripts": [ + { + "end": 7002, + "name": "X94165.1_L1_T", + "start": 5494, + "translations": [ + { + "biotype": "CDS", + "end": 7002, + "name": "CAA63887.1", + "start": 5494 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "AB027020.1", + "end": 5560, + "name": "L2", + "start": 4157, + "strand": "+", + "transcripts": [ + { + "end": 5557, + "name": "AB027020.1_L2_T", + "start": 4157, + "translations": [ + { + "biotype": "CDS", + "end": 5557, + "name": "BAA90733.1", + "start": 4157 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "J04353.1", + "end": 5571, + "name": "L2", + "start": 4171, + "strand": "+", + "transcripts": [ + { + "end": 5568, + "name": "J04353.1_L2_T", + "start": 4171, + "translations": [ + { + "biotype": "CDS", + "end": 5568, + "name": "AAA46955.1", + "start": 4171 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "K02718.1", + "end": 5656, + "name": "L2", + "start": 4235, + "strand": "+", + "transcripts": [ + { + "end": 5653, + "name": "K02718.1_L2_T", + "start": 4235, + "translations": [ + { + "biotype": "CDS", + "end": 5653, + "name": "AAA46942.1", + "start": 4235 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M12732.1", + "end": 5613, + "name": "L2", + "start": 4210, + "strand": "+", + "transcripts": [ + { + "end": 5610, + "name": "M12732.1_L2_T", + "start": 4210, + "translations": [ + { + "biotype": "CDS", + "end": 5610, + "name": "AAA46963.1", + "start": 4210 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "M62849.1", + "end": 5662, + "name": "L2", + "start": 4250, + "strand": "+", + "transcripts": [ + { + "end": 5659, + "name": 
"M62849.1_L2_T", + "start": 4250, + "translations": [ + { + "biotype": "CDS", + "end": 5659, + "name": "AAA47055.1", + "start": 4250 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "U21941.1", + "end": 5609, + "name": "L2", + "start": 4209, + "strand": "+" + }, + { + "biotype": "gene", + "chr": "X05015.1", + "end": 5632, + "name": "L2", + "start": 4244, + "strand": "+", + "transcripts": [ + { + "end": 5629, + "name": "X05015.1_L2_T", + "start": 4244, + "translations": [ + { + "biotype": "CDS", + "end": 5629, + "name": "CAA28670.1", + "start": 4244 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74464.1", + "end": 5730, + "name": "L2", + "start": 4129, + "strand": "+", + "transcripts": [ + { + "end": 5727, + "name": "X74464.1_L2_T", + "start": 4129, + "translations": [ + { + "biotype": "CDS", + "end": 5727, + "name": "CAA52487.1", + "start": 4129 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74474.1", + "end": 5671, + "name": "L2", + "start": 4280, + "strand": "+", + "transcripts": [ + { + "end": 5668, + "name": "X74474.1_L2_T", + "start": 4280, + "translations": [ + { + "biotype": "CDS", + "end": 5668, + "name": "CAA52547.1", + "start": 4280 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74477.1", + "end": 5620, + "name": "L2", + "start": 4211, + "strand": "+", + "transcripts": [ + { + "end": 5617, + "name": "X74477.1_L2_T", + "start": 4211, + "translations": [ + { + "biotype": "CDS", + "end": 5617, + "name": "CAA52565.1", + "start": 4211 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74481.1", + "end": 5662, + "name": "L2", + "start": 4262, + "strand": "+", + "transcripts": [ + { + "end": 5659, + "name": "X74481.1_L2_T", + "start": 4262, + "translations": [ + { + "biotype": "CDS", + "end": 5659, + "name": "CAA52589.1", + "start": 4262 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X74483.1", + "end": 5616, + "name": "L2", + "start": 4222, + "strand": "+", + "transcripts": [ + { + "end": 5613, + "name": 
"X74483.1_L2_T", + "start": 4222, + "translations": [ + { + "biotype": "CDS", + "end": 5613, + "name": "CAA52599.1", + "start": 4222 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X94165.1", + "end": 5510, + "name": "L2", + "start": 4083, + "strand": "+", + "transcripts": [ + { + "end": 5507, + "name": "X94165.1_L2_T", + "start": 4083, + "translations": [ + { + "biotype": "CDS", + "end": 5507, + "name": "CAA63886.1", + "start": 4083 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X77858.1", + "end": 2806, + "name": "ORF putative E1", + "start": 872, + "strand": "+", + "transcripts": [ + { + "end": 2803, + "name": "X77858.1_ORF putative E1_T", + "start": 872, + "translations": [ + { + "biotype": "CDS", + "end": 2803, + "name": "CAA54851.1", + "start": 872 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X77858.1", + "end": 3848, + "name": "ORF putative E2", + "start": 2736, + "strand": "+", + "transcripts": [ + { + "end": 3845, + "name": "X77858.1_ORF putative E2_T", + "start": 2736, + "translations": [ + { + "biotype": "CDS", + "end": 3845, + "name": "CAA54852.1", + "start": 2736 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X77858.1", + "end": 3615, + "name": "ORF putative E4", + "start": 3268, + "strand": "+", + "transcripts": [ + { + "end": 3612, + "name": "X77858.1_ORF putative E4_T", + "start": 3268, + "translations": [ + { + "biotype": "CDS", + "end": 3612, + "name": "CAA54853.1", + "start": 3268 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X77858.1", + "end": 4129, + "name": "ORF putative E5", + "start": 3908, + "strand": "+", + "transcripts": [ + { + "end": 4126, + "name": "X77858.1_ORF putative E5_T", + "start": 3908, + "translations": [ + { + "biotype": "CDS", + "end": 4126, + "name": "CAA54854.1", + "start": 3908 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X77858.1", + "end": 537, + "name": "ORF putative E6", + "start": 55, + "strand": "+", + "transcripts": [ + { + "end": 534, + "name": 
"X77858.1_ORF putative E6_T", + "start": 55, + "translations": [ + { + "biotype": "CDS", + "end": 534, + "name": "CAA54849.1", + "start": 55 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X77858.1", + "end": 865, + "name": "ORF putative E7", + "start": 542, + "strand": "+", + "transcripts": [ + { + "end": 862, + "name": "X77858.1_ORF putative E7_T", + "start": 542, + "translations": [ + { + "biotype": "CDS", + "end": 862, + "name": "CAA54850.1", + "start": 542 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X77858.1", + "end": 7132, + "name": "ORF putative L1", + "start": 5606, + "strand": "+", + "transcripts": [ + { + "end": 7129, + "name": "X77858.1_ORF putative L1_T", + "start": 5606, + "translations": [ + { + "biotype": "CDS", + "end": 7129, + "name": "CAA54856.1", + "start": 5606 + } + ] + } + ] + }, + { + "biotype": "gene", + "chr": "X77858.1", + "end": 5625, + "name": "ORF putative L2", + "start": 4231, + "strand": "+", + "transcripts": [ + { + "end": 5622, + "name": "X77858.1_ORF putative L2_T", + "start": 4231, + "translations": [ + { + "biotype": "CDS", + "end": 5622, + "name": "CAA54855.1", + "start": 4231 + } + ] + } + ] + }, + { + "chr": "U21941.1", + "end": 2883, + "name": "G_AAC54852.1", + "start": 928, + "strand": "+", + "transcripts": [ + { + "end": 2883, + "name": "U21941.1_G_AAC54852.1_T", + "start": 928, + "translations": [ + { + "biotype": "CDS", + "end": 2883, + "name": "AAC54852.1", + "start": 928 + } + ] + } + ] + }, + { + "chr": "U21941.1", + "end": 3892, + "name": "G_AAC54853.1", + "start": 2813, + "strand": "+", + "transcripts": [ + { + "end": 3892, + "name": "U21941.1_G_AAC54853.1_T", + "start": 2813, + "translations": [ + { + "biotype": "CDS", + "end": 3892, + "name": "AAC54853.1", + "start": 2813 + } + ] + } + ] + }, + { + "chr": "U21941.1", + "end": 5606, + "name": "G_AAC54856.1", + "start": 4209, + "strand": "+", + "transcripts": [ + { + "end": 5606, + "name": "U21941.1_G_AAC54856.1_T", + "start": 4209, + 
"translations": [ + { + "biotype": "CDS", + "end": 5606, + "name": "AAC54856.1", + "start": 4209 + } + ] + } + ] + }, + { + "chr": "U21941.1", + "end": 7101, + "name": "G_AAC54857.1", + "start": 5590, + "strand": "+", + "transcripts": [ + { + "end": 7101, + "name": "U21941.1_G_AAC54857.1_T", + "start": 5590, + "translations": [ + { + "biotype": "CDS", + "end": 7101, + "name": "AAC54857.1", + "start": 5590 + } + ] + } + ] + }, + { + "chr": "DQ080079.1", + "end": 474, + "name": "G_AAZ39491.1", + "start": 1, + "strand": "+", + "transcripts": [ + { + "end": 474, + "name": "DQ080079.1_G_AAZ39491.1_T", + "start": 1, + "translations": [ + { + "biotype": "CDS", + "end": 474, + "name": "AAZ39491.1", + "start": 1 + } + ] + } + ] + }, + { + "chr": "DQ080079.1", + "end": 813, + "name": "G_AAZ39492.1", + "start": 484, + "strand": "+", + "transcripts": [ + { + "end": 813, + "name": "DQ080079.1_G_AAZ39492.1_T", + "start": 484, + "translations": [ + { + "biotype": "CDS", + "end": 813, + "name": "AAZ39492.1", + "start": 484 + } + ] + } + ] + }, + { + "chr": "DQ080079.1", + "end": 2742, + "name": "G_AAZ39493.1", + "start": 823, + "strand": "+", + "transcripts": [ + { + "end": 2742, + "name": "DQ080079.1_G_AAZ39493.1_T", + "start": 823, + "translations": [ + { + "biotype": "CDS", + "end": 2742, + "name": "AAZ39493.1", + "start": 823 + } + ] + } + ] + }, + { + "chr": "DQ080079.1", + "end": 3781, + "name": "G_AAZ39494.1", + "start": 2672, + "strand": "+", + "transcripts": [ + { + "end": 3781, + "name": "DQ080079.1_G_AAZ39494.1_T", + "start": 2672, + "translations": [ + { + "biotype": "CDS", + "end": 3781, + "name": "AAZ39494.1", + "start": 2672 + } + ] + } + ] + }, + { + "chr": "DQ080079.1", + "end": 3548, + "name": "G_AAZ39495.1", + "start": 3267, + "strand": "+", + "transcripts": [ + { + "end": 3548, + "name": "DQ080079.1_G_AAZ39495.1_T", + "start": 3267, + "translations": [ + { + "biotype": "CDS", + "end": 3548, + "name": "AAZ39495.1", + "start": 3267 + } + ] + } + ] + }, + { + 
"chr": "DQ080079.1", + "end": 4048, + "name": "G_AAZ39496.1", + "start": 3830, + "strand": "+", + "transcripts": [ + { + "end": 4048, + "name": "DQ080079.1_G_AAZ39496.1_T", + "start": 3830, + "translations": [ + { + "biotype": "CDS", + "end": 4048, + "name": "AAZ39496.1", + "start": 3830 + } + ] + } + ] + }, + { + "chr": "DQ080079.1", + "end": 5504, + "name": "G_AAZ39497.1", + "start": 4098, + "strand": "+", + "transcripts": [ + { + "end": 5504, + "name": "DQ080079.1_G_AAZ39497.1_T", + "start": 4098, + "translations": [ + { + "biotype": "CDS", + "end": 5504, + "name": "AAZ39497.1", + "start": 4098 + } + ] + } + ] + }, + { + "chr": "DQ080079.1", + "end": 7002, + "name": "G_AAZ39498.1", + "start": 5488, + "strand": "+", + "transcripts": [ + { + "end": 7002, + "name": "DQ080079.1_G_AAZ39498.1_T", + "start": 5488, + "translations": [ + { + "biotype": "CDS", + "end": 7002, + "name": "AAZ39498.1", + "start": 5488 + } + ] + } + ] + }, + { + "chr": "D90400.1", + "end": 3602, + "name": "G_BAA14396.1", + "start": 3330, + "strand": "+", + "transcripts": [ + { + "end": 3602, + "name": "D90400.1_G_BAA14396.1_T", + "start": 3330, + "translations": [ + { + "biotype": "CDS", + "end": 3602, + "name": "BAA14396.1", + "start": 3330 + } + ] + } + ] + }, + { + "chr": "D90400.1", + "end": 556, + "name": "G_BAA31845.1", + "start": 110, + "strand": "+", + "transcripts": [ + { + "end": 556, + "name": "D90400.1_G_BAA31845.1_T", + "start": 110, + "translations": [ + { + "biotype": "CDS", + "end": 556, + "name": "BAA31845.1", + "start": 110 + } + ] + } + ] + }, + { + "chr": "D90400.1", + "end": 867, + "name": "G_BAA31846.1", + "start": 574, + "strand": "+", + "transcripts": [ + { + "end": 867, + "name": "D90400.1_G_BAA31846.1_T", + "start": 574, + "translations": [ + { + "biotype": "CDS", + "end": 867, + "name": "BAA31846.1", + "start": 574 + } + ] + } + ] + }, + { + "chr": "D90400.1", + "end": 2814, + "name": "G_BAA31847.1", + "start": 883, + "strand": "+", + "transcripts": [ + { + "end": 
2814, + "name": "D90400.1_G_BAA31847.1_T", + "start": 883, + "translations": [ + { + "biotype": "CDS", + "end": 2814, + "name": "BAA31847.1", + "start": 883 + } + ] + } + ] + }, + { + "chr": "D90400.1", + "end": 3826, + "name": "G_BAA31848.1", + "start": 2753, + "strand": "+", + "transcripts": [ + { + "end": 3826, + "name": "D90400.1_G_BAA31848.1_T", + "start": 2753, + "translations": [ + { + "biotype": "CDS", + "end": 3826, + "name": "BAA31848.1", + "start": 2753 + } + ] + } + ] + }, + { + "chr": "D90400.1", + "end": 4119, + "name": "G_BAA31849.1", + "start": 3892, + "strand": "+", + "transcripts": [ + { + "end": 4119, + "name": "D90400.1_G_BAA31849.1_T", + "start": 3892, + "translations": [ + { + "biotype": "CDS", + "end": 4119, + "name": "BAA31849.1", + "start": 3892 + } + ] + } + ] + }, + { + "chr": "D90400.1", + "end": 5659, + "name": "G_BAA31850.1", + "start": 4244, + "strand": "+", + "transcripts": [ + { + "end": 5659, + "name": "D90400.1_G_BAA31850.1_T", + "start": 4244, + "translations": [ + { + "biotype": "CDS", + "end": 5659, + "name": "BAA31850.1", + "start": 4244 + } + ] + } + ] + }, + { + "chr": "D90400.1", + "end": 7136, + "name": "G_BAA31851.1", + "start": 5565, + "strand": "+", + "transcripts": [ + { + "end": 7136, + "name": "D90400.1_G_BAA31851.1_T", + "start": 5565, + "translations": [ + { + "biotype": "CDS", + "end": 7136, + "name": "BAA31851.1", + "start": 5565 + } + ] + } + ] + } + ] +} diff --git a/tests/test_tools/test_convert_annotations_format.py b/tests/test_tools/test_convert_annotations_format.py index 7e637ae9..2516af84 100644 --- a/tests/test_tools/test_convert_annotations_format.py +++ b/tests/test_tools/test_convert_annotations_format.py @@ -51,6 +51,7 @@ def sort_elements(data): 'ensembl69_hg19_annotations.kras.tab.json', 'v2-tab', ], + ['viral.gtf', 'viral.gtf.json', 'gtf'], ], ) def test_gff_examples(filename, expected_file, input_type): From 5aefd884d621d71f915165848441cddee8fe8ca0 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle 
Date: Sat, 19 Feb 2022 12:40:28 -0800 Subject: [PATCH 135/137] Include mix of viral/human transcripts --- src/tools/convert_annotations_format.py | 9 +- tests/test_tools/data/viral.gtf | 58 ++++++ tests/test_tools/data/viral.gtf.json | 250 ++++++++++++++++++++++++ 3 files changed, 315 insertions(+), 2 deletions(-) diff --git a/src/tools/convert_annotations_format.py b/src/tools/convert_annotations_format.py index eceba67a..ca0f0cc3 100644 --- a/src/tools/convert_annotations_format.py +++ b/src/tools/convert_annotations_format.py @@ -472,8 +472,13 @@ def enforce_uniq_transcript_ids(input_df) -> pd.DataFrame: return df # there are some non-unique transcript IDs, make them all pre-pend the seqid - df.loc[df.type == 'transcript', 'feature_id'] = df.seqid + GFF_ID_DELIMITER + df.feature_id - df.loc[df.parent_type == 'transcript', 'parent_id'] = df.seqid + GFF_ID_DELIMITER + df.parent_id + # do not change ensembl transcript IDs since they should already be unique + df.loc[(df.type == 'transcript') & (~df.feature_id.str.startswith('ENST')), 'feature_id'] = ( + df.seqid + GFF_ID_DELIMITER + df.feature_id + ) + df.loc[ + (df.parent_type == 'transcript') & (~df.parent_id.str.startswith('ENST')), 'parent_id' + ] = (df.seqid + GFF_ID_DELIMITER + df.parent_id) duplicates = df[df.type == 'transcript'].drop_duplicates(['seqid', 'parent_id', 'feature_id']) if duplicates.shape[0] == duplicates.feature_id.nunique(): diff --git a/tests/test_tools/data/viral.gtf b/tests/test_tools/data/viral.gtf index 57c45450..cafb8d74 100644 --- a/tests/test_tools/data/viral.gtf +++ b/tests/test_tools/data/viral.gtf @@ -445,3 +445,61 @@ X94165.1 EMBL gene 5494 7005 . + . gene_id "L1"; transcript_id ""; gbkey "Gene"; X94165.1 EMBL CDS 5494 7002 . 
+ 0 gene_id "L1"; transcript_id "unassigned_transcript_7"; db_xref "GOA:Q82010"; db_xref "HSSP:1DZL"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/TrEMBL:Q82010"; gbkey "CDS"; gene "L1"; note "putative"; product "CAA63887.1"; protein_id "CAA63887.1"; exon_number "1"; X94165.1 EMBL start_codon 5494 5496 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_7"; db_xref "GOA:Q82010"; db_xref "HSSP:1DZL"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/TrEMBL:Q82010"; gbkey "CDS"; gene "L1"; note "putative"; product "CAA63887.1"; protein_id "CAA63887.1"; exon_number "1"; X94165.1 EMBL stop_codon 7003 7005 . + 0 gene_id "L1"; transcript_id "unassigned_transcript_7"; db_xref "GOA:Q82010"; db_xref "HSSP:1DZL"; db_xref "InterPro:IPR002210"; db_xref "InterPro:IPR011222"; db_xref "UniProtKB/TrEMBL:Q82010"; gbkey "CDS"; gene "L1"; note "putative"; product "CAA63887.1"; protein_id "CAA63887.1"; exon_number "1"; +chr6 havana gene 54770583 54771134 . + . gene_id "ENSG00000220635"; gene_version "2"; gene_name "KRASP1"; gene_source "havana"; gene_biotype "processed_pseudogene"; gene_type "processed_pseudogene"; +chr6 havana transcript 54770583 54771134 . + . gene_id "ENSG00000220635"; gene_version "2"; transcript_id "ENST00000407852"; transcript_version "2"; gene_name "KRASP1"; gene_source "havana"; gene_biotype "processed_pseudogene"; transcript_name "KRASP1-201"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; tag "basic"; transcript_support_level "NA"; gene_type "processed_pseudogene"; +chr6 havana exon 54770583 54771134 . + . 
gene_id "ENSG00000220635"; gene_version "2"; transcript_id "ENST00000407852"; transcript_version "2"; exon_number "1"; gene_name "KRASP1"; gene_source "havana"; gene_biotype "processed_pseudogene"; transcript_name "KRASP1-201"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; exon_id "ENSE00001550689"; exon_version "2"; tag "basic"; transcript_support_level "NA"; gene_type "processed_pseudogene"; +chr12 ensembl_havana gene 25205246 25250936 . - . gene_id "ENSG00000133703"; gene_version "13"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; gene_type "protein_coding"; +chr12 ensembl_havana transcript 25205246 25250929 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000311936"; transcript_version "8"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; gene_type "protein_coding"; +chr12 ensembl_havana exon 25250751 25250929 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00001189804"; exon_version "5"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; gene_type "protein_coding"; +chr12 ensembl_havana exon 25245274 25245395 . - . 
gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00000936617"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; gene_type "protein_coding"; +chr12 ensembl_havana CDS 25245274 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000308495"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; gene_type "protein_coding"; +chr12 ensembl_havana start_codon 25245382 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; gene_type "protein_coding"; +chr12 ensembl_havana exon 25227234 25227412 . - . 
gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00001719809"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; gene_type "protein_coding"; +chr12 ensembl_havana CDS 25227234 25227412 . - 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000308495"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; gene_type "protein_coding"; +chr12 ensembl_havana exon 25225614 25225773 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00001644818"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; gene_type "protein_coding"; +chr12 ensembl_havana CDS 25225614 25225773 . 
- 1 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000308495"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; gene_type "protein_coding"; +chr12 ensembl_havana exon 25205246 25209911 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; exon_id "ENSE00002456976"; exon_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; gene_type "protein_coding"; +chr12 ensembl_havana CDS 25209798 25209911 . - 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; protein_id "ENSP00000308495"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; gene_type "protein_coding"; +chr12 ensembl_havana stop_codon 25209795 25209797 . 
- 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000311936"; transcript_version "8"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; gene_type "protein_coding"; +chr12 ensembl_havana five_prime_utr 25250751 25250929 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000311936"; transcript_version "8"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; gene_type "protein_coding"; +chr12 ensembl_havana five_prime_utr 25245385 25245395 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000311936"; transcript_version "8"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; gene_type "protein_coding"; +chr12 ensembl_havana three_prime_utr 25205246 25209794 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000311936"; transcript_version "8"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-202"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8702"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; gene_type "protein_coding"; +chr12 ensembl_havana transcript 25205246 25250929 . - . 
gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana exon 25250751 25250929 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; exon_id "ENSE00000000028"; exon_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana exon 25245274 25245395 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; exon_id "ENSE00000936617"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana CDS 25245274 25245384 . 
- 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; protein_id "ENSP00000256078"; protein_version "5"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana start_codon 25245382 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana exon 25227234 25227412 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; exon_id "ENSE00001719809"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana CDS 25227234 25227412 . 
- 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; protein_id "ENSP00000256078"; protein_version "5"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana exon 25225614 25225773 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; exon_id "ENSE00001644818"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana CDS 25225614 25225773 . - 1 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "4"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; protein_id "ENSP00000256078"; protein_version "5"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana exon 25215437 25215560 . - . 
gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; exon_id "ENSE00001189807"; exon_version "5"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana CDS 25215444 25215560 . - 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; protein_id "ENSP00000256078"; protein_version "5"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana stop_codon 25215441 25215443 . - 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana exon 25205246 25209911 . - . 
gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; exon_number "6"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; exon_id "ENSE00002477035"; exon_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana five_prime_utr 25250751 25250929 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana five_prime_utr 25245385 25245395 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana three_prime_utr 25215437 25215440 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 ensembl_havana three_prime_utr 25205246 25209911 . - . 
gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000256078"; transcript_version "10"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS8703"; tag "basic"; transcript_support_level "1 (assigned to previous version 8)"; gene_type "protein_coding"; +chr12 havana transcript 25209168 25250936 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000557334"; transcript_version "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; gene_type "protein_coding"; +chr12 havana exon 25250751 25250936 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000557334"; transcript_version "5"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00002446502"; exon_version "1"; tag "basic"; transcript_support_level "5"; gene_type "protein_coding"; +chr12 havana exon 25245274 25245395 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000557334"; transcript_version "5"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00000936617"; exon_version "1"; tag "basic"; transcript_support_level "5"; gene_type "protein_coding"; +chr12 havana CDS 25245274 25245384 . 
- 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000557334"; transcript_version "5"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000452512"; protein_version "1"; tag "basic"; transcript_support_level "5"; gene_type "protein_coding"; +chr12 havana start_codon 25245382 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000557334"; transcript_version "5"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; gene_type "protein_coding"; +chr12 havana exon 25209168 25209911 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000557334"; transcript_version "5"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00002464674"; exon_version "1"; tag "basic"; transcript_support_level "5"; gene_type "protein_coding"; +chr12 havana CDS 25209798 25209911 . - 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000557334"; transcript_version "5"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000452512"; protein_version "1"; tag "basic"; transcript_support_level "5"; gene_type "protein_coding"; +chr12 havana stop_codon 25209795 25209797 . 
- 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000557334"; transcript_version "5"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; gene_type "protein_coding"; +chr12 havana five_prime_utr 25250751 25250936 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000557334"; transcript_version "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; gene_type "protein_coding"; +chr12 havana five_prime_utr 25245385 25245395 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000557334"; transcript_version "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; gene_type "protein_coding"; +chr12 havana three_prime_utr 25209168 25209794 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000557334"; transcript_version "5"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-204"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; gene_type "protein_coding"; +chr12 havana transcript 25233819 25250929 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000556131"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1"; gene_type "protein_coding"; +chr12 havana exon 25250764 25250929 . 
- . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000556131"; transcript_version "1"; exon_number "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00002530521"; exon_version "1"; tag "basic"; transcript_support_level "1"; gene_type "protein_coding"; +chr12 havana exon 25245274 25245395 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000556131"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00000936617"; exon_version "1"; tag "basic"; transcript_support_level "1"; gene_type "protein_coding"; +chr12 havana CDS 25245274 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000556131"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000451856"; protein_version "1"; tag "basic"; transcript_support_level "1"; gene_type "protein_coding"; +chr12 havana start_codon 25245382 25245384 . - 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000556131"; transcript_version "1"; exon_number "2"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1"; gene_type "protein_coding"; +chr12 havana exon 25233819 25235226 . - . 
gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000556131"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00002478081"; exon_version "1"; tag "basic"; transcript_support_level "1"; gene_type "protein_coding"; +chr12 havana CDS 25235209 25235226 . - 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000556131"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000451856"; protein_version "1"; tag "basic"; transcript_support_level "1"; gene_type "protein_coding"; +chr12 havana stop_codon 25235206 25235208 . - 0 gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000556131"; transcript_version "1"; exon_number "3"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1"; gene_type "protein_coding"; +chr12 havana five_prime_utr 25250764 25250929 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000556131"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1"; gene_type "protein_coding"; +chr12 havana five_prime_utr 25245385 25245395 . - . 
gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000556131"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1"; gene_type "protein_coding"; +chr12 havana three_prime_utr 25233819 25235205 . - . gene_id "ENSG00000133703"; gene_version "13"; transcript_id "ENST00000556131"; transcript_version "1"; gene_name "KRAS"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "KRAS-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1"; gene_type "protein_coding"; diff --git a/tests/test_tools/data/viral.gtf.json b/tests/test_tools/data/viral.gtf.json index 710f2e0d..db29832e 100644 --- a/tests/test_tools/data/viral.gtf.json +++ b/tests/test_tools/data/viral.gtf.json @@ -2678,6 +2678,256 @@ ] } ] + }, + { + "aliases": [ + "KRAS" + ], + "biotype": "gene", + "chr": "chr12", + "end": 25250936, + "name": "ENSG00000133703", + "start": 25205246, + "strand": "-", + "transcripts": [ + { + "aliases": [ + "KRAS-201" + ], + "biotype": "transcript", + "end": 25250929, + "exons": [ + { + "end": 25250929, + "name": "ENSE00000000028", + "number": "1", + "start": 25250751, + "version": "2" + }, + { + "end": 25245395, + "name": "ENSE00000936617", + "number": "2", + "start": 25245274, + "version": "1" + }, + { + "end": 25215560, + "name": "ENSE00001189807", + "number": "5", + "start": 25215437, + "version": "5" + }, + { + "end": 25225773, + "name": "ENSE00001644818", + "number": "4", + "start": 25225614, + "version": "1" + }, + { + "end": 25227412, + "name": "ENSE00001719809", + "number": "3", + "start": 25227234, + "version": "1" + }, + { + "end": 25209911, + "name": "ENSE00002477035", + "number": "6", + "start": 25205246, + "version": "3" + } + ], + "name": "ENST00000256078", + "start": 25205246, + 
"translations": [ + { + "biotype": "CDS", + "end": 25245384, + "name": "ENSP00000256078", + "start": 25215444, + "version": "5" + } + ], + "version": "10" + }, + { + "aliases": [ + "KRAS-202" + ], + "biotype": "transcript", + "end": 25250929, + "exons": [ + { + "end": 25245395, + "name": "ENSE00000936617", + "number": "2", + "start": 25245274, + "version": "1" + }, + { + "end": 25250929, + "name": "ENSE00001189804", + "number": "1", + "start": 25250751, + "version": "5" + }, + { + "end": 25225773, + "name": "ENSE00001644818", + "number": "4", + "start": 25225614, + "version": "1" + }, + { + "end": 25227412, + "name": "ENSE00001719809", + "number": "3", + "start": 25227234, + "version": "1" + }, + { + "end": 25209911, + "name": "ENSE00002456976", + "number": "5", + "start": 25205246, + "version": "2" + } + ], + "name": "ENST00000311936", + "start": 25205246, + "translations": [ + { + "biotype": "CDS", + "end": 25245384, + "name": "ENSP00000308495", + "start": 25209798, + "version": "3" + } + ], + "version": "8" + }, + { + "aliases": [ + "KRAS-203" + ], + "biotype": "transcript", + "end": 25250929, + "exons": [ + { + "end": 25245395, + "name": "ENSE00000936617", + "number": "2", + "start": 25245274, + "version": "1" + }, + { + "end": 25235226, + "name": "ENSE00002478081", + "number": "3", + "start": 25233819, + "version": "1" + }, + { + "end": 25250929, + "name": "ENSE00002530521", + "number": "1", + "start": 25250764, + "version": "1" + } + ], + "name": "ENST00000556131", + "start": 25233819, + "translations": [ + { + "biotype": "CDS", + "end": 25245384, + "name": "ENSP00000451856", + "start": 25235209, + "version": "1" + } + ], + "version": "1" + }, + { + "aliases": [ + "KRAS-204" + ], + "biotype": "transcript", + "end": 25250936, + "exons": [ + { + "end": 25245395, + "name": "ENSE00000936617", + "number": "2", + "start": 25245274, + "version": "1" + }, + { + "end": 25250936, + "name": "ENSE00002446502", + "number": "1", + "start": 25250751, + "version": "1" + }, + 
{ + "end": 25209911, + "name": "ENSE00002464674", + "number": "3", + "start": 25209168, + "version": "1" + } + ], + "name": "ENST00000557334", + "start": 25209168, + "translations": [ + { + "biotype": "CDS", + "end": 25245384, + "name": "ENSP00000452512", + "start": 25209798, + "version": "1" + } + ], + "version": "5" + } + ], + "version": "13" + }, + { + "aliases": [ + "KRASP1" + ], + "biotype": "gene", + "chr": "chr6", + "end": 54771134, + "name": "ENSG00000220635", + "start": 54770583, + "strand": "+", + "transcripts": [ + { + "aliases": [ + "KRASP1-201" + ], + "biotype": "transcript", + "end": 54771134, + "exons": [ + { + "end": 54771134, + "name": "ENSE00001550689", + "number": "1", + "start": 54770583, + "version": "2" + } + ], + "name": "ENST00000407852", + "start": 54770583, + "version": "2" + } + ], + "version": "2" } ] } From 84abab90401164a6590e03677bbbc6410e5d4f9c Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 22 Feb 2022 11:26:12 -0800 Subject: [PATCH 136/137] Bump version number to 3.0.0 --- Snakefile | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index 4c8233f9..c361c753 100644 --- a/Snakefile +++ b/Snakefile @@ -11,7 +11,7 @@ from mavis_config import ( from mavis_config.constants import SUBCOMMAND # env variable mainly for CI/CD -CONTAINER = os.environ.get('SNAKEMAKE_CONTAINER', 'docker://bcgsc/mavis:latest') +CONTAINER = os.environ.get('SNAKEMAKE_CONTAINER', 'docker://bcgsc/mavis:v3.0.0') MAX_TIME = 57600 DEFAULT_MEMORY_MB = 16000 diff --git a/setup.cfg b/setup.cfg index ed47bfd5..4149c69c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = mavis -version = 2.2.10 +version = 3.0.0 url = https://github.com/bcgsc/mavis.git download_url = https://github.com/bcgsc/mavis/archive/v2.2.10.tar.gz description = A Structural Variant Post-Processing Package From c2221fee6f8ab3767a040c12c9d024ea67e56fff Mon Sep 17 00:00:00 2001 From: zhemingfan 
<43304373+zhemingfan@users.noreply.github.com> Date: Tue, 22 Feb 2022 15:03:31 -0800 Subject: [PATCH 137/137] Update mkdocs.yml As per instructions on: https://github.com/lukasgeiter/mkdocs-awesome-pages-plugin We now have to add the search flag explicitly. --- mkdocs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/mkdocs.yml b/mkdocs.yml index 79900dff..2bf2bbfc 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -26,6 +26,7 @@ nav: - glossary.md plugins: + - search - awesome-pages - mkdocs-simple-hooks: hooks: