From e0c5e40afa28e1932ccab00029d2dedfb2606e33 Mon Sep 17 00:00:00 2001 From: rnmitchell <57150382+rnmitchell@users.noreply.github.com> Date: Tue, 20 Aug 2024 10:58:23 -0400 Subject: [PATCH] Complete workflow for Y-STRs using custom sequence ranges (#79) --- README.md | 48 +-- lusSTR/cli/gui.py | 6 + lusSTR/data/filters.json | 378 ++++++++++++++++++ lusSTR/scripts/marker.py | 11 +- .../data/powerseq_example_evidence_ngs.csv | 42 ++ ...powerseq_example_sexloci_sequence_info.csv | 43 ++ lusSTR/tests/test_filters.py | 32 +- lusSTR/tests/test_suite.py | 5 +- lusSTR/workflows/strs.smk | 3 +- lusSTR/wrappers/convert.py | 2 +- lusSTR/wrappers/filter.py | 229 +++++++---- 11 files changed, 692 insertions(+), 107 deletions(-) create mode 100644 lusSTR/tests/data/powerseq_example_evidence_ngs.csv create mode 100644 lusSTR/tests/data/powerseq_example_sexloci_sequence_info.csv diff --git a/README.md b/README.md index 3814f6c3..088102f4 100755 --- a/README.md +++ b/README.md @@ -73,24 +73,24 @@ ___ Running ```lusstr config``` creates a config file containing the default settings for the lusSTR STR analysis pipeline. The settings can be changed with command line arguments (see below) or by manually editing the config file. The default settings, along with their descriptions, are as follows: ### general settings -analysis_software: ```uas``` (```uas/straitrazor/genemarker```); indicates the analysis software used prior to lusSTR -sex: ```False``` (True/False); include sex-chromosome STRs (invoke ```--sex``` flag) -samp_input: ```/path/to/input/directory/or/samples``` input directory or sample; if not provided, will be current working directory (indicate using ```--input path/to/dir``` ) -output: ```lusstr_output``` output file/directory name (indicate using ```--out dir/sampleid e.g. --out test_030923```) -custom: ```False``` (True/False); use custom sequence ranges as defined in the STR markers json file (```str_markers.json```). +**analysis_software**: ```uas``` (```uas/straitrazor/genemarker```); indicates the analysis software used prior to lusSTR +**sex**: ```False``` (True/False); include sex-chromosome STRs (invoke ```--sex``` flag) +**samp_input**: ```/path/to/input/directory/or/samples``` input directory or sample; if not provided, will be current working directory (indicate using ```--input path/to/dir``` ) +**output**: ```lusstr_output``` output file/directory name (indicate using ```--out dir/sampleid e.g. --out test_030923```) +**custom**: ```False``` (True/False); use custom sequence ranges as defined in the STR markers json file (```str_markers.json```). ### convert settings -kit: ```forenseq``` (forenseq/powerseq) (invoke the ```--powerseq``` flag if using PowerSeq data) -nocombine: ```False``` (True/False); do not combine identical sequences during the ```convert``` step, if using STRait Razor data. (invoke the ```--nocombine``` flag) +**kit**: ```forenseq``` (forenseq/powerseq) (invoke the ```--powerseq``` flag if using PowerSeq data) +**nocombine**: ```False``` (True/False); do not combine identical sequences during the ```convert``` step, if using STRait Razor data. (invoke the ```--nocombine``` flag) ### filter settings -output_type: ```strmix``` (strmix/efm/mpsproto) (indicate using the ```--software``` flag) -profile_type: ```evidence``` (evidence/reference) (invoke ```--reference``` flag if creating a reference output file) -data_type: ```ngs``` (ce/ngs/lusplus) (indicate using the ```--str-type```) -info: ```True``` (True/False); create allele information file (invoke ```--noinfo``` flag to not create the allele information file) -separate: ```False``` (True/False); for EFM/MPSproto only, if True will create individual files for samples; if False, will create one file with all samples (invoke ```--separate``` flag to separate EFM/MPSproto output files) -nofilters: ```False``` (True/False); skip all filtering steps but still creates EFM/MPSproto/STRmix output files (invoke ```--nofilters``` flag) -strand: ```uas``` (uas/forward); indicates the strand orientation in which to report the sequence in the final output table for STRmix NGS only (indicate using ```--strand```) +**output_type**: ```strmix``` (strmix/efm/mpsproto) (indicate using the ```--software``` flag) +**profile_type**: ```evidence``` (evidence/reference) (invoke ```--reference``` flag if creating a reference output file) +**data_type**: ```ngs``` (ce/ngs/lusplus) (indicate using the ```--str-type```) +**info**: ```True``` (True/False); create allele information file (invoke ```--noinfo``` flag to not create the allele information file) +**separate**: ```False``` (True/False); for EFM/MPSproto only, if True will create individual files for samples; if False, will create one file with all samples (invoke ```--separate``` flag to separate EFM/MPSproto output files) +**nofilters**: ```False``` (True/False); skip all filtering steps but still creates EFM/MPSproto/STRmix output files (invoke ```--nofilters``` flag) +**strand**: ```uas``` (uas/forward); indicates the strand orientation in which to report the sequence in the final output table for STRmix NGS only (indicate using ```--strand```) One additional argument can be provided with ```lusstr config```: ```-w```/```-workdir``` sets the working directory (e.g. ```-w lusstr_files/```) and all created files are stored in that directory. @@ -223,20 +223,20 @@ Running ```lusstr config --snps``` creates a config file containing the default ### general settings -analysis_software: ```uas``` (```uas/straitrazor```); indicates the analysis software used prior to lusSTR -samp_input: ```/path/to/input/directory/or/samples``` input directory or sample; if not provided, will be current working directory (indicate using ```--input path/to/dir```) -output: ```lusstr_output``` output file/directory name; (indicate using ```--out dir/sampleid e.g. --out test_030923```) -kit: ```sigprep``` (sigprep/kintelligence) (invoke using the ```--kintelligence``` flag if using Kintelligence data) +**analysis_software**: ```uas``` (```uas/straitrazor```); indicates the analysis software used prior to lusSTR +**samp_input**: ```/path/to/input/directory/or/samples``` input directory or sample; if not provided, will be current working directory (indicate using ```--input path/to/dir```) +**output**: ```lusstr_output``` output file/directory name; (indicate using ```--out dir/sampleid e.g. --out test_030923```) +**kit**: ```sigprep``` (sigprep/kintelligence) (invoke using the ```--kintelligence``` flag if using Kintelligence data) ### format settings -types: ```all``` choices are "all", "i" (identity SNPs only), "p" (phenotype only), "a" (ancestry only) or any combination (indicate using the ```--snp-type e.g. --snp-type i, p```) -nofilter: ```False``` (True/False); if no filtering is desired at the format step; if False, will remove any allele designated as Not Typed (invoke using the ```--nofiltering```) +**types**: ```all``` choices are "all", "i" (identity SNPs only), "p" (phenotype only), "a" (ancestry only) or any combination (indicate using the ```--snp-type e.g. --snp-type i, p```) +**nofilter**: ```False``` (True/False); if no filtering is desired at the format step; if False, will remove any allele designated as Not Typed (invoke using the ```--nofiltering```) ### convert settings -strand: ```forward``` (forward/uas); indicates which orientation to report the alleles for the SigPrep SNPs; uas indicates the orientation as reported by the UAS or the forward strand -references: ```None```; list IDs of the samples to be run as references in EFM; default is no reference samples -separate: ```False``` (True/False); if want to separate samples into individual files for use in EFM -thresh: ```0.03```; Analytical threshold value +**strand**: ```forward``` (forward/uas); indicates which orientation to report the alleles for the SigPrep SNPs; uas indicates the orientation as reported by the UAS or the forward strand +**references**: ```None```; list IDs of the samples to be run as references in EFM; default is no reference samples +**separate**: ```False``` (True/False); if want to separate samples into individual files for use in EFM +**thresh**: ```0.03```; Analytical threshold value One additional argument can be provided with ```lusstr config --snps```: diff --git a/lusSTR/cli/gui.py b/lusSTR/cli/gui.py index bcd4d823..7c531abd 100644 --- a/lusSTR/cli/gui.py +++ b/lusSTR/cli/gui.py @@ -236,6 +236,11 @@ def show_STR_page(): ) ] + custom_ranges = st.checkbox( + "Use Custom Sequence Ranges", + help="Check the box to use the specified custom sequence ranges as defined in the str_markers.json file.", + ) + sex = st.checkbox( "Include X- and Y-STRs", help="Check the box to include X- and Y-STRs, otherwise leave unchecked.", @@ -364,6 +369,7 @@ def show_STR_page(): config_data = { "analysis_software": analysis_software, + "custom_ranges": custom_ranges, "sex": sex, "samp_input": samp_input, "output": output, diff --git a/lusSTR/data/filters.json b/lusSTR/data/filters.json index c0be44ee..64f06b4b 100644 --- a/lusSTR/data/filters.json +++ b/lusSTR/data/filters.json @@ -484,5 +484,383 @@ "StutterForwardThresholdDynamicPercent": 0.22, "Intercept": 75, "Slope": 4 + }, + "DYS19": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.15, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS385A-B": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0.30, + "StutterThresholdDynamicPercent": 0.20, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS389II": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.05, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.35, + "StutterForwardThresholdDynamicPercent": 0.38, + "Intercept": 0, + "Slope": 0 + }, + "DYS390": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.15, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS391": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.20, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS392": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.50, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS393": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.15, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS437": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.45, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS438": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.15, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS439": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.15, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS448": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.033, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.15, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS456": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.15, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS458": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.15, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS481": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.50, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS533": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.15, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS549": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.22, + "StutterForwardThresholdDynamicPercent": 0.28, + "Intercept": 0, + "Slope": 0 + }, + "DYS570": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.22, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS576": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.15, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS635": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.15, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "DYS643": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.20, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, + "Y-GATA-H4": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0.35, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 } } \ No newline at end of file diff --git a/lusSTR/scripts/marker.py b/lusSTR/scripts/marker.py index 2205789d..ab91ae6f 100644 --- a/lusSTR/scripts/marker.py +++ b/lusSTR/scripts/marker.py @@ -1453,10 +1453,13 @@ def convert(self): def custom_brack(self): if self.custom: sequence = self.custom_sequence - final_string = ( - f"{collapse_repeats_by_length(sequence[:14], 4)} " - f"{collapse_repeats_by_length(sequence[14:], 4)}" - ) + if len(sequence) > 14: + final_string = ( + f"{collapse_repeats_by_length(sequence[:14], 4)} " + f"{collapse_repeats_by_length(sequence[14:], 4)}" + ) + else: + final_string = collapse_repeats_by_length(sequence, 4) return final_string return None diff --git a/lusSTR/tests/data/powerseq_example_evidence_ngs.csv b/lusSTR/tests/data/powerseq_example_evidence_ngs.csv new file mode 100644 index 00000000..fda609d3 --- /dev/null +++ b/lusSTR/tests/data/powerseq_example_evidence_ngs.csv @@ -0,0 +1,42 @@ +Locus,CE Allele,Allele Seq,Reads +DYS19,13.0,TAGATAGATAGATAGGTAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,1326 +DYS19,14.0,TAGATAGATAGATAGGTAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,13539 +DYS385A-B,13.0,AAGGAAGGAAGGAAGGAGAAAGAAAGTAAAAAAGAAAGAAAGAGAAAAAGAGAAAAAGAAAGAAAGAGAAGAAAGAGAAAGAGGAAAGAGAAAGAAAGGAAGGAAGGAAGGAAGGAAGGGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAA,4446 +DYS385A-B,16.0,AAGGAAGGAAGGAAGGAGAAAGAAAGTAAAAAAGAAAGAAAGAGAAAAAGAGAAAAAGAAAGAAAGAGAAGAAAGAGAAAGAGGAAAGAGAAAGAAAGGAAGGAAGGAAGGAAGGAAGGGAAAGAAATAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAA,3932 +DYS389II,30.0,TCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCATTATACCTACTTCTGTATCCAACTCTCATCTGTATTATCTATGTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,288 +DYS389II,31.0,TCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCATTATACCTACTTCTGTATCCAACTCTCATCTGTATTATCTATGTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,2593 +DYS390,23.0,TCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTGTCTATCTATCTATCTA,658 +DYS390,24.0,TCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTGTCTATCTATCTATCTA,9090 +DYS391,9.0,TGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTG,1086 +DYS391,10.0,TGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTG,12298 +DYS392,12.0,TATTATTATTATTATTATTATTATTATTATTATTAT,2565 +DYS392,13.0,TATTATTATTATTATTATTATTATTATTATTATTATTAT,15505 +DYS393,12.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATGTATGTCTTTTCTATGAGACATA,1946 +DYS393,13.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATGTATGTCTTTTCTATGAGACATA,19034 +DYS437,13.0,TCTATCTATCTATCTATCTATCTATCTATCTGTCTGTCTATCTATCTATCTA,522 +DYS437,14.0,TCTATCTATCTATCTATCTATCTATCTATCTATCTGTCTGTCTATCTATCTATCTA,9559 +DYS438,8.0,TTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTC,411 +DYS438,9.0,TTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTC,15233 +DYS439,11.0,AAATAGAAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA,2052 +DYS439,12.0,AAATAGAAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA,25067 +DYS448,19.0,AGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATATAGAGATAGAGAGATAGAGATAGAGATAGATAGATAGAGAAAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGAT,2823 +DYS456,16.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATTCCATTAGTTCTGTCCCTCTAGAGAACCCTAATACATCAGTTTAAGAA,1834 +DYS456,17.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATTCCATTAGTTCTGTCCCTCTAGAGAACCCTAATACATCAGTTTAAGAA,11109 +DYS458,16.0,GAAAGAAAGAAAAGGAAGGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGGAGGGTGGGCGTGGTGGCTCATGCTTGTAATGCCAGAACTTTGGGAGGCCGAGGTGG,1660 +DYS458,17.0,GAAAGAAAGAAAAGGAAGGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGGAGGGTGGGCGTGGTGGCTCATGCTTGTAATGCCAGAACTTTGGGAGGCCGAGGTGG,10541 +DYS481,21.0,CTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTT,2848 +DYS481,22.0,CTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTT,12071 +DYS533,11.0,TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC,735 +DYS533,12.0,TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC,8598 +DYS549,12.0,GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA,1559 +DYS549,13.0,GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA,17735 +DYS570,16.0,TTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTC,1561 +DYS570,17.0,TTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTC,14991 +DYS576,17.0,AAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAG,1530 +DYS576,18.0,AAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAG,9658 +DYS635,20.0,TCTATCTATCTATCTATGTATGTATCTATCTATGTATGTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,1399 +DYS635,21.0,TCTATCTATCTATCTATGTATGTATCTATCTATGTATGTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,14753 +DYS643,9.0,CTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTCTTTT,287 +DYS643,10.0,CTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTCTTTT,9866 +Y-GATA-H4,10.0,AGATAGATAGATAGATCTATAGATAGATAGGTAGGTAGGTAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,662 +Y-GATA-H4,11.0,AGATAGATAGATAGATCTATAGATAGATAGGTAGGTAGGTAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,8052 diff --git a/lusSTR/tests/data/powerseq_example_sexloci_sequence_info.csv b/lusSTR/tests/data/powerseq_example_sexloci_sequence_info.csv new file mode 100644 index 00000000..9b257083 --- /dev/null +++ b/lusSTR/tests/data/powerseq_example_sexloci_sequence_info.csv @@ -0,0 +1,43 @@ +SampleID,Locus,UAS_Output_Sequence,CE_Allele,UAS_Output_Bracketed_Notation,Reads,allele_type,parent_allele1,parent_allele2,allele1_ref_reads,allele2_ref_reads,perc_noise,perc_stutter +powerseq_example,DYS19,TAGATAGATAGATAGGTAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,14.0,[TAGA]3 TAGG [TAGA]11,13539,real_allele,,,,,, +powerseq_example,DYS19,TAGATAGATAGATAGGTAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,13.0,[TAGA]3 TAGG [TAGA]10,1326,-1_stutter,[TAGA]3 TAGG [TAGA]11,,13539.0,,,0.098 +powerseq_example,DYS385A-B,AAGGAAGGAAGGAAGGAGAAAGAAAGTAAAAAAGAAAGAAAGAGAAAAAGAGAAAAAGAAAGAAAGAGAAGAAAGAGAAAGAGGAAAGAGAAAGAAAGGAAGGAAGGAAGGAAGGAAGGGAAAGAAATAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAA,16.0,AAG [GAAG]3 GAGA AAGA AAGT AAAA [AAGA]3 GAAA AAGA GAAA [AAGA]3 GAAG AAAG AGAA AGAG GAAA GAGA AAGA [AAGG]6 [GAAA]2 TAAA [GAAA]13 GAGA AAAA,3932,real_allele,,,,,, +powerseq_example,DYS385A-B,AAGGAAGGAAGGAAGGAGAAAGAAAGTAAAAAAGAAAGAAAGAGAAAAAGAGAAAAAGAAAGAAAGAGAAGAAAGAGAAAGAGGAAAGAGAAAGAAAGGAAGGAAGGAAGGAAGGAAGGGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAA,13.0,AAG [GAAG]3 GAGA AAGA AAGT AAAA [AAGA]3 GAAA AAGA GAAA [AAGA]3 GAAG AAAG AGAA AGAG GAAA GAGA AAGA [AAGG]6 [GAAA]13 GAGA AAAA,4446,real_allele,,,,,, +powerseq_example,DYS389II,TCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCATTATACCTACTTCTGTATCCAACTCTCATCTGTATTATCTATGTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,31.0,[TCTG]4 [TCTA]13 TCAT TATA CCTA CTTC TGTA TCCA ACTC TCAT CTGT ATTA TCTA TGTA [TCTG]3 [TCTA]11,2593,real_allele,,,,,, +powerseq_example,DYS389II,TCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCATTATACCTACTTCTGTATCCAACTCTCATCTGTATTATCTATGTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,30.0,[TCTG]4 [TCTA]13 TCAT TATA CCTA CTTC TGTA TCCA ACTC TCAT CTGT ATTA TCTA TGTA [TCTG]3 [TCTA]10,288,-1_stutter,[TCTG]4 [TCTA]13 TCAT TATA CCTA CTTC TGTA TCCA ACTC TCAT CTGT ATTA TCTA TGTA [TCTG]3 [TCTA]11,,2593.0,,,0.111 +powerseq_example,DYS390,TCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTGTCTATCTATCTATCTA,24.0,[TCTA]2 [TCTG]8 [TCTA]11 TCTG [TCTA]4,9090,real_allele,,,,,, +powerseq_example,DYS390,TCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTGTCTATCTATCTATCTA,23.0,[TCTA]2 [TCTG]8 [TCTA]10 TCTG [TCTA]4,658,-1_stutter,[TCTA]2 [TCTG]8 [TCTA]11 TCTG [TCTA]4,,9090.0,,,0.072 +powerseq_example,DYS391,TGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTG,10.0,TG TCTG [TCTA]10 TCTG,12298,real_allele,,,,,, +powerseq_example,DYS391,TGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTG,9.0,TG TCTG [TCTA]9 TCTG,1086,-1_stutter,TG TCTG [TCTA]10 TCTG,,12298.0,,,0.088 +powerseq_example,DYS392,TATTATTATTATTATTATTATTATTATTATTATTATTAT,13.0,[TAT]13,15505,real_allele,,,,,, +powerseq_example,DYS392,TATTATTATTATTATTATTATTATTATTATTATTAT,12.0,[TAT]12,2565,-1_stutter,[TAT]13,,15505.0,,,0.165 +powerseq_example,DYS393,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATGTATGTCTTTTCTATGAGACATA,13.0,[AGAT]13 [ATGT]2 CTTT TCTA TGAG ACAT A,19034,real_allele,,,,,, +powerseq_example,DYS393,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATGTATGTCTTTTCTATGAGACATA,12.0,[AGAT]12 [ATGT]2 CTTT TCTA TGAG ACAT A,1946,-1_stutter,[AGAT]13 [ATGT]2 CTTT TCTA TGAG ACAT A,,19034.0,,,0.102 +powerseq_example,DYS437,TCTATCTATCTATCTATCTATCTATCTATCTATCTGTCTGTCTATCTATCTATCTA,14.0,[TCTA]8 [TCTG]2 [TCTA]4,9559,real_allele,,,,,, +powerseq_example,DYS437,TCTATCTATCTATCTATCTATCTATCTATCTGTCTGTCTATCTATCTATCTA,13.0,[TCTA]7 [TCTG]2 [TCTA]4,522,-1_stutter,[TCTA]8 [TCTG]2 [TCTA]4,,9559.0,,,0.055 +powerseq_example,DYS438,TTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTC,9.0,[TTTTC]9,15233,real_allele,,,,,, +powerseq_example,DYS438,TTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTC,8.0,[TTTTC]8,411,-1_stutter,[TTTTC]9,,15233.0,,,0.027 +powerseq_example,DYS439,AAATAGAAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA,12.0,AAAT AGAA [GATA]12,25067,real_allele,,,,,, +powerseq_example,DYS439,AAATAGAAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA,11.0,AAAT AGAA [GATA]11,2052,-1_stutter,AAAT AGAA [GATA]12,,25067.0,,,0.082 +powerseq_example,DYS448,AGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATATAGAGATAGAGAGATAGAGATAGAGATAGATAGATAGAGAAAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGAT,19.0,[AGAGAT]11 [ATAGAG]2 [AGATAG]3 ATAGAT AGAGAA [AGAGAT]8,2823,real_allele,,,,,, +powerseq_example,DYS448,AGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATATAGAGATAGAGAGATAGAGATAGAGATAGAGAGATAGAGAAAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGAT,19.0,[AGAGAT]11 [ATAGAG]2 [AGATAG]3 AGAGAT AGAGAA [AGAGAT]8,88,BelowAT,,,,,0.03, +powerseq_example,DYS456,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATTCCATTAGTTCTGTCCCTCTAGAGAACCCTAATACATCAGTTTAAGAA,17.0,[AGAT]17 ATTC CATT AGTT CTGT CCCT CTAG AGAA CCCT AATA CATC AGTT TAAG AA,11109,real_allele,,,,,, +powerseq_example,DYS456,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATTCCATTAGTTCTGTCCCTCTAGAGAACCCTAATACATCAGTTTAAGAA,16.0,[AGAT]16 ATTC CATT AGTT CTGT CCCT CTAG AGAA CCCT AATA CATC AGTT TAAG AA,1834,real_allele,,,,,, +powerseq_example,DYS458,GAAAGAAAGAAAAGGAAGGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGGAGGGTGGGCGTGGTGGCTCATGCTTGTAATGCCAGAACTTTGGGAGGCCGAGGTGG,17.0,[GAAA]3 AG GAAG [GAAA]17 GGAG GGTG GGCG TGGT GGCT CATG CTTG TAAT GCCA GAAC TTTG GGAG GCCG AGGT GG,10541,real_allele,,,,,, +powerseq_example,DYS458,GAAAGAAAGAAAAGGAAGGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGGAGGGTGGGCGTGGTGGCTCATGCTTGTAATGCCAGAACTTTGGGAGGCCGAGGTGG,16.0,[GAAA]3 AG GAAG [GAAA]16 GGAG GGTG GGCG TGGT GGCT CATG CTTG TAAT GCCA GAAC TTTG GGAG GCCG AGGT GG,1660,real_allele,,,,,, +powerseq_example,DYS481,CTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTT,22.0,[CTT]22,12071,real_allele,,,,,, +powerseq_example,DYS481,CTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTT,21.0,[CTT]21,2848,-1_stutter,[CTT]22,,12071.0,,,0.236 +powerseq_example,DYS533,TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC,12.0,[TATC]12,8598,real_allele,,,,,, +powerseq_example,DYS533,TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC,11.0,[TATC]11,735,-1_stutter,[TATC]12,,8598.0,,,0.085 +powerseq_example,DYS549,GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA,13.0,[GATA]13,17735,real_allele,,,,,, +powerseq_example,DYS549,GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA,12.0,[GATA]12,1559,-1_stutter,[GATA]13,,17735.0,,,0.088 +powerseq_example,DYS570,TTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTC,17.0,[TTTC]17,14991,real_allele,,,,,, +powerseq_example,DYS570,TTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTC,16.0,[TTTC]16,1561,-1_stutter,[TTTC]17,,14991.0,,,0.104 +powerseq_example,DYS576,AAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAG,18.0,[AAAG]18,9658,real_allele,,,,,, +powerseq_example,DYS576,AAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAG,17.0,[AAAG]17,1530,real_allele,,,,,, +powerseq_example,DYS635,TCTATCTATCTATCTATGTATGTATCTATCTATGTATGTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,21.0,[TCTA]4 [TGTA]2 [TCTA]2 [TGTA]2 [TCTA]11,14753,real_allele,,,,,, +powerseq_example,DYS635,TCTATCTATCTATCTATGTATGTATCTATCTATGTATGTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,20.0,[TCTA]4 [TGTA]2 [TCTA]2 [TGTA]2 [TCTA]10,1399,-1_stutter,[TCTA]4 [TGTA]2 [TCTA]2 [TGTA]2 [TCTA]11,,14753.0,,,0.095 +powerseq_example,DYS643,CTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTCTTTT,10.0,[CTTTT]10 CTTTC TTTT,9866,real_allele,,,,,, +powerseq_example,DYS643,CTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTCTTTT,9.0,[CTTTT]9 CTTTC TTTT,287,-1_stutter,[CTTTT]10 CTTTC TTTT,,9866.0,,,0.029 +powerseq_example,Y-GATA-H4,AGATAGATAGATAGATCTATAGATAGATAGGTAGGTAGGTAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,11.0,AGA [TAGA]3 TCTA [TAGA]2 [TAGG]3 [TAGA]11,8052,real_allele,,,,,, +powerseq_example,Y-GATA-H4,AGATAGATAGATAGATCTATAGATAGATAGGTAGGTAGGTAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,10.0,AGA [TAGA]3 TCTA [TAGA]2 [TAGG]3 [TAGA]10,662,-1_stutter,AGA [TAGA]3 TCTA [TAGA]2 [TAGG]3 [TAGA]11,,8052.0,,,0.082 diff --git a/lusSTR/tests/test_filters.py b/lusSTR/tests/test_filters.py index adb82b34..5ffa7da0 100644 --- a/lusSTR/tests/test_filters.py +++ b/lusSTR/tests/test_filters.py @@ -229,7 +229,7 @@ def test_STRmixoutput_customranges(tmp_path): exp_out = data_file("NGS_stutter_test/custom/Sample1_evidence_ngs.csv") exp_info_out = data_file("NGS_stutter_test/custom/test_stutter_sequence_info.csv") obs_out = str(tmp_path / f"WD/test_stutter/Sample1_evidence_ngs.csv") - obs_info_out = str(tmp_path / f"WD/test_stutter/test_stutter_sequence_info.csv") + obs_info_out = str(tmp_path / f"WD/test_stutter/test_stutter_custom_range_sequence_info.csv") arglist = [ "config", "-w", @@ -441,3 +441,33 @@ def test_lusplus_sequence_info(tmp_path): all_arglist = ["strs", "all", "-w", str_path] lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(all_arglist)) assert filecmp.cmp(exp_out, obs_out) is True + + +def test_filtering_ystrs(tmp_path): + str_path = str(tmp_path / "WD") + inputfile = data_file("powerseq_flanking_convert_test.csv") + inputfile_sex = data_file("powerseq_flanking_convert_test_sexloci.csv") + exp_out = data_file("powerseq_example_evidence_ngs.csv") + exp_info_out = data_file("powerseq_example_sexloci_sequence_info.csv") + obs_out = str(tmp_path / f"WD/lusstr_output/ystrs/powerseq_example_evidence_ngs.csv") + obs_info_out = str( + tmp_path / f"WD/lusstr_output/ystrs/lusstr_output_sexloci_sequence_info.csv" + ) + arglist = [ + "config", + "-w", + str_path, + "--input", + str(inputfile), + "--sex", + "--analysis-software", + "straitrazor", + "--powerseq", + ] + lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist)) + shutil.copyfile(inputfile, os.path.join(str_path, "lusstr_output.csv")) + shutil.copyfile(inputfile_sex, os.path.join(str_path, "lusstr_output_sexloci.csv")) + all_arglist = ["strs", "all", "-w", str_path] + lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(all_arglist)) + assert filecmp.cmp(exp_out, obs_out) is True + assert filecmp.cmp(exp_info_out, obs_info_out) is True diff --git a/lusSTR/tests/test_suite.py b/lusSTR/tests/test_suite.py index 8dde4d6c..15c95d80 100644 --- a/lusSTR/tests/test_suite.py +++ b/lusSTR/tests/test_suite.py @@ -326,12 +326,13 @@ def test_snakemake(command, output, format_out, convert_out, all_out, tmp_path): def test_marker_plots(tmp_path): inputfile = data_file("UAS_bulk_input/Positive Control Sample Details Report 2315.xlsx") exp_output = str(tmp_path / "MarkerPlots/lusstr_output_Positive_Control_marker_plots.pdf") - sex_exp = str(tmp_path / "MarkerPlots/lusstr_output_Positive_Control_sexchr_marker_plots.pdf") - arglist = ["config", "-w", str(tmp_path), "--input", str(inputfile)] + sex_exp = str(tmp_path / "MarkerPlots/lusstr_output_sexloci_Positive_Control_marker_plots.pdf") + arglist = ["config", "-w", str(tmp_path), "--input", str(inputfile), "--sex"] lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist)) snakemake_arglist = ["strs", "all", "-w", str(tmp_path)] lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(snakemake_arglist)) assert os.path.exists(exp_output) is True + assert os.path.exists(sex_exp) is True def test_genemarker(tmp_path): diff --git a/lusSTR/workflows/strs.smk b/lusSTR/workflows/strs.smk index 7fa75c0f..c283e04b 100644 --- a/lusSTR/workflows/strs.smk +++ b/lusSTR/workflows/strs.smk @@ -149,7 +149,8 @@ rule filter: separate=config["separate"], filters=config["nofilters"], strand=config["strand"], - custom=config["custom_ranges"] + custom=config["custom_ranges"], + sex=config["sex"] script: lusSTR.wrapper("filter") diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py index c499a385..cc79317b 100644 --- a/lusSTR/wrappers/convert.py +++ b/lusSTR/wrappers/convert.py @@ -274,7 +274,7 @@ def main(input, out, kit, software, sex, nocombine, custom): sex_final_table.to_csv(f"{full_table_name}_sexloci.txt", sep="\t", index=False) if custom: sex_table_custom = create_custom_outputtable(sex_columns, sex_final_table) - sex_table_custom.to_csv(f"{output_name}.txt", index=False, sep="\t") + sex_table_custom.to_csv(f"{output_name}_sexloci.txt", index=False, sep="\t") else: sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False) if software != "uas": diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index b1c9e4ce..9b2342b2 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -57,6 +57,30 @@ "VWA", ] +ystrs = [ + "DYS19", + "DYS385A-B", + "DYS389II", + "DYS390", + "DYS391", + "DYS392", + "DYS393", + "DYS437", + "DYS438", + "DYS439", + "DYS448", + "DYS456", + "DYS458", + "DYS481", + "DYS533", + "DYS549", + "DYS570", + "DYS576", + "DYS635", + "DYS643", + "Y-GATA-H4", +] + def get_filter_metadata_file(): return importlib.resources.files("lusSTR") / "data/filters.json" @@ -108,27 +132,31 @@ def process_strs(dict_loc, datatype, seq_col, brack_col): ], fill_value=None, ) - filtered_df = filters(data_order, locus, total_reads, datatype, brack_col) - final_df = pd.concat([final_df, filtered_df]) - flags_df = pd.concat([flags_df, flags(filtered_df, datatype)]) + if locus in strs or locus in ystrs: + filtered_df = filters(data_order, locus, total_reads, datatype, brack_col) + final_df = pd.concat([final_df, filtered_df]) + flags_df = pd.concat([flags_df, flags(filtered_df, datatype)]) if datatype == "ce" or datatype == "ngs": - final_df = final_df.astype({"CE_Allele": "float64", "Reads": "int"}) + try: + final_df = final_df.astype({"CE_Allele": "float64", "Reads": "int"}) + except KeyError: + final_df = None return final_df, flags_df -def EFM_output(profile, outfile, profile_type, data_type, col, separate=False): +def EFM_output(profile, outfile, profile_type, data_type, col, sex, separate=False): if profile_type == "reference": profile = profile[profile.allele_type == "real_allele"] else: profile = profile[profile.allele_type != "BelowAT"] - efm_profile = populate_efm_profile(profile, data_type, col) + efm_profile = populate_efm_profile(profile, data_type, col, sex) if separate: write_sample_specific_efm_profiles(efm_profile, profile_type, data_type, outfile) else: write_aggregate_efm_profile(efm_profile, profile_type, data_type, outfile) -def populate_efm_profile(profile, data_type, colname): +def populate_efm_profile(profile, data_type, colname, sex): if data_type == "ce": prof_col = "CE_Allele" elif data_type == "lusplus": @@ -163,7 +191,8 @@ def populate_efm_profile(profile, data_type, colname): entry = [sampleid, locusid] + allele_list + height_list reformatted_profile.append(entry) for sampleid in allele_heights: - for locusid in strs: + str_list = ystrs if sex else strs + for locusid in str_list: if locusid not in allele_heights[sampleid]: entry = [sampleid, locusid] + ([None] * max_num_alleles * 2) reformatted_profile.append(entry) @@ -320,12 +349,12 @@ def format_ref_table(new_rows, sample_data, datatype): return sort_df -def marker_plots(df, output_name): +def marker_plots(df, output_name, sex): Path("MarkerPlots").mkdir(parents=True, exist_ok=True) df["CE_Allele"] = df["CE_Allele"].astype(float) filt_df = df[df["allele_type"] == "real_allele"] for sample_id in df["SampleID"].unique(): - # sample_id = f"{id}_sexchr" if sex else id + # sample_id = f"{id}_ystrs" if sex else id with PdfPages(f"MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: make_plot(filt_df, sample_id, filters=True, at=False) pdf.savefig() @@ -352,40 +381,43 @@ def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): fig = plt.figure(figsize=(30, 30)) n = 0 for marker in sample_df["Locus"].unique(): - n += 1 - colors = {"Typed": "g", "Stutter": "b", "BelowAT": "r"} - marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") - ax = fig.add_subplot(6, 5, n) - p = ax.bar( - marker_df["CE_Allele"], - marker_df["Reads"], - edgecolor="k", - color=[colors[i] for i in marker_df["Type"]], - ) - if at: - at = get_at(marker_df, marker) - ax.axhline(at, linestyle="--", color="k") - ax.text(round(min(marker_df["CE_Allele"])) - 0.9, at + (at * 0.1), f"AT", size=12) - labels = marker_df["Type"].unique() - handles = [plt.Rectangle((0, 0), 1, 1, color=colors[l]) for l in labels] - if not filters: - plt.legend(handles, labels, title="Allele Type") - else: - for i, row in marker_df.iterrows(): - marker_df.loc[i, "Label"] = ( - str(int(marker_df.loc[i, "CE_Allele"])) - if ".0" in str(marker_df.loc[i, "CE_Allele"]) - else str(marker_df.loc[i, "CE_Allele"]) + if marker in strs or marker in ystrs: + n += 1 + colors = {"Typed": "g", "Stutter": "b", "BelowAT": "r"} + marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") + ax = fig.add_subplot(6, 5, n) + p = ax.bar( + marker_df["CE_Allele"], + marker_df["Reads"], + edgecolor="k", + color=[colors[i] for i in marker_df["Type"]], + ) + if at: + at = get_at(marker_df, marker) + ax.axhline(at, linestyle="--", color="k") + ax.text(round(min(marker_df["CE_Allele"])) - 0.9, at + (at * 0.1), f"AT", size=12) + labels = marker_df["Type"].unique() + handles = [plt.Rectangle((0, 0), 1, 1, color=colors[l]) for l in labels] + if not filters: + plt.legend(handles, labels, title="Allele Type") + else: + for i, row in marker_df.iterrows(): + marker_df.loc[i, "Label"] = ( + str(int(marker_df.loc[i, "CE_Allele"])) + if ".0" in str(marker_df.loc[i, "CE_Allele"]) + else str(marker_df.loc[i, "CE_Allele"]) + ) + ax.bar_label(p, labels=marker_df["Label"]) + if sameyaxis: + ax.set_yticks(np.arange(0, max_yvalue, increase_value)) + ax.set_xticks( + np.arange( + round(min(marker_df["CE_Allele"]) - 1), + round(max(marker_df["CE_Allele"])) + 2, + 1.0, ) - ax.bar_label(p, labels=marker_df["Label"]) - if sameyaxis: - ax.set_yticks(np.arange(0, max_yvalue, increase_value)) - ax.set_xticks( - np.arange( - round(min(marker_df["CE_Allele"]) - 1), round(max(marker_df["CE_Allele"])) + 2, 1.0 ) - ) - ax.title.set_text(marker) + ax.title.set_text(marker) if sameyaxis: title = "Marker Plots for All Alleles With Same Y-Axis Scale" elif filters: @@ -412,6 +444,55 @@ def get_at(df, locus): return at +def process_input( + input_name, + outpath, + profile_type, + data_type, + output_type, + strand="forward", + nofiltering=False, + separate=False, + custom=False, + sex=False, + info=True, +): + full_df = pd.read_csv(f"{input_name}.txt", sep="\t") + if custom: + seq_col = "Custom_Range_Sequence" + brack_col = "Custom_Bracketed_Notation" + else: + seq_col = "UAS_Output_Sequence" if strand == "uas" else "Forward_Strand_Sequence" + brack_col = ( + "UAS_Output_Bracketed_Notation" + if strand == "uas" + else "Forward_Strand_Bracketed_Notation" + ) + if nofiltering: + full_df["allele_type"] = "real_allele" + marker_plots(full_df, input_name, sex) + if output_type == "efm" or output_type == "mpsproto": + EFM_output(full_df, outpath, profile_type, data_type, brack_col, sex, separate) + else: + STRmix_output(full_df, outpath, profile_type, data_type, seq_col) + else: + dict_loc = {k: v for k, v in full_df.groupby(["SampleID", "Locus"])} + final_df, flags_df = process_strs(dict_loc, data_type, seq_col, brack_col) + if final_df is not None: + marker_plots(final_df, input_name, sex) + if output_type == "efm" or output_type == "mpsproto": + EFM_output(final_df, outpath, profile_type, data_type, brack_col, sex, separate) + else: + STRmix_output(final_df, outpath, profile_type, data_type, seq_col) + if info: + name = os.path.basename(outpath) + final_df.to_csv(f"{outpath}/{input_name}_sequence_info.csv", index=False) + if not flags_df.empty: + flags_df.to_csv(f"{outpath}/{input_name}_Flagged_Loci.csv", index=False) + else: + print(f"{input_name} does not have any sequences! Continuing on.") + + def main( input, output_type, @@ -423,6 +504,7 @@ def main( nofilters, strand, custom, + sex, ): input = str(input) if profile_type not in ("evidence", "reference"): @@ -431,41 +513,39 @@ def main( raise ValueError(f"unknown data type '{data_type}'") if output_type not in ("efm", "strmix", "mpsproto"): raise ValueError(f"unknown output type '{output_type}'") - full_df = pd.read_csv(input, sep="\t") if output_dir is None: raise ValueError("No output specified using --out.") - else: - outpath = output_dir - if custom: - seq_col = "Custom_Range_Sequence" - brack_col = "Custom_Bracketed_Notation" - else: - seq_col = "UAS_Output_Sequence" if strand == "uas" else "Forward_Strand_Sequence" - brack_col = ( - "UAS_Output_Bracketed_Notation" - if strand == "uas" - else "Forward_Strand_Bracketed_Notation" + if sex: + outpath_sex = f"{output_dir}/ystrs/" + input_name_sex = f"{os.path.splitext(input)[0]}_sexloci" + process_input( + input_name_sex, + outpath_sex, + profile_type, + data_type, + output_type, + strand=strand, + nofiltering=nofilters, + separate=separate, + custom=custom, + sex=sex, + info=info, ) - if nofilters: - full_df["allele_type"] = "real_allele" - marker_plots(full_df, outpath) - if output_type == "efm" or output_type == "mpsproto": - EFM_output(full_df, outpath, profile_type, data_type, brack_col, separate) - else: - STRmix_output(full_df, outpath, profile_type, data_type, seq_col) - else: - dict_loc = {k: v for k, v in full_df.groupby(["SampleID", "Locus"])} - final_df, flags_df = process_strs(dict_loc, data_type, seq_col, brack_col) - marker_plots(final_df, outpath) - if output_type == "efm" or output_type == "mpsproto": - EFM_output(final_df, outpath, profile_type, data_type, brack_col, separate) - else: - STRmix_output(final_df, outpath, profile_type, data_type, seq_col) - if info: - name = os.path.basename(outpath) - final_df.to_csv(f"{outpath}/{name}_sequence_info.csv", index=False) - if not flags_df.empty: - flags_df.to_csv(f"{outpath}/{name}_Flagged_Loci.csv", index=False) + input_name = os.path.splitext(input)[0] + outpath = output_dir + process_input( + input_name, + outpath, + profile_type, + data_type, + output_type, + strand=strand, + nofiltering=nofilters, + separate=separate, + custom=custom, + sex=sex, + info=info, + ) if __name__ == "__main__": @@ -480,4 +560,5 @@ def main( nofilters=snakemake.params.filters, strand=snakemake.params.strand, custom=snakemake.params.custom, + sex=snakemake.params.sex, )