From 2a0a58e622f6d51b41a83df3cfc8be04c4f0b174 Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Fri, 28 Jun 2024 14:36:23 -0400 Subject: [PATCH 1/2] Update --- LICENSE.txt | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/LICENSE.txt b/LICENSE.txt index 4b2d73b7..ff6bcf8a 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,11 +1,15 @@ -Copyright (c) 2020, Battelle National Biodefense Institute (BNBI); -all rights reserved. Authored by Rebecca Mitchell. +Copyright (c) 2020, DHS. This Software was prepared for the Department of Homeland Security (DHS) by the Battelle National Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the National Biodefense Analysis and Countermeasures Center (NBACC), a Federally -Funded Research and Development Center. +Funded Research and Development Center. All rights in the computer +software are reserved by DHS on behalf of the United States Government +and the Contractor as provided in the Contract. NEITHER THE GOVERNMENT +NOR BNBI MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY +LIABILITY FOR THE USE OF THIS SOFTWARE. This notice including this +sentence must appear on any copies of this computer software. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are From 4559739f8822b472094cc4ec47ab10be8a3ac01d Mon Sep 17 00:00:00 2001 From: rnmitchell <57150382+rnmitchell@users.noreply.github.com> Date: Mon, 8 Jul 2024 10:10:12 -0400 Subject: [PATCH 2/2] Custom PowerSeq Ranges (#75) --- README.md | 1 + lusSTR/cli/config.py | 6 + lusSTR/data/config.yaml | 1 + lusSTR/data/str_markers.json | 122 ++++++ lusSTR/scripts/marker.py | 124 +++++- .../custom/Sample1_evidence_ngs.csv | 14 + .../custom/test_stutter_custom_range.txt | 21 ++ .../custom/test_stutter_sequence_info.csv | 19 + lusSTR/tests/test_filters.py | 19 + lusSTR/tests/test_marker.py | 352 ++++++++++++++++++ lusSTR/workflows/strs.smk | 17 +- lusSTR/wrappers/convert.py | 80 +++- lusSTR/wrappers/filter.py | 35 +- setup.py | 1 + 14 files changed, 769 insertions(+), 43 deletions(-) create mode 100644 lusSTR/tests/data/NGS_stutter_test/custom/Sample1_evidence_ngs.csv create mode 100644 lusSTR/tests/data/NGS_stutter_test/custom/test_stutter_custom_range.txt create mode 100644 lusSTR/tests/data/NGS_stutter_test/custom/test_stutter_sequence_info.csv diff --git a/README.md b/README.md index 8e5b4d78..18e1dad2 100755 --- a/README.md +++ b/README.md @@ -62,6 +62,7 @@ analysis_software: ```uas``` (```uas/straitrazor/genemarker```); indicates the a sex: ```False``` (True/False); include sex-chromosome STRs (invoke ```--sex``` flag) samp_input: ```/path/to/input/directory/or/samples``` input directory or sample; if not provided, will be current working directory (indicate using ```--input path/to/dir``` ) output: ```lusstr_output``` output file/directory name (indicate using ```--out dir/sampleid e.g. --out test_030923```) +custom: ```False``` (True/False); use custom sequence ranges as defined in the STR markers json file (```str_markers.json```). ### convert settings kit: ```forenseq``` (forenseq/powerseq) (invoke the ```--powerseq``` flag if using PowerSeq data) diff --git a/lusSTR/cli/config.py b/lusSTR/cli/config.py index 179fd532..fa81c160 100644 --- a/lusSTR/cli/config.py +++ b/lusSTR/cli/config.py @@ -93,6 +93,8 @@ def edit_str_config(config, args): data["output_type"] = args.software if args.strand: data["strand"] = args.strand + if args.custom: + data["custom_ranges"] = args.custom return data @@ -175,3 +177,7 @@ def subparser(subparsers): help="Specify the strand orientation for the final output files. UAS orientation is " "default for STRs; forward strand is default for SNPs." ) + p.add_argument( + "--custom", action="store_true", + help="Specifying custom sequence ranges." + ) diff --git a/lusSTR/data/config.yaml b/lusSTR/data/config.yaml index 82e0d2fb..d851456d 100644 --- a/lusSTR/data/config.yaml +++ b/lusSTR/data/config.yaml @@ -9,6 +9,7 @@ output: "lusstr_output" ## output file/directory name; Example: "test_030923" ##convert settings kit: "forenseq" ## forenseq/powerseq +custom_ranges: False ## True/False; use custom ranges, for PowerSeq sequences only nocombine: False ## True/False; do not combine identical sequences (if using STRait Razor data) ##filter settings diff --git a/lusSTR/data/str_markers.json b/lusSTR/data/str_markers.json index 1fe9a099..38c613d5 100644 --- a/lusSTR/data/str_markers.json +++ b/lusSTR/data/str_markers.json @@ -14,6 +14,8 @@ "Foren_3": 14, "Power_5": 13, "Power_3": 112, + "Custom_5": 6, + "Custom_3": 5, "Alleles": ["6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16"] }, "D10S1248": { @@ -31,6 +33,8 @@ "Foren_3": 23, "Power_5": 54, "Power_3": 0, + "Custom_5": 0, + "Custom_3": 0, "Alleles": [ "7", "8", "9", "10", "11", "12", "12.3", "13", "14", "15", "16", "17", "18", "19" ] @@ -51,6 +55,8 @@ "Foren_3": 73, "Power_5": 16, "Power_3": 7, + "Custom_5": 9, + "Custom_3": 4, "Alleles": [ "13", "14", "15", "15.1", "16", "16.3", "17", "17.1", "17.3", "18", "18.1", "18.3", "19", "19.1", "19.2", "19.3", "20", "20.1", "20.3", "21", "21.1", "22", @@ -77,6 +83,8 @@ "Foren_3": 8, "Power_5": 70, "Power_3": 45, + "Custom_5": 4, + "Custom_3": 5, "Alleles": ["6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "28.2"] }, "D16S539": { @@ -94,6 +102,8 @@ "Foren_3": 36, "Power_5": 137, "Power_3": 4, + "Custom_5": 0, + "Custom_3": 0, "Alleles": ["5", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16"] }, "D17S1301": { @@ -128,6 +138,8 @@ "Foren_3": 48, "Power_5": 66, "Power_3": 45, + "Custom_5": 4, + "Custom_3": 8, "Alleles": [ "9", "10", "11", "12", "12.2", "13", "13.2", "14", "14.2", "15", "15.2", "16", "16.2", "17", "17.2", "18", "18.1", "18.2", "19", "20", "20.2", "21", "21.2", "22", @@ -151,6 +163,8 @@ "Foren_3": 54, "Power_5": 59, "Power_3": 37, + "Custom_5": 5, + "Custom_3": 5, "Alleles": [ "4", "8", "9", "10", "11", "11.2", "12", "12.1", "12.2", "12.3", "13", "13.2", "14", "14.2", "15", "15.2", "15.3", "16", "16.2", "17", "17.2", "18", "18.2" @@ -175,6 +189,8 @@ "Foren_3": 0, "Power_5": 45, "Power_3": 21, + "Custom_5": -10, + "Custom_3": 0, "Alleles": [ "7", "8", "9", "10", "11", "12", "13", "14", "14.3", "15", "15.3", "16", "16.1", "16.3", "17", "17.3", "18", "18.3", "19.3", "20" @@ -211,6 +227,8 @@ "Foren_3": 12, "Power_5": 10, "Power_3": 38, + "Custom_5": 4, + "Custom_3": 6, "Alleles": [ "24.2", "25.2", "26", "26.2", "27", "28", "28.2", "28.3", "29", "29.2", "29.3", "30", "30.2", "30.3", "31", "31.2", "32", "32.2", "33", "33.1", "33.2", "34", @@ -232,6 +250,8 @@ "Foren_3": 20, "Power_5": 51, "Power_3": 15, + "Custom_5": 4, + "Custom_3": 4, "Alleles": ["8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19"] }, "D2S1338": { @@ -250,6 +270,8 @@ "Foren_3": 14, "Power_5": 65, "Power_3": 48, + "Custom_5": 0, + "Custom_3": 0, "Alleles": [ "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28" @@ -271,6 +293,8 @@ "Foren_3": 23, "Power_5": 81, "Power_3": 5, + "Custom_5": 4, + "Custom_3": 5, "Alleles": [ "9", "10", "10.3", "11", "11.3", "12", "12.3", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28" @@ -292,6 +316,8 @@ "Foren_3": 18, "Power_5": 100, "Power_3": 22, + "Custom_5": 6, + "Custom_3": 5, "Alleles": [ "10", "11", "12", "13", "14", "15", "15.2", "16", "16.2", "17", "18", "18.2", "19", "20" @@ -327,6 +353,8 @@ "Foren_3": 7, "Power_5": 35, "Power_3": 81, + "Custom_5": 4, + "Custom_3": 4, "Alleles": ["7", "8", "9", "10", "11", "11.1", "12", "13", "14", "15", "16"] }, "D6S1043": { @@ -364,6 +392,8 @@ "Foren_3": 20, "Power_5": 61, "Power_3": 69, + "Custom_5": 10, + "Custom_3": 4, "Alleles": [ "6", "6.3", "7", "8", "8.1", "9", "9.1", "9.2", "10", "10.1", "10.3", "11", "11.1", "12", "12.1", "13", "14" @@ -385,6 +415,8 @@ "Foren_3": 5, "Power_5": 19, "Power_3": 108, + "Custom_5": 0, + "Custom_3": 0, "Alleles": ["8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18"] }, "D9S1122": { @@ -422,6 +454,8 @@ "Foren_3": 0, "Power_5": 80, "Power_3": 0, + "Custom_5": 0, + "Custom_3": 0, "Alleles": [ "17", "18", "19", "19.2", "20", "21", "22", "22.2", "23", "23.2", "24", "25", "26", "26.2", "27" @@ -442,6 +476,8 @@ "Foren_3": 9, "Power_5": 67, "Power_3": 64, + "Custom_5": -5, + "Custom_3": 0, "Alleles": [ "1.2", "2.2", "3.2", "5", "6", "7", "8", "9", "10", "11", "12", "13", "13.4", "14", "14.1", "15", "16", "17" @@ -462,6 +498,8 @@ "Foren_3": 75, "Power_5": 106, "Power_3": 5, + "Custom_5": 0, + "Custom_3": 0, "Alleles": [ "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "15.4", "16", "16.4", "17", "18", "19", "19.4", "20", "21", "22", "23", "24", "25" @@ -482,6 +520,8 @@ "Foren_3": 12, "Power_5": 10, "Power_3": 162, + "Custom_5": 6, + "Custom_3": 5, "Alleles": ["5", "6", "7", "7.3", "8", "8.3", "9", "9.3", "10", "10.3", "11"] }, "TPOX": { @@ -499,6 +539,8 @@ "Foren_3": 5, "Power_5": 124, "Power_3": 18, + "Custom_5": 0, + "Custom_3": 0, "Alleles": ["5", "6", "7", "8", "9", "10", "11", "12", "13", "14"] }, "VWA": { @@ -518,6 +560,8 @@ "Foren_3": 5, "Power_5": 3, "Power_3": 100, + "Custom_5": -8, + "Custom_3": 0, "Alleles": ["11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21"] }, "Y-GATA-H4": { @@ -527,6 +571,7 @@ "TCTA" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TCTA", "Sec": "", "Tert": "", @@ -534,6 +579,8 @@ "Foren_3": 0, "Power_5": 0, "Power_3": 113, + "Custom_5": 0, + "Custom_3": -39, "Alleles": ["10", "11", "12", "13"] }, "DYS643": { @@ -543,6 +590,7 @@ "CTTTT" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 5, "LUS": "CTTTT", "Sec": "", "Tert": "", @@ -550,6 +598,8 @@ "Foren_3": 8, "Power_5": 63, "Power_3": 33, + "Custom_5": 4, + "Custom_3": 0, "Alleles": ["9", "10", "11", "14"] }, @@ -560,6 +610,7 @@ "TAGA" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TAGA", "Sec": "", "Tert": "", @@ -567,6 +618,8 @@ "Foren_3": 24, "Power_5": 42, "Power_3": 4, + "Custom_5": 6, + "Custom_3": 4, "Alleles": ["19", "20", "21", "22"] }, "DYS612": { @@ -576,6 +629,7 @@ "TCT" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 3, "LUS": "TCT", "Sec": "TCT", "Tert": "", @@ -593,6 +647,7 @@ "AAAG" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "AAAG", "Sec": "", "Tert": "", @@ -600,6 +655,8 @@ "Foren_3": 39, "Power_5": 1, "Power_3": 62, + "Custom_5": 0, + "Custom_3": 0, "Alleles": ["15", "16", "17", "18", "19"] }, "DYS570":{ @@ -609,6 +666,7 @@ "TTTC" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "TTTC", "Sec": "", "Tert": "", @@ -616,6 +674,8 @@ "Foren_3": 14, "Power_5": 51, "Power_3": 5, + "Custom_5": 0, + "Custom_3": 0, "Alleles": ["16", "17", "18", "19", "20"] }, "DYS549": { @@ -625,6 +685,7 @@ "GATA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "GATA", "Sec": "", "Tert": "", @@ -632,6 +693,8 @@ "Foren_3": 49, "Power_5": 110, "Power_3": 9, + "Custom_5": 4, + "Custom_3": 6, "Alleles": ["11", "12", "13", "14"] }, "DYS533": { @@ -641,6 +704,7 @@ "TATC" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "TATC", "Sec": "", "Tert": "", @@ -648,6 +712,8 @@ "Foren_3": 21, "Power_5": 101, "Power_3": 64, + "Custom_5": 7, + "Custom_3": 4, "Alleles": [ "10", "11", "12", "13"] }, "DYS522": { @@ -657,6 +723,7 @@ "ATAG" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "ATAG", "Sec": "", "Tert": "", @@ -671,6 +738,7 @@ "TCCT" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "TCCT", "Sec": "", "Tert": "", @@ -685,6 +753,7 @@ "CTT" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 3, "LUS": "CTT", "Sec": "", "Tert": "", @@ -692,6 +761,8 @@ "Foren_3": 6, "Power_5": 35, "Power_3": 12, + "Custom_5": 0, + "Custom_3": 0, "Alleles": ["20", "21", "22", "23", "24", "25", "26", "28"] }, "DYS460": { @@ -701,6 +772,7 @@ "CTAT" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "CTAT", "Sec": "", "Tert": "", @@ -715,11 +787,14 @@ "GAAA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "GAAA", "Sec": "", "Tert": "", "Power_5": 0, "Power_3": 0, + "Custom_5": 0, + "Custom_3": -58, "Alleles": ["13", "14", "15", "16", "16.2", "17", "17.2", "18", "18.2", "19", "20"] }, "DYS456": { @@ -729,11 +804,14 @@ "AGAT" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "AGAT", "Sec": "", "Tert": "", "Power_5": 0, "Power_3": 0, + "Custom_5": 0, + "Custom_3": -45, "Alleles": ["15", "16", "17", "18"] }, "DYS448": { @@ -743,6 +821,7 @@ "AGAGAT" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 6, "LUS": "AGAGAT", "Sec": "AGAGAT", "Tert": "", @@ -750,6 +829,8 @@ "Foren_3": 5, "Power_5": 12, "Power_3": 17, + "Custom_5": 0, + "Custom_3": 0, "Alleles": ["19", "20", "21", "22"] }, "DYS439": { @@ -759,6 +840,7 @@ "GATA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "GATA", "Sec": "", "Tert": "", @@ -766,6 +848,8 @@ "Foren_3": 23, "Power_5": 0, "Power_3": 25, + "Custom_5": -3, + "Custom_3": 6, "Alleles": ["10", "11", "12", "13"] }, "DYS438": { @@ -775,6 +859,7 @@ "TTTTC" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 5, "LUS": "TTTTC", "Sec": "", "Tert": "", @@ -782,6 +867,8 @@ "Foren_3": 36, "Power_5": 7, "Power_3": 34, + "Custom_5": 4, + "Custom_3": 5, "Alleles": ["8", "9", "10", "11"] }, "DYS437": { @@ -791,6 +878,7 @@ "TCTA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "TCTA", "Sec": "", "Tert": "", @@ -798,6 +886,8 @@ "Foren_3": 105, "Power_5": 10, "Power_3": 76, + "Custom_5": 4, + "Custom_3": 6, "Alleles": ["13", "14", "15", "16"] }, "DYS393": { @@ -807,11 +897,14 @@ "AGAT" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "AGAT", "Sec": "", "Tert": "", "Power_5": 0, "Power_3": 0, + "Custom_5": 0, + "Custom_3": -25, "Alleles": ["11", "12", "13", "14", "15", "16"] }, "DYS392": { @@ -821,6 +914,7 @@ "ATA" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 3, "LUS": "ATA", "Sec": "", "Tert": "", @@ -828,6 +922,8 @@ "Foren_3": 104, "Power_5": 3, "Power_3": 75, + "Custom_5": 0, + "Custom_3": 0, "Alleles": ["9", "10", "11", "12", "12.2", "13", "14", "15"] }, "DYS391": { @@ -837,6 +933,7 @@ "TCTA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "TCTA", "Sec": "", "Tert": "", @@ -844,6 +941,8 @@ "Foren_3": 29, "Power_5": 0, "Power_3": 58, + "Custom_5": -2, + "Custom_3": 6, "Alleles": ["8", "9", "9.3", "10", "11", "12", "13", "14"] }, "DYS390": { @@ -855,6 +954,7 @@ "GAGA" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TAGA", "Sec": "CAGA", "Tert": "GAGA", @@ -862,6 +962,8 @@ "Foren_3": 0, "Power_5": 23, "Power_3": 54, + "Custom_5": 0, + "Custom_3": -8, "Alleles": ["20", "21", "22", "23", "24", "25", "26", "27"] }, "DYS389II": { @@ -872,6 +974,7 @@ "CAGG" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TAGA", "Sec": "TAGA", "Tert": "CAGG", @@ -879,6 +982,8 @@ "Foren_3": 22, "Power_5": 54, "Power_3": 2, + "Custom_5": 0, + "Custom_3": 0, "Alleles": ["26", "28", "29", "30", "31", "32"] }, "DYS389I": { @@ -888,6 +993,7 @@ "TAGA" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TAGA", "Sec": "", "Tert": "", @@ -895,6 +1001,8 @@ "Foren_3": 18, "Power_5": 0, "Power_3": 5, + "Custom_5": 0, + "Custom_3": 0, "Alleles": ["10", "11", "12", "13", "14"] }, "DYS385A-B": { @@ -904,6 +1012,7 @@ "TTTC" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TTTC", "Sec": "", "Tert": "", @@ -911,6 +1020,8 @@ "Foren_3": 23, "Power_5": 0, "Power_3": 23, + "Custom_5": -8, + "Custom_3": -119, "Alleles": ["12", "13", "14", "15", "16", "17"] }, "DYS19": { @@ -920,6 +1031,7 @@ "TCTA" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TCTA", "Sec": "", "Tert": "", @@ -927,6 +1039,8 @@ "Foren_3": 0, "Power_5": 26, "Power_3": 49, + "Custom_5": 0, + "Custom_3": 0, "Alleles": ["12", "13", "13.2", "14", "14.2", "15", "16", "17", "18", "19"] }, "DYF387S1": { @@ -937,6 +1051,7 @@ "GAAG" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "AAAG", "Sec": "GAAG", "Tert": "", @@ -951,6 +1066,7 @@ "ATCT" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "ATCT", "Sec": "", "Tert": "", @@ -965,6 +1081,7 @@ "ATAG" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "ATAG", "Sec": "", "Tert": "", @@ -979,6 +1096,7 @@ "TGGA" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TGGA", "Sec": "", "Tert": "", @@ -993,6 +1111,7 @@ "TAGA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "TAGA", "Sec": "", "Tert": "", @@ -1008,6 +1127,7 @@ "AAGG" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "AAGA", "Sec": "AAGG", "Tert": "", @@ -1028,6 +1148,7 @@ "AAAA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "AAGA", "Sec": "AAAA", "Tert": "", @@ -1045,6 +1166,7 @@ "TAGA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "TAGA", "Sec": "TAGA", "Tert": "", diff --git a/lusSTR/scripts/marker.py b/lusSTR/scripts/marker.py index 1817faf5..2205789d 100644 --- a/lusSTR/scripts/marker.py +++ b/lusSTR/scripts/marker.py @@ -45,7 +45,7 @@ class UnsupportedSoftwareError(ValueError): class STRMarker: - def __init__(self, locus, sequence, software, kit="forenseq"): + def __init__(self, locus, sequence, software, custom=False, kit="forenseq"): self.locus = locus self.sequence = sequence if locus not in str_marker_data: @@ -59,6 +59,7 @@ def __init__(self, locus, sequence, software, kit="forenseq"): self.kit = kit.lower() if software == "uas" and self.data["ReverseCompNeeded"] == "Yes": self.sequence = reverse_complement(sequence) + self.custom = custom @property def repeat_size(self): @@ -116,6 +117,20 @@ def uas_sequence(self): return reverse_complement(self.forward_sequence) return self.forward_sequence + @property + def custom_sequence(self): + """Custom range for sequences; PowerSeq sequences only""" + if self.custom: + front, back = self._uas_bases_to_trim() + custom_front = front - self.data["Custom_5"] + custom_back = back - self.data["Custom_3"] + if custom_back == 0: + custom_back = None + else: + custom_back *= -1 + return self.sequence[custom_front:custom_back] + return None + @property def flankseq_5p(self): if self.software == "uas": @@ -294,6 +309,17 @@ def convert_uas(self): return reverse_complement_bracketed(self.convert) return self.convert + @property + def custom_brack(self): + if self.custom and self.data["Custom_5"] == 0 and self.data["Custom_3"] == 0: + return self.convert + elif self.custom: + bases = 4 if self.locus == "CSF1PO" else self.data["NumBasesToSeparate"] + final = sequence_to_bracketed_form(self.custom_sequence, bases, self.repeats) + final_string = re.sub(r" +", " ", final) + return final_string + return None + @property def designation(self): lus, sec, ter = None, None, None @@ -319,8 +345,10 @@ def summary(self): return [ self.uas_sequence, self.forward_sequence, + self.custom_sequence, self.convert_uas, self.convert, + self.custom_brack, canon, lus_final_output, lus_plus, @@ -402,6 +430,29 @@ def convert(self): bracketed_form = "" return bracketed_form + @property + def custom_brack(self): + if self.custom: + if len(self.uas_sequence) < 110: + bracketed_form = collapse_repeats_by_length(self.custom_sequence, 4) + else: + if "GGGCTGCCTA" in self.custom_sequence: + break_point = self.custom_sequence.index("GGGCTGCCTA") + 10 + bracketed_form = ( + f"{collapse_repeats_by_length(self.custom_sequence[:break_point], 4)} " + f"{collapse_repeats_by_length(self.custom_sequence[break_point:], 4)}" + ) + elif "TTTT" in self.uas_sequence: + break_point = self.uas_sequence.index("TTTT") + 18 + bracketed_form = ( + f"{collapse_repeats_by_length(self.custom_sequence[:break_point], 4)} " + f"{collapse_repeats_by_length(self.custom_sequence[break_point:], 4)}" + ) + else: + bracketed_form = collapse_repeats_by_length(self.custom_sequence, 4) + return bracketed_form + return None + class STRMarker_D20S482(STRMarker): @property @@ -878,6 +929,17 @@ def convert(self): elif isinstance(self.canonical, int): return collapse_repeats_by_length(self.uas_sequence, self.repeat_size) + @property + def custom_brack(self): + if self.custom: + if isinstance(self.canonical, str): + return sequence_to_bracketed_form( + self.custom_sequence, self.repeat_size, self.repeats + ) + elif isinstance(self.canonical, int): + return collapse_repeats_by_length(self.custom_sequence, self.repeat_size) + return None + class STRMarker_D21S11(STRMarker): @property @@ -1001,6 +1063,18 @@ def designation(self): count = count return lus_allele, sec_allele, finalcount + @property + def custom_brack(self): + if self.custom: + custom_front = self.custom_sequence[: self.data["Custom_5"]] + custom_back = self.custom_sequence[-self.data["Custom_3"] :] + uas_bracket = self.convert + final_string = ( + f"{custom_front} {uas_bracket} {collapse_repeats_by_length(custom_back, 4)}" + ) + return final_string + return None + class STRMarker_TH01(STRMarker): @property @@ -1037,6 +1111,16 @@ def flank_3p(self): flank = collapse_repeats_by_length(flank_seq, 4) return flank + @property + def custom_brack(self): + if self.custom: + custom_front = self.custom_sequence[: self.data["Custom_5"]] + custom_back = self.custom_sequence[-self.data["Custom_3"] :] + uas_bracket = self.convert + final_string = f"{collapse_repeats_by_length(custom_front, 4)} {uas_bracket} {collapse_repeats_by_length(custom_back, 4)}" + return final_string + return None + class STRMarker_TPOX(STRMarker): @property @@ -1112,6 +1196,16 @@ def convert(self): final_string = " ".join(final) return re.sub(r" +", " ", final_string) + @property + def custom_brack(self): + if self.custom: + custom_front = self.custom_sequence[: self.data["Custom_5"]] + custom_back = self.custom_sequence[-self.data["Custom_3"] :] + uas_bracket = self.convert + final_string = f"{custom_front[:3]} {custom_front[3:5]}{uas_bracket} {custom_back[-5:-1]} {custom_back[-1:]}" + return final_string + return None + class STRMarker_DYS643(STRMarker): @property @@ -1317,10 +1411,13 @@ def canonical(self): def convert(self): sequence = self.forward_sequence if self.kit == "powerseq": - final_seq = ( - f"{collapse_repeats_by_length_flanks(sequence[:6], 4)} " - f"{collapse_repeats_by_length(sequence[6:], 4)}" - ) + if len(sequence) < 6: + final_seq = sequence + else: + final_seq = ( + f"{collapse_repeats_by_length_flanks(sequence[:6], 4)} " + f"{collapse_repeats_by_length(sequence[6:], 4)}" + ) elif len(sequence) % 4 != 0: final_seq = sequence_to_bracketed_form(sequence, self.repeat_size, self.repeats) else: @@ -1352,6 +1449,17 @@ def convert(self): ) return final_string + @property + def custom_brack(self): + if self.custom: + sequence = self.custom_sequence + final_string = ( + f"{collapse_repeats_by_length(sequence[:14], 4)} " + f"{collapse_repeats_by_length(sequence[14:], 4)}" + ) + return final_string + return None + class STRMarker_HPRTB(STRMarker): @property @@ -1629,7 +1737,7 @@ def flank_5p(self): return flank -def STRMarkerObject(locus, sequence, software, kit="forenseq"): +def STRMarkerObject(locus, sequence, software, custom=False, kit="forenseq"): constructors = { "D8S1179": STRMarker_D8S1179, "D13S317": STRMarker_D13S317, @@ -1679,6 +1787,6 @@ def STRMarkerObject(locus, sequence, software, kit="forenseq"): } if locus in constructors: constructor = constructors[locus] - return constructor(locus, sequence, software=software, kit=kit) + return constructor(locus, sequence, software=software, custom=custom, kit=kit) else: - return STRMarker(locus, sequence, software=software, kit=kit) + return STRMarker(locus, sequence, software=software, custom=custom, kit=kit) diff --git a/lusSTR/tests/data/NGS_stutter_test/custom/Sample1_evidence_ngs.csv b/lusSTR/tests/data/NGS_stutter_test/custom/Sample1_evidence_ngs.csv new file mode 100644 index 00000000..b61122a4 --- /dev/null +++ b/lusSTR/tests/data/NGS_stutter_test/custom/Sample1_evidence_ngs.csv @@ -0,0 +1,14 @@ +Locus,CE Allele,Allele Seq,Reads +D8S1179,12.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA,26 +D8S1179,13.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,95 +D8S1179,13.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,89 +D8S1179,14.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,739 +FGA,18.0,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,181 +FGA,20.0,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,1750 +FGA,22.0,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,262 +FGA,23.0,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,1436 +PentaD,13.0,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,1000 +PentaD,14.0,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,22 +PentaE,7.0,TCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTT,505 +TH01,6.0,ATGGTGAATGAATGAATGAATGAATGAATGAGGGA,1632 +TH01,7.0,ATGGTGAATGAATGAATGAATGAATGAATGAATGAGGGA,2197 diff --git a/lusSTR/tests/data/NGS_stutter_test/custom/test_stutter_custom_range.txt b/lusSTR/tests/data/NGS_stutter_test/custom/test_stutter_custom_range.txt new file mode 100644 index 00000000..3d575f1d --- /dev/null +++ b/lusSTR/tests/data/NGS_stutter_test/custom/test_stutter_custom_range.txt @@ -0,0 +1,21 @@ +SampleID Project Analysis Locus UAS_Output_Sequence Forward_Strand_Sequence Custom_Range_Sequence UAS_Output_Bracketed_Notation Forward_Strand_Bracketed_Notation Custom_Bracketed_Notation CE_Allele LUS LUS_Plus Reads +Sample1 Experiment_1.1 Experiment1 D8S1179 TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA [TCTA]2 TCTG [TCTA]11 [TCTA]2 TCTG [TCTA]11 [TCTA]2 TCTG [TCTA]11 14 14_11 14_11_1_0 739 +Sample1 Experiment_1.1 Experiment1 D8S1179 TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTA TCTG [TCTA]12 TCTA TCTG [TCTA]12 TCTA TCTG [TCTA]12 14 14_12 14_12_1_0 130 +Sample1 Experiment_1.1 Experiment1 D8S1179 TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTA TCTG [TCTA]11 TCTA TCTG [TCTA]11 TCTA TCTG [TCTA]11 13 13_11 13_11_1_0 95 +Sample1 Experiment_1.1 Experiment1 D8S1179 TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA [TCTA]2 TCTG [TCTA]10 [TCTA]2 TCTG [TCTA]10 [TCTA]2 TCTG [TCTA]10 13 13_10 13_10_1_0 89 +Sample1 Experiment_1.1 Experiment1 D8S1179 TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA [TCTA]2 TCTG [TCTA]9 [TCTA]2 TCTG [TCTA]9 [TCTA]2 TCTG [TCTA]9 12 12_9 12_9_1_0 26 +Sample1 Experiment_1.1 Experiment1 D8S1179 TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTA TCTG [TCTA]10 TCTA TCTG [TCTA]10 TCTA TCTG [TCTA]10 12 12_10 12_10_1_0 11 +Sample1 Experiment_1.1 Experiment1 FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA [TTTC]3 TTTT TTCT [CTTT]12 CTCC [TTCC]2 [GGAA]2 GGAG [AAAG]12 AGAA AAAA [GAAA]3 [GGAA]2 GGAG [AAAG]12 AGAA AAAA [GAAA]3 20 20_12 20_12_3_0 1750 +Sample1 Experiment_1.1 Experiment1 FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA [TTTC]3 TTTT TTCT [CTTT]15 CTCC [TTCC]2 [GGAA]2 GGAG [AAAG]15 AGAA AAAA [GAAA]3 [GGAA]2 GGAG [AAAG]15 AGAA AAAA [GAAA]3 23 23_15 23_15_3_0 1436 +Sample1 Experiment_1.1 Experiment1 FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA [TTTC]3 TTTT TTCT [CTTT]14 CTCC [TTCC]2 [GGAA]2 GGAG [AAAG]14 AGAA AAAA [GAAA]3 [GGAA]2 GGAG [AAAG]14 AGAA AAAA [GAAA]3 22 22_14 22_14_3_0 262 +Sample1 Experiment_1.1 Experiment1 FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA [TTTC]3 TTTT TTCT [CTTT]13 CTCC [TTCC]2 [GGAA]2 GGAG [AAAG]13 AGAA AAAA [GAAA]3 [GGAA]2 GGAG [AAAG]13 AGAA AAAA [GAAA]3 21 21_13 21_13_3_0 48 +Sample1 Experiment_1.1 Experiment1 FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA [TTTC]3 TTTT TTCT [CTTT]10 CTCC [TTCC]2 [GGAA]2 GGAG [AAAG]10 AGAA AAAA [GAAA]3 [GGAA]2 GGAG [AAAG]10 AGAA AAAA [GAAA]3 18 18_10 18_10_3_0 181 +Sample1 Experiment_1.1 Experiment1 FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA [TTTC]3 TTTT TTCT [CTTT]9 CTCC [TTCC]2 [GGAA]2 GGAG [AAAG]9 AGAA AAAA [GAAA]3 [GGAA]2 GGAG [AAAG]9 AGAA AAAA [GAAA]3 17 17_9 17_9_3_0 15 +Sample1 Experiment_1.1 Experiment1 TH01 AATGAATGAATGAATGAATGAATGAATG AATGAATGAATGAATGAATGAATGAATG ATGGTGAATGAATGAATGAATGAATGAATGAATGAGGGA [AATG]7 [AATG]7 ATGG TG [AATG]7 AGGG A 7 7_7 7_7 2197 +Sample1 Experiment_1.1 Experiment1 TH01 AATGAATGAATGAATGAATGAATG AATGAATGAATGAATGAATGAATG ATGGTGAATGAATGAATGAATGAATGAATGAGGGA [AATG]6 [AATG]6 ATGG TG [AATG]6 AGGG A 6 6_6 6_6 1632 +Sample1 Experiment_1.1 Experiment1 TH01 AATGAATGAATGAATGAATG AATGAATGAATGAATGAATG ATGGTGAATGAATGAATGAATGAATGAGGGA [AATG]5 [AATG]5 ATGG TG [AATG]5 AGGG A 5 5_5 5_5 66 +Sample1 Experiment_1.1 Experiment1 VWA TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA TAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGA TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGA TCTA [TCTG]3 [TCTA]12 TCCA TCTA TAGA TGGA [TAGA]12 [CAGA]3 TAGA [TAGA]12 [CAGA]3 TAGA 16 16_12 16_12_3_1 6 +Sample1 Experiment_1.1 Experiment1 TPOX AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG [AATG]11 [AATG]11 [AATG]11 11 11_11 11_11 15 +Sample1 Experiment_1.1 Experiment1 PENTA E AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA TCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTT TCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTT [AAAGA]7 [TCTTT]7 [TCTTT]7 7 7_7 7_7 505 +Sample1 Experiment_1.1 Experiment1 PENTA D AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA AAAAG [AAAGA]13 AAAAG [AAAGA]13 [AAAGA]13 13 13_13 13_13 1000 +Sample1 Experiment_1.1 Experiment1 PENTA D AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA AAAAG [AAAGA]13 AAAAG [AAAGA]13 [AAAGA]14 14 14_14 14_14 22 \ No newline at end of file diff --git a/lusSTR/tests/data/NGS_stutter_test/custom/test_stutter_sequence_info.csv b/lusSTR/tests/data/NGS_stutter_test/custom/test_stutter_sequence_info.csv new file mode 100644 index 00000000..b03f67e8 --- /dev/null +++ b/lusSTR/tests/data/NGS_stutter_test/custom/test_stutter_sequence_info.csv @@ -0,0 +1,19 @@ +SampleID,Locus,Custom_Range_Sequence,CE_Allele,Custom_Bracketed_Notation,Reads,allele_type,parent_allele1,parent_allele2,allele1_ref_reads,allele2_ref_reads,perc_noise,perc_stutter +Sample1,D8S1179,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,14.0,[TCTA]2 TCTG [TCTA]11,739,real_allele,,,,,, +Sample1,D8S1179,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,13.0,TCTA TCTG [TCTA]11,95,-1_stutter,[TCTA]2 TCTG [TCTA]11,,739.0,,,0.129 +Sample1,D8S1179,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,13.0,[TCTA]2 TCTG [TCTA]10,89,-1_stutter,[TCTA]2 TCTG [TCTA]11,,739.0,,,0.12 +Sample1,D8S1179,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA,12.0,[TCTA]2 TCTG [TCTA]9,26,-2_stutter,[TCTA]2 TCTG [TCTA]11,,739.0,,,0.035 +Sample1,D8S1179,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,12.0,TCTA TCTG [TCTA]10,11,BelowAT,,,,,0.01, +Sample1,FGA,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,23.0,[GGAA]2 GGAG [AAAG]15 AGAA AAAA [GAAA]3,1436,real_allele,,,,,, +Sample1,FGA,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,22.0,[GGAA]2 GGAG [AAAG]14 AGAA AAAA [GAAA]3,262,-1_stutter,[GGAA]2 GGAG [AAAG]15 AGAA AAAA [GAAA]3,,1436.0,,,0.182 +Sample1,FGA,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,21.0,[GGAA]2 GGAG [AAAG]13 AGAA AAAA [GAAA]3,48,BelowAT,,,,,0.013, +Sample1,FGA,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,20.0,[GGAA]2 GGAG [AAAG]12 AGAA AAAA [GAAA]3,1750,real_allele,,,,,, +Sample1,FGA,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,18.0,[GGAA]2 GGAG [AAAG]10 AGAA AAAA [GAAA]3,181,real_allele,,,,,, +Sample1,FGA,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,17.0,[GGAA]2 GGAG [AAAG]9 AGAA AAAA [GAAA]3,15,BelowAT,,,,,0.004, +Sample1,PENTA D,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,14.0,[AAAGA]14,22,+1_stutter,[AAAGA]13,,1000.0,,,0.022 +Sample1,PENTA D,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,13.0,[AAAGA]13,1000,real_allele,,,,,, +Sample1,PENTA E,TCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTT,7.0,[TCTTT]7,505,real_allele,,,,,, +Sample1,TH01,ATGGTGAATGAATGAATGAATGAATGAATGAATGAGGGA,7.0,ATGG TG [AATG]7 AGGG A,2197,real_allele,,,,,, +Sample1,TH01,ATGGTGAATGAATGAATGAATGAATGAATGAGGGA,6.0,ATGG TG [AATG]6 AGGG A,1632,real_allele,,,,,, +Sample1,TH01,ATGGTGAATGAATGAATGAATGAATGAGGGA,5.0,ATGG TG [AATG]5 AGGG A,66,BelowAT,,,,,0.017, +Sample1,TPOX,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,11.0,[AATG]11,15,BelowAT,,,,,1.0, diff --git a/lusSTR/tests/test_filters.py b/lusSTR/tests/test_filters.py index 195f0e70..b1095e07 100644 --- a/lusSTR/tests/test_filters.py +++ b/lusSTR/tests/test_filters.py @@ -223,6 +223,25 @@ def test_STRmixoutput_format(outputdir, datatype, tmp_path): assert filecmp.cmp(exp_info_out, obs_info_out) is True +def test_STRmixoutput_customranges(tmp_path): + str_path = str(tmp_path / "WD") + inputfile = data_file("NGS_stutter_test/custom/test_stutter_custom_range.txt") + exp_out = data_file("NGS_stutter_test/custom/Sample1_evidence_ngs.csv") + exp_info_out = data_file("NGS_stutter_test/custom/test_stutter_sequence_info.csv") + obs_out = str(tmp_path / f"WD/test_stutter/Sample1_evidence_ngs.csv") + obs_info_out = str(tmp_path / f"WD/test_stutter/test_stutter_sequence_info.csv") + arglist = ["config", "-w", str_path, "--input", "WD", "-o", "test_stutter", "--custom"] + lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist)) + shutil.copyfile(inputfile, os.path.join(str_path, "test_stutter.csv")) + shutil.copyfile(inputfile, os.path.join(str_path, "test_stutter.txt")) + shutil.copyfile(inputfile, os.path.join(str_path, "test_stutter_custom_range.csv")) + shutil.copyfile(inputfile, os.path.join(str_path, "test_stutter_custom_range.txt")) + all_arglist = ["strs", "all", "-w", str_path] + lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(all_arglist)) + assert filecmp.cmp(exp_out, obs_out) is True + assert filecmp.cmp(exp_info_out, obs_info_out) is True + + def test_nofilters(tmp_path): str_path = str(tmp_path / "WD") inputfile = data_file("test_stutter.txt") diff --git a/lusSTR/tests/test_marker.py b/lusSTR/tests/test_marker.py index cb01e362..926cc7c0 100644 --- a/lusSTR/tests/test_marker.py +++ b/lusSTR/tests/test_marker.py @@ -1467,3 +1467,355 @@ def test_new_power_config(locus, sequence, bracketed, conc, lus, sec, tert, flan assert marker.designation == (lus, sec, tert) assert marker.flank_5p == flank_5 assert marker.flank_3p == flank_3 + + +@pytest.mark.parametrize( + "locus, sequence, cust_seq, bracketed", + [ + ( + "VWA", + "GGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT" + "CCAAGTCACATACTGATTATTCTTATCATCCACTAGGGCTTTCACATCTCAGCCAAGTCAACTTGGATCCTCTAGACCTGTTTCTTCT" + "TCTGGAA", + "TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGA", + "[TAGA]11 [CAGA]4 TAGA", + ), + ( + "TPOX", + "CACTGGCCTGTGGGTCCCCCCATAGATCGTAAGCCCAGGAGGAAGGGCTGTGTTTCAGGGCTGTGATCACTAGCAC" + "CCAGAACCGTCGACTGGCACAGAACAGGCACTTAGGGAACCCTCACTGAATGAATGAATGAATGAATGAATGAATGAATGTTTG" + "GGCAAATAAACGCT", + "AATGAATGAATGAATGAATGAATGAATGAATG", + "[AATG]8", + ), + ( + "TH01", + "CTCCATGGTGAATGAATGAATGAATGAATGAATGAATGAGGGAAATAAGGGAGGAACAGGCCAATGGGAATCACCC" + "CAGAGCCCAGATACCCTTTGAATTTTGCCCCCTATTTGCCCAGGACCCCCCACCATGAGCTGCTGCTAGAGCCTGGGAAGGGCC" + "TTGGGGCTGCCTCCCCAAGCAGGCAGGCTGGTTGGGGTGC", + "ATGGTGAATGAATGAATGAATGAATGAATGAATGAGGGA", + "ATGG TG [AATG]7 AGGG A", + ), + ( + "TH01", + "CTCCATGGTGAATGAATGAATGAATGAATGAATGAATGAATGAATGAGGGAAATAAGGGAGGAACAGGCCAATGGG" + "AATCACCCCAGAGCCCAGATACCCTTTGAATTTTGCCCCTATTTGCCCAGGACCCCCCACCATGAGCTGCTGCTAGAGCCTGGG" + "AAGGGCCTTGGGGCTGCCTCCCCAAGCAGGCAGGCTGGTTGGGGTGC", + "ATGGTGAATGAATGAATGAATGAATGAATGAAT" "GAATGAATGAGGG", + "ATGG TG [AATG]8 AAT GAGG G", + ), + ( + "PENTA E", + "TAATGATTACATAACATACATGTGTGTAAAGTGCTTAGTATCATGATTGATACATGGAAAGAATTCTCTTATT" + "TGGGTTATTAATTGAGAAAACTCCTTACAATTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTGAGAC", + "TCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTT", + "[TCTTT]7", + ), + ( + "PENTA D", + "GAGCCATGATCACACCACTACACTCCAGCCTAGGTGACAGAGCAAGACACCATCTCAAGAAAGAAAAAAAAGA" + "AAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAAACGAAGGGGAAAAAAAGAGAATCATAAACATAAATG" + "TAAAATTTCTCAAAAAAATCGTTA", + "AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA", + "[AAAGA]9", + ), + ( + "FGA", + "GTCTGAAATCGAAAATATGGTTATTGAAGTAGCTGCTGAGTGATTTGTCTGTAATTGCCAGCAAAAAAGAAAGGAAG" + "AAAGGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA", + "GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA", + "[GGAA]2 GGAG [AAAG]12 AGAA AAAA [GAAA]3", + ), + ( + "D8S1179", + "TTTCATGTGTACATTCGTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC" + "TATTCCCCACAGTGAAAATAATCTACAGGATAGGTAAATAAATTAAGGCATATTCACGCAATGGGATACGATACAGTGATGAAA" + "ATGAACTAATTATAGCTACGTGAAAC", + "TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC" "TA", + "TCTA TCTG [TCTA]12", + ), + ( + "D7S820", + "AGAATTGCACCAAATATTGGTAATTAAATGTTTACTATAGACTATTTAGTGAGATAAAAAAAAACTATCAATCT" + "GTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCGTTAGTTCGTTCTAAACTATGACAAGTGTTCTA" + "TCATACCCTTTATATATATTAACCTTAAAATAACTC", + "AGATAAAAAAAAACTATCAATCTGTCTATCTATCTATCTATCTA" "TCTATCTATCTATCTATCTATCTATCTATCGTTA", + "AGAT [AAAA]2 AC TATC AATC TGTC [TATC]12 GTTA", + ), + ( + "D5S818", + "AACATTTGTATCTTTATCTGTATCCTTATTTATACCTCTATCTATCTATCTATCTATCTATCTATCTATCTATC" + "TATCTATCTTCAAAATATTACGTAAGGATACCAAAGAGGAAAATCACCCTTGTCACATACTTGCTATTAAAATATACTTTTATT" + "AGTACA", + "ATACCTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTTCAA", + "ATAC CTCT [ATCT]11 TCAA", + ), + ( + "D3S1358", + "TGCCCACTTCTGCCCAGGGATCTATTTTTCTGTGGTGTGTATTCCCTGTGCCTTTGGGGGCATCTCTTATACT" + "CATGAAATCAACAGAGGCTTGCATGTATCTATCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAT" + "CTATGAGACAGGGTCTTGCTCTGTC", + "CATGTATCTATCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTAT" "CTATCTATCTATGAGA", + "CATG TA TCTA [TCTG]2 [TCTA]12 TGAG A", + ), + ( + "D2S441", + "TGCACCCAACATTCTAACAAAAGGCTGTAACAAGGGCTACAGGAATCATGAGCCAGGAACTGTGGCTCATCTAT" + "GAAAACTTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATTTATCTATCTATATCA", + "AACTTCTATCTA" "TCTATCTATCTATCTATCTATCTATCTATCTATCTATTTATCTATCTATATCA", + "AACT [TCTA]11 TTTA [TCTA]2 TATC A", + ), + ( + "D2S1338", + "CTAGCATGGTACCTGCAGGTGGCCCATAATAATGAGTTATTCAGTAAGTTAAAGGATTGCAGGAGGGAAGGAA" + "GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCAGGCAGGCAGGCAAGGCCAAGCCATTTCTGTTTCCAA" + "ATCCACTGGCTCCCTCCCACAGCT", + "GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCA" "GGCAGGCAGGCA", + "[GGAA]11 [GGCA]6", + ), + ( + "D22S1045", + "CCTTCTTATAGCTGCTATGGGGGCTAGATTTTCCCCGATGATAGTAGTCTCATTATTATTATTATTATTATT" + "ATTATTATTATTATTACTATTATTGTTATAAAAATATTG", + "TCTCATTATTATTATTATTATTATTATTATTATTATTATTA" "CTATTATTGTTA", + "TCT C [ATT]12 ACT [ATT]2 GTT A", + ), + ( + "D21S11", + "TGAATTGCCTTCTATCTATCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATC" + "TATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCGTCTATCTATCCAGT" + "CTATCTACCTCCTATTAGTCT", + "GCCTTCTATCTATCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTATCTATCT" + "ATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCGTCT", + "GCCT [TCTA]6 [TCTG]5 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCA TA [TCTA]11 TCGT CT", + ), + ( + "D1S1656", + "GAAATAGAATCACTAGGGAACCAAATATATATACATACAATTAAACACACACACACCTATCTATCTATCTATC" + "TATCTATCTATCTATCTATCTATCTATCTATCATCTATCTATCTATCTACATCATACAGTTGACCCTTGA", + "CCTATCTATC" "TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCATCTATCTATCTATCTA", + "CCTA [TCTA]11 TCA [TCTA]4", + ), + ( + "D19S433", + "AAGTTCTTTAGCAGTGATTTCTGATATTTTGGTGCACCCATTACCCGAATAAAAATCTTCTCTCTTTCTTCCT" + "CTCTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTACCTTCTTTCCTTCAACAGAATCTTATTC" + "TGTTGCCCAGGCTGGAGTGCA", + "ATCTTCTCTCTTTCTTCCTCTCTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTT" + "CCTTCCTTCCTTCCTACCTTCTTTCCTTCAACA", + "ATC TTCT CTCT TTCT TCCT CTCT [CCTT]12 CCTA CCTT CTTT CCTT CAAC A", + ), + ( + "D18S51", + "AGGCTGCAGTGAGCCATGTTCATGCCACTGCACTTCACTCTGAGTGACAAATTGAGACCTTGTCTCAGAAAGAA" + "AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAGGAAAGAAAGAGAAAAAGAAAAGAAATAGTAGCAA" + "CTGTTATTGTA", + "TCTCAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAGGAAAGAAA", + "TCTC [AGAA]12 AAAG AGAG AGGA AAGA AA", + ), + ( + "D16S539", + "GTGCACAAATCTAAATGCAGAAAAGCACTGAAAGAAGAATCCAGAAAACCACAGTTCCCATTTTTATATGGGA" + "GCAAACAAAGGCAGATCCCAAGCTCTTCCTCTTCCCTAGATCAATACAGACAGACAGACAGGTGGATAGATAGATAGATAGATA" + "GATAGATAGATAGATATCAT", + "GATAGATAGATAGATAGATAGATAGATAGATAGATA", + "[GATA]9", + ), + ( + "D13S317", + "TTCTTTAGTGGGCATCCGTGACTCTCTGGACTCTGACCCATCTAACGCCTATCTGTATTTACAAATACATTAT" + "CTATCTATCTATCTATCTATCTATCTATCTATCAATCAATCATCTATCTATCTTTCTGTCTGTCTTTTTGGGCTGCCTATGGCT" + "CAACCCAAGTTGAAGGAGGAGATTT", + "ACATTATCTATCTATCTATCTATCTATCTATCTATCTATCAATCAATCATCTATC" "TATCTTTCTGTCTGTCTTTTT", + "ACAT [TATC]9 [AATC]2 [ATCT]3 TTCT [GTCT]2 TTTT", + ), + ( + "D12S391", + "CAATGGATGCATAGGTAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACA" + "GACAGACAGACAGACAGATGAGAGGG", + "TGCATAGGTAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA" + "GATAGACAGACAGACAGACAGACAGACAGATGAGA", + "TGCA TAGG T [AGAT]12 [AGAC]6 AGAT GAGA", + ), + ( + "D10S1248", + "CCCCAGGACCAATCTGGTCACAAACATATTAATGAATTGAACAAATGAGTGAGTGGAAGGAAGGAAGGAAGG" + "AAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA", + "GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA", + "[GGAA]14", + ), + ( + "CSF1PO", + "CTAAGTACTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTTCTATCTA" + "TGAAGGCAGTTACTGTTAATATCTTCATTTTACAGGTAGGAAAACTGAGACACAGGGTGGTTAGCAACCTGCTAGTCCTTGGCA" + "GACTCAG", + "CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCT", + "CTTC CT [ATCT]10 A ATCT", + ), + ( + "Y-GATA-H4", + "TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTACCTACCTACCTATCTATCTATAG" + "ATCTATCTATCTATCTTAAATTTGGAAATTCTCCTCAGCATAACATTTTAATGATGATTCCTAGGATACAAGTGATGTGCTGAA" + "AGTATCAATGTGTATCAGAAAACCAACATCTCTGCTTAGGTCTCT", + "TCTATCTATCTATCTATCTATCTATCTATCTATCT" "ATCTATCTATCTA", + "[TCTA]12", + ), + ( + "DYS643", + "ACCTCATGCTCTGTGATTTTTGCAGGTGTTCACTGCAAGCCATGCCTGGTTAAACTACTGTGCCTTTTCTTTTC" + "TTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTCTTTTTAAAACTTTTTACTTCAGTAGAATTTTGGGGGG", + "GTGCCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTCTTTT", + "GTGC [CTTTT]10 CTTT CTTTT", + ), + ( + "DYS635", + "CCCAAATATCCATCAATCAATGAATGGATAAAGAAAATGTGATAGATAGATAGATAGATAGATAGATAGATAGA" + "TAGATAGATACATACATAGATAGATACATACATAGATAGATACATACATAGATAGATAGATAGAGATT", + "ATGTGATAGATA" + "GATAGATAGATAGATAGATAGATAGATAGATAGATACATACATAGATAGATACATACATAGATAGATACATACATAGATAGATA" + "GATAGAGATT", + "ATGT GA [TAGA]10 [TACA]2 [TAGA]2 [TACA]2 [TAGA]2 [TACA]2 [TAGA]4 GATT", + ), + ( + "DYS576", + "AAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAA" + "GCCAAGACAAATACGCTTATTACTCCCATCTCCTCCTTCATCTCCAGGAAATGAGAC", + "AAAGAAAGAAAGAAAGAAAGAAA" "GAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAG", + "[AAAG]17", + ), + ( + "DYS570", + "TAAAATGAATGATGACTAGGTAGAAATCCTGGCTGTGTCCTCCAAGTTCCTTTTCTTTCTTTCTTTCTTTCTTT" + "CTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTTT", + "TTTCTTTCTTTCTTTCTTTCTTTCTTTCTT" "TCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTC", + "[TTTC]17", + ), + ( + "DYS549", + "GTAAAGAACTATAAAAAGATTAATACAACAAAAATTTGGTAATCTGAAATAATAAGGTAGACATAGCAATTAGG" + "TAGGTAAAGAGGAAGATGATAGATGATTAGAAAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA" + "GATAGAAAAAATC", + "AGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAAAAA", + "AGAT [GATA]13 GAAA AA", + ), + ( + "DYS533", + "CTAATATTTATCTATATCATTCTAATTATGTCTCTTCTAACTATATAACTATGTATTATCTATCAATCTTCTAC" + "CTATCATCTTTCTAGCTAGCTATCATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCATCTATCAT" + "CTTCTATTGTTTGGTTGAGTTAAGAACTGATCATGAATAAATACATTTCATTGGT", + "TATCATCTATCTATCTATCTATCTA" "TCTATCTATCTATCTATCTATCTATCTATCATCT", + "TATC ATC [TATC]12 ATCT", + ), + ( + "DYS481", + "TAAAAGGAATGTGGCTAACGCTGTTCAGCATGCTGCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTT" + "CTTCTTCTTCTTCTTCTTCTTCTTCTTTTTTGAGTCTTG", + "CTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCT" "TCTTCTTCTTCTTCTTCTTCTTCTT", + "[CTT]22", + ), + ( + "DYS458", + "GAAAGAAAGAAAAGGAAGGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAA" + "GAAAGAAAGAAAGGAGGGTGGGCGTGGTGGCTCATGCTTGTAATGCCAGAACTTTGGGAGGCCGAGGTGG", + "GAAAGAAAGA" + "AAAGGAAGGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAA", + "[GAAA]3 AG GAAG [GAAA]17", + ), + ( + "DYS456", + "AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATTCCATTAGTTCT" + "GTCCCTCTAGAGAACCCTAATACATCAGTTTAAGAA", + "AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT" "AGATAGATAGATAGATATTCC", + "[AGAT]15 ATTC C", + ), + ( + "DYS448", + "AGACATGGATAAAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAG" + "AGATATAGAGATAGAGAGATAGAGATAGAGATAGATAGATAGAGAAAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAG" + "AGATAGAGATAGAGAGGTAAAGATAGA", + "AGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGA" + "TAGAGATAGAGATATAGAGATAGAGAGATAGAGATAGAGATAGATAGATAGAGAAAGAGATAGAGATAGAGATAGAGATAGAGA" + "TAGAGATAGAGATAGAGAT", + "[AGAGAT]11 [ATAGAG]2 [AGATAG]3 ATAGAT AGAGAA [AGAGAT]8", + ), + ( + "DYS439", + "AAATAGAAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAAAGTATAAGTAA" + "AGAGATGATGG", + "TAGAAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAAAGT", + "TAGA A [GATA]13 GAAA GT", + ), + ( + "DYS438", + "CAGTATATTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTATTTGA" + "AATGGAGTTTCACTCTTGTTGCCCAGG", + "TATATTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTT" "CTTTTCTTTTCTATTT", + "TATA [TTTTC]12 TATTT", + ), + ( + "DYS437", + "GCCCATCCGGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTGTCTGTCTATCTATCTATCTATCAT" + "CTATCATCTGTGAATGATGTCTATCTACTTATCTATGAATGATATTTATCTGTGGTTATCTATCTATCTATA", + "CCGGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTGTCTGTCTATCTATCTATCTATCATCT", + "CCGG [TCTA]9 [TCTG]2 [TCTA]4 TCAT CT", + ), + ( + "DYS393", + "CGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATGTATGTCTTTTCTATGAGAC" "ATA", + "CGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT", + "CGAT [AGAT]12", + ), + ( + "DYS392", + "TAAATAATAATAATAATAATAATAATAATAATAATAATAATAAATAAATGGTGATACAAGAAAAAAATTTGTTT" + "TCCTTCTTGGCTTTTAAATAACAAACACTTGAAATCAAATTAG", + "ATAATAATAATAATAATAATAATAATAATAATAATAA" "TA", + "[ATA]13", + ), + ( + "DYS391", + "TGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTGCCTATCTGCCTGCCTACCTA" + "TCCCTCTATGGCAATTGCTTGCAACCAGGGAGATTTTA", + "TCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATC" "TATCTATCTGCCTATC", + "TCTG [TCTA]11 TCTG CCTA TC", + ), + ( + "DYS390", + "AACAAGGAAAGATAGATAGATGATAGATAGATAGATAGACAGATAGATAGATAGATAGATAGATAGATAGATAG" + "ATAGATAGACAGACAGACAGACAGACAGACAGACAGACAGATAGATAGAATATATTATGGGGTACCAAAATGCAGGGCCCAAAA" + "ATGTGTAAAATATATGTGT", + "TAGATAGATAGATAGACAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAC" + "AGACAGACAGACAGACAGACAGACAGACAGA", + "[TAGA]4 CAGA [TAGA]10 [CAGA]8", + ), + ( + "DYS389II", + "TCATAGATAGATGATGGACTGCTAGATAAATAGATAGATTGATAGAGGGAGGGATAGATAGATAGATAGATA" + "GATAGATAGATAGATAGATAGATAGACAGACAGACAGATACATAGATAATACAGATGAGAGTTGGATACAGAAGTAGGTATAAT" + "GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACA", + "TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATACATAGATAATACAGATGAGAGTTGGA" + "TACAGAAGTAGGTATAATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGA", + "[TAGA]11 [CAGA]3 TACA TAGA TAAT ACAG ATGA GAGT TGGA TACA GAAG TAGG TATA ATGA [TAGA]11" + " [CAGA]5", + ), + ( + "DYS385A-B", + "TTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCCCTTCCTTCCTTCCT" + "TCCTTCCTTTCTTTCTCTTTCCTCTTTCTCTTTCTTCTCTTTCTTTCTTTTTCTCTTTTTCTCTTTCTTTCTTTTTTACTTTCT" + "TTCTCCTTCCTTCCTTCCTTTCTGAATTTCATTTCTTTTCTTT", + "TTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCT" "TTCTTTCTTTC", + "[TTTC]12", + ), + ( + "DYS19", + "TCTGGGTTAAGGAGAGTGTCACTATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTACCTAT" + "CTATCTATCTAAAACACTATATATATATAACACTATATATATAATACTATATATATATTA", + "TCTATCTATCTATCTATCTA" "TCTATCTATCTATCTATCTATCTACCTATCTATCTATCTA", + "[TCTA]11 CCTA [TCTA]3", + ), + ], +) +def test_custom_ranges(locus, sequence, cust_seq, bracketed): + marker = STRMarkerObject(locus, sequence, "straitrazor", custom=True, kit="powerseq") + assert marker.custom_sequence == cust_seq + assert marker.custom_brack == bracketed diff --git a/lusSTR/workflows/strs.smk b/lusSTR/workflows/strs.smk index 9983bee4..b03bbc9f 100644 --- a/lusSTR/workflows/strs.smk +++ b/lusSTR/workflows/strs.smk @@ -14,6 +14,7 @@ software = config["output_type"] prof = config["profile_type"] data = config["data_type"] separate = config["separate"] +custom = config["custom_ranges"] def get_sample_IDs(input, a_software, output, software, separate): @@ -71,6 +72,14 @@ def parse_sample_details(filename): return sampleID +def get_output(): + if custom: + outname = expand("{name}_custom_range.txt", name=output_name) + else: + outname = expand("{name}.txt", name=output_name) + return outname + + rule all: input: expand("{name}.csv", name=output_name), @@ -98,12 +107,13 @@ rule convert: input: rules.format.output output: - expand("{name}.txt", name=output_name) + get_output() params: a_software=config["analysis_software"], sex=config["sex"], nocombine=config["nocombine"], - kit=config["kit"] + kit=config["kit"], + custom=config["custom_ranges"] script: lusSTR.wrapper("convert") @@ -125,7 +135,8 @@ rule filter: info=config["info"], separate=config["separate"], filters=config["nofilters"], - strand=config["strand"] + strand=config["strand"], + custom=config["custom_ranges"] script: lusSTR.wrapper("filter") \ No newline at end of file diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py index bf5fdc59..c499a385 100644 --- a/lusSTR/wrappers/convert.py +++ b/lusSTR/wrappers/convert.py @@ -29,7 +29,7 @@ str_marker_data = json.load(fh) -def format_table(input, software, kit="forenseq"): +def format_table(input, software, kit="forenseq", custom=False): """ Function to format final output table and the flanking report (if necessary). """ @@ -99,7 +99,7 @@ def format_table(input, software, kit="forenseq"): ] flanks_list.append(flank_summary) continue - marker = STRMarkerObject(locus, sequence, software, kit=kit) + marker = STRMarkerObject(locus, sequence, software, custom=custom, kit=kit) if locus == "D12S391" and kit == "powerseq" and software == "straitrazor": if "." in str(marker.canonical): check_sr += 1 @@ -112,11 +112,11 @@ def format_table(input, software, kit="forenseq"): indel_flag = marker.indel_flag if indel_flag == "Possible indel or partial sequence": if locus == "PENTA D" and kit == "powerseq": - marker = check_pentad(marker, sequence, software) + marker = check_pentad(marker, sequence, software, custom) elif locus == "D7S820" and kit == "powerseq": - marker = check_D7(marker, sequence, software) + marker = check_D7(marker, sequence, software, custom) elif locus == "VWA" and kit == "powerseq": - marker = check_vwa(marker, sequence, software) + marker = check_vwa(marker, sequence, software, custom) summary = [sampleid, project, analysis, locus] + marker.summary + [reads] list_of_lists.append(summary) if software != "uas": @@ -134,7 +134,6 @@ def format_table(input, software, kit="forenseq"): indel_flag, ] flanks_list.append(flank_summary) - columns = [ "SampleID", "Project", @@ -142,8 +141,10 @@ def format_table(input, software, kit="forenseq"): "Locus", "UAS_Output_Sequence", "Forward_Strand_Sequence", + "Custom_Range_Sequence", "UAS_Output_Bracketed_Notation", "Forward_Strand_Bracketed_Notation", + "Custom_Bracketed_Notation", "CE_Allele", "LUS", "LUS_Plus", @@ -173,35 +174,45 @@ def format_table(input, software, kit="forenseq"): final_flank_output = sort_table(pd.DataFrame(flanks_list, columns=flanks_columns)) else: final_flank_output = "" + if not custom: + remove_list = ["Custom_Range_Sequence", "Custom_Bracketed_Notation"] + final_output = final_output.drop(remove_list, axis=1) + columns = remove_columns(columns, remove_list) return final_output, final_flank_output, columns -def check_pentad(marker, sequence, software): +def check_pentad(marker, sequence, software, custom): new_marker = marker if marker.summary[1][:4] == "AAAG" and marker.flank_5p[-5:] == "GAAAA": new_sequence = f"{sequence[:66]}-{sequence[66:]}" - new_marker = STRMarkerObject("PENTA D", new_sequence, software, kit="powerseq") + new_marker = STRMarkerObject( + "PENTA D", new_sequence, software, custom=custom, kit="powerseq" + ) elif marker.summary[1][-4:] == "AAAG" and marker.flank_3p[:7] == "AAAAA A": new_sequence = f"{sequence}-" - new_marker = STRMarkerObject("PENTA D", new_sequence, software, kit="powerseq") + new_marker = STRMarkerObject( + "PENTA D", new_sequence, software, custom=custom, kit="powerseq" + ) else: return marker return new_marker -def check_D7(marker, sequence, software): +def check_D7(marker, sequence, software, custom): if marker.summary[1][:3] == "AAC" and marker.flank_5p[-8:] == "T AAAAAA": new_sequence = f"{sequence[:60]}-{sequence[60:]}" - new_marker = STRMarkerObject("D7S820", new_sequence, software, kit="powerseq") + new_marker = STRMarkerObject( + "D7S820", new_sequence, software, custom=custom, kit="powerseq" + ) else: return marker return new_marker -def check_vwa(marker, sequence, software): +def check_vwa(marker, sequence, software, custom): if sequence[:4] == "GATA": new_sequence = f"{sequence[:1]}-{sequence[1:]}" - new_marker = STRMarkerObject("VWA", new_sequence, software, kit="powerseq") + new_marker = STRMarkerObject("VWA", new_sequence, software, custom=custom, kit="powerseq") else: return marker return new_marker @@ -220,15 +231,37 @@ def sort_table(table): return sorted_table -def main(input, out, kit, software, sex, nocombine): +def remove_columns(column_list, remove_list): + for col in remove_list: + column_list.remove(col) + return column_list + + +def create_custom_outputtable(columns, table): + remove_list = [ + "UAS_Output_Sequence", + "Forward_Strand_Sequence", + "UAS_Output_Bracketed_Notation", + "Forward_Strand_Bracketed_Notation", + ] + custom_columns = remove_columns(columns, remove_list) + custom_table = table[custom_columns] + custom_table_comb = combine_reads(custom_table, custom_columns) + return custom_table_comb + + +def main(input, out, kit, software, sex, nocombine, custom): input = str(input) out = str(out) output_name = os.path.splitext(out)[0] input_name = os.path.splitext(input)[0] - autosomal_final_table, autosomal_flank_table, columns = format_table(input, software, kit) + full_table_name = re.sub(r"_custom_range", "", output_name) + autosomal_final_table, autosomal_flank_table, columns = format_table( + input, software, kit, custom + ) if sex: - sex_final_table, sex_flank_table, columns = format_table( - f"{input_name}_sexloci.csv", software, kit + sex_final_table, sex_flank_table, sex_columns = format_table( + f"{input_name}_sexloci.csv", software, kit, custom ) if software != "uas": if not sex_final_table.empty: @@ -237,8 +270,11 @@ def main(input, out, kit, software, sex, nocombine): sex_final_table.to_csv( f"{output_name}_sexloci_no_combined_reads.txt", index=False ) - sex_final_table = combine_reads(sex_final_table, columns) - sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False) + sex_final_table = combine_reads(sex_final_table, sex_columns) + sex_final_table.to_csv(f"{full_table_name}_sexloci.txt", sep="\t", index=False) + if custom: + sex_table_custom = create_custom_outputtable(sex_columns, sex_final_table) + sex_table_custom.to_csv(f"{output_name}.txt", index=False, sep="\t") else: sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False) if software != "uas": @@ -249,7 +285,10 @@ def main(input, out, kit, software, sex, nocombine): f"{output_name}_no_combined_reads.txt", sep="\t", index=False ) autosomal_final_table = combine_reads(autosomal_final_table, columns) - autosomal_final_table.to_csv(out, sep="\t", index=False) + autosomal_final_table.to_csv(f"{full_table_name}.txt", sep="\t", index=False) + if custom: + custom_table_comb = create_custom_outputtable(columns, autosomal_final_table) + custom_table_comb.to_csv(out, sep="\t", index=False) else: autosomal_final_table.to_csv(out, sep="\t", index=False) @@ -262,4 +301,5 @@ def main(input, out, kit, software, sex, nocombine): software=snakemake.params.a_software, sex=snakemake.params.sex, nocombine=snakemake.params.nocombine, + custom=snakemake.params.custom, ) diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index cbcf8052..b1c9e4ce 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -66,14 +66,9 @@ def get_filter_metadata_file(): filter_marker_data = json.load(fh) -def process_strs(dict_loc, datatype, seq_col): +def process_strs(dict_loc, datatype, seq_col, brack_col): final_df = pd.DataFrame() flags_df = pd.DataFrame() - brack_col = ( - "UAS_Output_Bracketed_Notation" - if seq_col == "UAS_Output_Sequence" - else "Forward_Strand_Bracketed_Notation" - ) for key, value in dict_loc.items(): data = dict_loc[key].reset_index(drop=True) if datatype == "ce": @@ -418,7 +413,16 @@ def get_at(df, locus): def main( - input, output_type, profile_type, data_type, output_dir, info, separate, nofilters, strand + input, + output_type, + profile_type, + data_type, + output_dir, + info, + separate, + nofilters, + strand, + custom, ): input = str(input) if profile_type not in ("evidence", "reference"): @@ -432,10 +436,16 @@ def main( raise ValueError("No output specified using --out.") else: outpath = output_dir - seq_col = "UAS_Output_Sequence" if strand == "uas" else "Forward_Strand_Sequence" - brack_col = ( - "UAS_Output_Bracketed_Notation" if strand == "uas" else "Forward_Strand_Bracketed_Form" - ) + if custom: + seq_col = "Custom_Range_Sequence" + brack_col = "Custom_Bracketed_Notation" + else: + seq_col = "UAS_Output_Sequence" if strand == "uas" else "Forward_Strand_Sequence" + brack_col = ( + "UAS_Output_Bracketed_Notation" + if strand == "uas" + else "Forward_Strand_Bracketed_Notation" + ) if nofilters: full_df["allele_type"] = "real_allele" marker_plots(full_df, outpath) @@ -445,7 +455,7 @@ def main( STRmix_output(full_df, outpath, profile_type, data_type, seq_col) else: dict_loc = {k: v for k, v in full_df.groupby(["SampleID", "Locus"])} - final_df, flags_df = process_strs(dict_loc, data_type, seq_col) + final_df, flags_df = process_strs(dict_loc, data_type, seq_col, brack_col) marker_plots(final_df, outpath) if output_type == "efm" or output_type == "mpsproto": EFM_output(final_df, outpath, profile_type, data_type, brack_col, separate) @@ -469,4 +479,5 @@ def main( separate=snakemake.params.separate, nofilters=snakemake.params.filters, strand=snakemake.params.strand, + custom=snakemake.params.custom, ) diff --git a/setup.py b/setup.py index b5363b19..3e84b2f1 100755 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ "snakemake>=7.22,<8.0", "pyyaml>=6.0", "matplotlib>=3.8", + "numpy==1.26.4", ], entry_points={"console_scripts": ["lusstr = lusSTR.cli:main"]}, scripts=glob.glob("lusSTR/scripts/*"),