From 03b89790f91d7f9fd4a2d9a01c0ac694acff483a Mon Sep 17 00:00:00 2001 From: rnmitchell <57150382+rnmitchell@users.noreply.github.com> Date: Wed, 22 Nov 2023 08:09:41 -0500 Subject: [PATCH] Add LUS+ as possible STR data type (#57) --- lusSTR/cli/config.py | 10 +- lusSTR/scripts/filter_settings.py | 118 ++++++++++++---- .../EFM_test_reference_lusplus.csv | 28 ++++ .../LUSPlus_sequence_info.csv | 24 ++++ .../test_filtering_EFMoutput_lusplus.csv | 28 ++++ ...test_filtering_EFMoutput_sequence_info.csv | 26 ++++ ...eference.csv => EFM_test_reference_ce.csv} | 0 ...ut.csv => test_filtering_EFMoutput_ce.csv} | 0 lusSTR/tests/data/test_stutter_lusp.txt | 25 ++++ lusSTR/tests/test_filters.py | 132 ++++++++++++++---- lusSTR/tests/test_suite.py | 2 +- lusSTR/wrappers/filter.py | 72 ++++++---- 12 files changed, 378 insertions(+), 87 deletions(-) create mode 100644 lusSTR/tests/data/LUSPlus_stutter_test/EFM_test_reference_lusplus.csv create mode 100644 lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv create mode 100644 lusSTR/tests/data/LUSPlus_stutter_test/test_filtering_EFMoutput_lusplus.csv create mode 100644 lusSTR/tests/data/LUSPlus_stutter_test/test_filtering_EFMoutput_sequence_info.csv rename lusSTR/tests/data/RU_stutter_test/{EFM_test_reference.csv => EFM_test_reference_ce.csv} (100%) rename lusSTR/tests/data/RU_stutter_test/{test_filtering_EFMoutput.csv => test_filtering_EFMoutput_ce.csv} (100%) create mode 100644 lusSTR/tests/data/test_stutter_lusp.txt diff --git a/lusSTR/cli/config.py b/lusSTR/cli/config.py index a6c3f23d..a972c847 100644 --- a/lusSTR/cli/config.py +++ b/lusSTR/cli/config.py @@ -87,8 +87,8 @@ def edit_str_config(config, args): data["info"] = False if args.reference: data["profile_type"] = "reference" - if args.ce: - data["data_type"] = "ce" + if args.datatype: + data["data_type"] = args.datatype if args.efm: data["output_type"] = "efm" if args.strand: @@ -127,7 +127,11 @@ def subparser(subparsers): help="Use for creating Reference profiles for STR workflow" ) p.add_argument("--efm", action="store_true",help="Use to create EuroForMix profiles") - p.add_argument("--ce", action="store_true", help="Use for CE data") + p.add_argument( + "--str-type", choices=["ce", "ngs", "lusplus"], default="ngs", + dest="datatype", help="Data type for STRs. Options are: CE allele ('ce'), sequence " + "('ngs'), or LUS+ allele ('lusplus'). Default is 'ngs'.", + ) p.add_argument( "--noinfo", action="store_true", help="Use to not create the Sequence Information File in the 'filter' step" diff --git a/lusSTR/scripts/filter_settings.py b/lusSTR/scripts/filter_settings.py index 9edb028b..b6fc13f7 100644 --- a/lusSTR/scripts/filter_settings.py +++ b/lusSTR/scripts/filter_settings.py @@ -37,8 +37,8 @@ def filters(locus_allele_info, locus, locus_reads, datatype, brack_col): locus_allele_info = ce_filtering( locus_allele_info, locus_reads, metadata, datatype, brack_col ) - if datatype == "ngs": - locus_allele_info = same_size_filter(locus_allele_info, metadata) + if datatype != "ce": + locus_allele_info = same_size_filter(locus_allele_info, metadata, datatype) return locus_allele_info @@ -146,14 +146,19 @@ def allele_ident( locus_allele_info, init_type_all, metadata, ref_allele_reads, i, j, datatype, brack_col ): quest_al_reads = locus_allele_info.loc[j, "Reads"] - ref_allele = float(locus_allele_info.loc[i, "CE_Allele"]) - question_allele = float(locus_allele_info.loc[j, "CE_Allele"]) - if datatype == "ngs": - ref_bracket = locus_allele_info.loc[i, brack_col] - question_bracket = locus_allele_info.loc[j, brack_col] + if datatype == "lusplus": + ref_allele = float(locus_allele_info.loc[i, "LUS_Plus"].split("_")[0]) + question_allele = float(locus_allele_info.loc[j, "LUS_Plus"].split("_")[0]) + ref_altformat = locus_allele_info.loc[i, "LUS_Plus"] + question_altformat = locus_allele_info.loc[j, "LUS_Plus"] else: - ref_bracket = None - question_bracket = None + ref_allele = float(locus_allele_info.loc[i, "CE_Allele"]) + question_allele = float(locus_allele_info.loc[j, "CE_Allele"]) + ref_altformat = None + question_altformat = None + if datatype == "ngs": + ref_altformat = locus_allele_info.loc[i, brack_col] + question_altformat = locus_allele_info.loc[j, brack_col] locus_allele_info.loc[j, ["allele_type", "perc_stutter"]] = allele_type( ref_allele, question_allele, @@ -163,8 +168,8 @@ def allele_ident( ref_allele_reads, locus_allele_info.loc[j, "allele1_ref_reads"], locus_allele_info, - ref_bracket, - question_bracket, + ref_altformat, + question_altformat, datatype, brack_col, ) @@ -172,13 +177,13 @@ def allele_ident( if "/" in locus_allele_info.loc[j, "allele_type"] and pd.isnull( locus_allele_info.loc[j, "parent_allele2"] ): - stut_allele2 = ref_allele if datatype == "ce" else ref_bracket + stut_allele2 = ref_allele if datatype == "ce" else ref_altformat locus_allele_info.loc[j, ["parent_allele2", "allele2_ref_reads"]] = [ stut_allele2, ref_allele_reads, ] elif pd.isnull(locus_allele_info.loc[j, "parent_allele1"]): - stut_allele1 = ref_allele if datatype == "ce" else ref_bracket + stut_allele1 = ref_allele if datatype == "ce" else ref_altformat locus_allele_info.loc[j, ["parent_allele1", "allele1_ref_reads"]] = [ stut_allele1, ref_allele_reads, @@ -263,8 +268,8 @@ def allele_type( ref_reads, al1_ref_reads, all_type_df, - ref_bracket, - question_bracket, + ref_altformat, + question_altformat, datatype, brack_col, ): @@ -275,8 +280,16 @@ def allele_type( allele_diff = round(ref - ce, 1) if allele_diff == 1 and ref_reads > quest_al_reads: # -1 stutter if ( - datatype == "ngs" and bracketed_stutter_id(ref_bracket, question_bracket, -1) == -1 - ) or datatype == "ce": + ( + datatype == "ngs" + and bracketed_stutter_id(ref_altformat, question_altformat, -1) == -1 + ) + or datatype == "ce" + or ( + datatype == "lusplus" + and lusplus_stutter_id(ref_altformat, question_altformat, -1) == -1 + ) + ): all_type, stut_perc = minus1_stutter( all_type, stutter_thresh, @@ -287,11 +300,19 @@ def allele_type( quest_al_reads, ) elif allele_diff == 2 and ref_reads > quest_al_reads: # -2 stutter - allele = ce if datatype == "ce" else question_bracket + allele = ce if datatype == "ce" else question_altformat if check_2stutter(all_type_df, datatype, allele, brack_col)[0] is True: if ( - datatype == "ngs" and bracketed_stutter_id(ref_bracket, question_bracket, -2) == -2 - ) or datatype == "ce": + ( + datatype == "ngs" + and bracketed_stutter_id(ref_altformat, question_altformat, -2) == -2 + ) + or datatype == "ce" + or ( + datatype == "lusplus" + and lusplus_stutter_id(ref_altformat, question_altformat, -2) == -2 + ) + ): ref_reads = check_2stutter(all_type_df, datatype, allele, brack_col)[1] stutter_thresh_reads = stutter_thresh * ref_reads all_type, stut_perc = minus2_stutter( @@ -305,8 +326,13 @@ def allele_type( ) elif allele_diff == -1 and ref_reads > quest_al_reads: # +1 stutter if ( - datatype == "ngs" and bracketed_stutter_id(ref_bracket, question_bracket, 1) == 1 - ) or datatype == "ce": + (datatype == "ngs" and bracketed_stutter_id(ref_altformat, question_altformat, 1) == 1) + or datatype == "ce" + or ( + datatype == "lusplus" + and lusplus_stutter_id(ref_altformat, question_altformat, 1) == 1 + ) + ): all_type, stut_perc = plus1_stutter( all_type, stutter_thresh, forward_thresh, ref_reads, al1_ref_reads, quest_al_reads ) @@ -342,9 +368,14 @@ def check_2stutter(stutter_df, allele_des, allele, brack_col): break else: for k, row in stutter_df.iterrows(): - bracket_test = stutter_df.loc[k, brack_col] - if bracketed_stutter_id(bracket_test, allele, -1) == -1: - is_true, reads = True, stutter_df.loc[k, "Reads"] + if allele_des == "ngs": + bracket_test = stutter_df.loc[k, brack_col] + if bracketed_stutter_id(bracket_test, allele, -1) == -1: + is_true, reads = True, stutter_df.loc[k, "Reads"] + else: + lusp_test = stutter_df.loc[k, "LUS_Plus"] + if lusplus_stutter_id(lusp_test, allele, -1) == -1: + is_true, reads = True, stutter_df.loc[k, "Reads"] return is_true, reads @@ -409,6 +440,26 @@ def bracketed_stutter_id(ref_bracket, quest_bracket, stutter_id): return stutter +def lusplus_stutter_id(ref_lusp, question_lusp, stutter_id): + diffcount = 0 + stutter = None + for j in range(1, len(ref_lusp.split("_"))): + diff = float(question_lusp.split("_")[j]) - round(float(ref_lusp.split("_")[j])) + if diff == 0: + continue + elif diff == stutter_id: + if diffcount == 0: + diffcount = diff + stutter = stutter_id + else: + stutter = None + break + else: + stutter = None + break + return stutter + + def allele_imbalance_check(allele_df): imbalance_df = pd.DataFrame(columns=["SampleID", "Locus", "Flags"]) try: @@ -430,10 +481,13 @@ def allele_imbalance_check(allele_df): return imbalance_df -def check_D7(allele_df): +def check_D7(allele_df, datatype): D7_df = pd.DataFrame(columns=["SampleID", "Locus", "Flags"]) for i in range(len(allele_df)): - al = allele_df.loc[i, "CE_Allele"] + if datatype == "lusplus": + al = allele_df.loc[i, "LUS_Plus"].split("_")[0] + else: + al = allele_df.loc[i, "CE_Allele"] try: if str(al).split(".")[1] == "1" and allele_df.loc[i, "allele_type"] != "BelowAT": print("D7 microvariants detected! Check flagged file for details.") @@ -447,16 +501,18 @@ def check_D7(allele_df): return D7_df -def flags(allele_df): +def flags(allele_df, datatype): flags_df = pd.DataFrame(columns=["SampleID", "Locus", "Flags"]) flags_df = flags_df.append(allele_counts(allele_df)) flags_df = flags_df.append(allele_imbalance_check(allele_df)) - flags_df = flags_df.append(check_D7(allele_df)) + flags_df = flags_df.append(check_D7(allele_df, datatype)) return flags_df -def same_size_filter(df, metadata): +def same_size_filter(df, metadata, datatype): final_df = pd.DataFrame() + if datatype == "lusplus": + df["CE_Allele"] = df["LUS_Plus"].apply(lambda x: x.split("_")[0]) al_list = df["CE_Allele"].unique() for ce_allele in al_list: df_filt = ( @@ -475,4 +531,6 @@ def same_size_filter(df, metadata): else: final_df = final_df.append(df_filt) final_df = final_df.reset_index(drop=True) + if datatype == "lusplus": + final_df = final_df.drop("CE_Allele", axis=1) return final_df diff --git a/lusSTR/tests/data/LUSPlus_stutter_test/EFM_test_reference_lusplus.csv b/lusSTR/tests/data/LUSPlus_stutter_test/EFM_test_reference_lusplus.csv new file mode 100644 index 00000000..dd94b709 --- /dev/null +++ b/lusSTR/tests/data/LUSPlus_stutter_test/EFM_test_reference_lusplus.csv @@ -0,0 +1,28 @@ +SampleName,Marker,Allele1,Allele2 +Positive_Control,CSF1PO,12_12_0,12_12_0 +Positive_Control,D10S1248,13_13,15_15 +Positive_Control,D12S391,18_11_6_0,23_14_9_0 +Positive_Control,D13S317,11_12_3_1,9_9_3_1 +Positive_Control,D16S539,13_13_0,9_9_0 +Positive_Control,D17S1301,11_11,12_12 +Positive_Control,D18S51,16_16_1,18_18_1 +Positive_Control,D19S433,13_11_1_0,14_12_1_0 +Positive_Control,D1S1656,12_11_1_0,13_13_0_0 +Positive_Control,D20S482,14_14,15_15 +Positive_Control,D21S11,29_11_4_6,31.2_11_5_6 +Positive_Control,D22S1045,16_13,16_13 +Positive_Control,D2S1338,22_12_1_7,25_15_1_7 +Positive_Control,D2S441,10_10_0,14_11_0 +Positive_Control,D3S1358,17_13_3,18_14_3 +Positive_Control,D4S2408,9_9_0,9_9_0 +Positive_Control,D5S818,12_12,12_12 +Positive_Control,D6S1043,12_12_0,20_14_1 +Positive_Control,D7S820,11_11_1_0,8_8_1_0 +Positive_Control,D8S1179,14_12_1_0,15_12_1_0 +Positive_Control,D9S1122,12_10,12_12 +Positive_Control,FGA,20_12_3_0,23_15_3_0 +Positive_Control,PENTA D,12_12,13_13 +Positive_Control,PENTA E,14_14,7_7 +Positive_Control,TH01,6_6,9.3_6 +Positive_Control,TPOX,11_11,11_11 +Positive_Control,VWA,16_12_3_1,19_14_4_1 diff --git a/lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv b/lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv new file mode 100644 index 00000000..a093c1fd --- /dev/null +++ b/lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv @@ -0,0 +1,24 @@ +SampleID,Locus,LUS_Plus,Reads,allele_type,parent_allele1,parent_allele2,allele1_ref_reads,allele2_ref_reads,perc_noise,perc_stutter,CE_Allele +Sample1,D4S2408,10_10_0,1022,real_allele,,,,,,, +Sample1,D4S2408,9_9_0,116,-1_stutter/+1_stutter,10_10_0,8_8_0,1022.0,1050.0,,, +Sample1,D4S2408,8_8_0,1050,real_allele,,,,,,, +Sample1,D8S1179,14_12_1_0,869,real_allele,,,,,,, +Sample1,D8S1179,13_11_1_0,184,-1_stutter,14_12_1_0,,869.0,,,0.212, +Sample1,D8S1179,12_10_1_0,37,-2_stutter,14_12_1_0,,869.0,,,0.201, +Sample1,D9S1122,13_11,948,real_allele,,,,,,, +Sample1,D9S1122,12_10,108,-1_stutter,13_11,,948.0,,,0.114, +Sample1,D9S1122,11_11,991,real_allele,,,,,,, +Sample1,D9S1122,10_10,87,-1_stutter,11_11,,991.0,,,0.088, +Sample1,FGA,23_15_3_0,1436,real_allele,,,,,,, +Sample1,FGA,22_14_3_0,262,-1_stutter,23_15_3_0,,1436.0,,,0.182, +Sample1,FGA,21_13_3_0,48,BelowAT,,,,,0.013,, +Sample1,FGA,20_12_3_0,1750,real_allele,,,,,,, +Sample1,FGA,18_10_3_0,181,real_allele,,,,,,, +Sample1,FGA,17_9_3_0,15,BelowAT,,,,,0.004,, +Sample1,PENTA D,15_15,50,real_allele,,,,,,, +Sample1,PENTA D,13_13,1000,real_allele,,,,,,, +Sample1,PENTA E,7_7,505,real_allele,,,,,,,7.0 +Sample1,TH01,7_7,2197,real_allele,,,,,,, +Sample1,TH01,6_6,1632,real_allele,,,,,,, +Sample1,TH01,5_5,66,BelowAT,,,,,0.017,, +Sample1,TPOX,11_11,15,BelowAT,,,,,1.0,,11.0 diff --git a/lusSTR/tests/data/LUSPlus_stutter_test/test_filtering_EFMoutput_lusplus.csv b/lusSTR/tests/data/LUSPlus_stutter_test/test_filtering_EFMoutput_lusplus.csv new file mode 100644 index 00000000..5ebcafee --- /dev/null +++ b/lusSTR/tests/data/LUSPlus_stutter_test/test_filtering_EFMoutput_lusplus.csv @@ -0,0 +1,28 @@ +SampleName,Marker,Allele1,Allele2,Allele3,Allele4,Height1,Height2,Height3,Height4 +Sample1,CSF1PO,,,,,,,, +Sample1,D10S1248,,,,,,,, +Sample1,D12S391,,,,,,,, +Sample1,D13S317,,,,,,,, +Sample1,D16S539,,,,,,,, +Sample1,D17S1301,,,,,,,, +Sample1,D18S51,,,,,,,, +Sample1,D19S433,,,,,,,, +Sample1,D1S1656,,,,,,,, +Sample1,D20S482,,,,,,,, +Sample1,D21S11,,,,,,,, +Sample1,D22S1045,,,,,,,, +Sample1,D2S1338,,,,,,,, +Sample1,D2S441,,,,,,,, +Sample1,D3S1358,,,,,,,, +Sample1,D4S2408,10_10_0,8_8_0,9_9_0,,900,1000,1357, +Sample1,D5S818,,,,,,,, +Sample1,D6S1043,,,,,,,, +Sample1,D7S820,,,,,,,, +Sample1,D8S1179,12_9_1_0,13_10_1_0,13_11_1_0,14_11_1_0,26,89,95,739 +Sample1,D9S1122,10_10,11_11,12_10,13_11,87,991,108,948 +Sample1,FGA,18_10_3_0,20_12_3_0,22_14_3_0,23_15_3_0,181,1750,262,1436 +Sample1,PENTA D,13_13,15_15,,,1000,50,, +Sample1,PENTA E,7_7,,,,505,,, +Sample1,TH01,6_6,7_7,,,1632,2197,, +Sample1,TPOX,,,,,,,, +Sample1,VWA,,,,,,,, diff --git a/lusSTR/tests/data/LUSPlus_stutter_test/test_filtering_EFMoutput_sequence_info.csv b/lusSTR/tests/data/LUSPlus_stutter_test/test_filtering_EFMoutput_sequence_info.csv new file mode 100644 index 00000000..9016d7a3 --- /dev/null +++ b/lusSTR/tests/data/LUSPlus_stutter_test/test_filtering_EFMoutput_sequence_info.csv @@ -0,0 +1,26 @@ +SampleID,Locus,LUS_Plus,Reads,allele_type,parent_allele1,parent_allele2,allele1_ref_reads,allele2_ref_reads,perc_noise,perc_stutter,CE_Allele +Sample1,D4S2408,10_10_0,900.0,real_allele,,,,,,, +Sample1,D4S2408,9_9_0,1357.0,real_allele,,,,,,, +Sample1,D4S2408,8_8_0,1000.0,real_allele,,,,,,, +Sample1,D8S1179,14_11_1_0,739.0,real_allele,,,,,,, +Sample1,D8S1179,13_11_1_0,95.0,real_allele,,,,,,, +Sample1,D8S1179,13_10_1_0,89.0,-1_stutter,14_11_1_0,,739.0,,,0.12, +Sample1,D8S1179,12_9_1_0,26.0,real_allele,,,,,,, +Sample1,D8S1179,12_10_1_0,11.0,BelowAT,,,,,0.01,, +Sample1,D9S1122,13_11,948.0,real_allele,,,,,,, +Sample1,D9S1122,12_10,108.0,-1_stutter,13_11,,948.0,,,0.114, +Sample1,D9S1122,11_11,991.0,real_allele,,,,,,, +Sample1,D9S1122,10_10,87.0,-1_stutter,11_11,,991.0,,,0.088, +Sample1,FGA,23_15_3_0,1436.0,real_allele,,,,,,, +Sample1,FGA,22_14_3_0,262.0,-1_stutter,23_15_3_0,,1436.0,,,0.182, +Sample1,FGA,21_13_3_0,48.0,BelowAT,,,,,0.013,, +Sample1,FGA,20_12_3_0,1750.0,real_allele,,,,,,, +Sample1,FGA,18_10_3_0,181.0,real_allele,,,,,,, +Sample1,FGA,17_9_3_0,15.0,BelowAT,,,,,0.004,, +Sample1,PENTA D,15_15,50.0,real_allele,,,,,,, +Sample1,PENTA D,13_13,1000.0,real_allele,,,,,,, +Sample1,PENTA E,7_7,505.0,real_allele,,,,,,,7.0 +Sample1,TH01,7_7,2197.0,real_allele,,,,,,, +Sample1,TH01,6_6,1632.0,real_allele,,,,,,, +Sample1,TH01,5_5,66.0,BelowAT,,,,,0.017,, +Sample1,TPOX,11_11,15.0,BelowAT,,,,,1.0,,11.0 diff --git a/lusSTR/tests/data/RU_stutter_test/EFM_test_reference.csv b/lusSTR/tests/data/RU_stutter_test/EFM_test_reference_ce.csv similarity index 100% rename from lusSTR/tests/data/RU_stutter_test/EFM_test_reference.csv rename to lusSTR/tests/data/RU_stutter_test/EFM_test_reference_ce.csv diff --git a/lusSTR/tests/data/RU_stutter_test/test_filtering_EFMoutput.csv b/lusSTR/tests/data/RU_stutter_test/test_filtering_EFMoutput_ce.csv similarity index 100% rename from lusSTR/tests/data/RU_stutter_test/test_filtering_EFMoutput.csv rename to lusSTR/tests/data/RU_stutter_test/test_filtering_EFMoutput_ce.csv diff --git a/lusSTR/tests/data/test_stutter_lusp.txt b/lusSTR/tests/data/test_stutter_lusp.txt new file mode 100644 index 00000000..239b9c5c --- /dev/null +++ b/lusSTR/tests/data/test_stutter_lusp.txt @@ -0,0 +1,25 @@ +SampleID Project Analysis Locus UAS_Output_Sequence Forward_Strand_Sequence UAS_Output_Bracketed_Notation Forward_Strand_Bracketed_Notation CE_Allele LUS LUS_Plus Reads +Sample1 Experiment_1.1 Experiment1 D8S1179 TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTA TCTG [TCTA]12 TCTA TCTG [TCTA]12 14 14_12 14_12_1_0 869 +Sample1 Experiment_1.1 Experiment1 D8S1179 TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA [TCTA]2 TCTG [TCTA]10 [TCTA]2 TCTG [TCTA]10 13 13_11 13_11_1_0 184 +Sample1 Experiment_1.1 Experiment1 D8S1179 TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA [TCTA]2 TCTG [TCTA]9 [TCTA]2 TCTG [TCTA]9 12 12_10 12_10_1_0 37 +Sample1 Experiment_1.1 Experiment1 FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA [TTTC]3 TTTT TTCT [CTTT]12 CTCC [TTCC]2 [GGAA]2 GGAG [AAAG]12 AGAA AAAA [GAAA]3 20 20_12 20_12_3_0 1750 +Sample1 Experiment_1.1 Experiment1 FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA [TTTC]3 TTTT TTCT [CTTT]15 CTCC [TTCC]2 [GGAA]2 GGAG [AAAG]15 AGAA AAAA [GAAA]3 23 23_15 23_15_3_0 1436 +Sample1 Experiment_1.1 Experiment1 FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA [TTTC]3 TTTT TTCT [CTTT]14 CTCC [TTCC]2 [GGAA]2 GGAG [AAAG]14 AGAA AAAA [GAAA]3 22 22_14 22_14_3_0 262 +Sample1 Experiment_1.1 Experiment1 FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA [TTTC]3 TTTT TTCT [CTTT]13 CTCC [TTCC]2 [GGAA]2 GGAG [AAAG]13 AGAA AAAA [GAAA]3 21 21_13 21_13_3_0 48 +Sample1 Experiment_1.1 Experiment1 FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA [TTTC]3 TTTT TTCT [CTTT]10 CTCC [TTCC]2 [GGAA]2 GGAG [AAAG]10 AGAA AAAA [GAAA]3 18 18_10 18_10_3_0 181 +Sample1 Experiment_1.1 Experiment1 FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA [TTTC]3 TTTT TTCT [CTTT]9 CTCC [TTCC]2 [GGAA]2 GGAG [AAAG]9 AGAA AAAA [GAAA]3 17 17_9 17_9_3_0 15 +Sample1 Experiment_1.1 Experiment1 TH01 AATGAATGAATGAATGAATGAATGAATG AATGAATGAATGAATGAATGAATGAATG [AATG]7 [AATG]7 7 7_7 7_7 2197 +Sample1 Experiment_1.1 Experiment1 TH01 AATGAATGAATGAATGAATGAATG AATGAATGAATGAATGAATGAATG [AATG]6 [AATG]6 6 6_6 6_6 1632 +Sample1 Experiment_1.1 Experiment1 TH01 AATGAATGAATGAATGAATG AATGAATGAATGAATGAATG [AATG]5 [AATG]5 5 5_5 5_5 66 +Sample1 Experiment_1.1 Experiment1 D9S1122 TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA [TAGA]11 [TAGA]11 11 11_11 11_11 991 +Sample1 Experiment_1.1 Experiment1 D9S1122 TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA TAGA TCGA [TAGA]11 TAGA TCGA [TAGA]11 13 13_11 13_11 948 +Sample1 Experiment_1.1 Experiment1 D9S1122 TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA TAGA TCGA [TAGA]10 TAGA TCGA [TAGA]10 12 12_10 12_10 108 +Sample1 Experiment_1.1 Experiment1 D9S1122 TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA [TAGA]10 [TAGA]10 10 10_10 10_10 87 +Sample1 Experiment_1.1 Experiment1 VWA TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA TAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGA TCTA [TCTG]3 [TCTA]12 TCCA TCTA TAGA TGGA [TAGA]12 [CAGA]3 TAGA 16 16_12 16_12_3_1 6 +Sample1 Experiment_1.1 Experiment1 TPOX AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG [AATG]11 [AATG]11 11 11_11 11_11 15 +Sample1 Experiment_1.1 Experiment1 PENTA E AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA TCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTT [AAAGA]7 [TCTTT]7 7 7_7 7_7 505 +Sample1 Experiment_1.1 Experiment1 PENTA D AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA AAAAG [AAAGA]13 AAAAG [AAAGA]13 13 13_13 13_13 1000 +Sample1 Experiment_1.1 Experiment1 PENTA D AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA AAAAG [AAAGA]13 AAAAG [AAAGA]13 15 15_15 15_15 50 +Sample1 Experiment_1.1 Experiment1 D4S2408 ATCTATCTATCTATCTATCTATCTATCTATCTATCT ATCTATCTATCTATCTATCTATCTATCTATCTATCT [ATCT]9 [ATCT]9 9 9_9 9_9_0 116 +Sample1 Experiment_1.1 Experiment1 D4S2408 ATCTATCTATCTATCTATCTATCTATCTATCT ATCTATCTATCTATCTATCTATCTATCTATCT [ATCT]8 [ATCT]8 8 8_8 8_8_0 1050 +Sample1 Experiment_1.1 Experiment1 D4S2408 ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT [ATCT]10 [ATCT]10 10 10_10 10_10_0 1022 \ No newline at end of file diff --git a/lusSTR/tests/test_filters.py b/lusSTR/tests/test_filters.py index 96b9cf8b..d5a85681 100644 --- a/lusSTR/tests/test_filters.py +++ b/lusSTR/tests/test_filters.py @@ -157,14 +157,28 @@ def test_plus1stutter( assert test_stut_perc == stut_perc -def test_EFMoutput_format(tmp_path): +@pytest.mark.parametrize( + "outputdir, datatype", [("RU_stutter_test/", "ce"), ("LUSPlus_stutter_test/", "lusplus")] +) +def test_EFMoutput_format(outputdir, datatype, tmp_path): str_path = str(tmp_path / "WD") inputfile = data_file("test_stutter.txt") - exp_out = data_file("RU_stutter_test/test_filtering_EFMoutput.csv") - exp_info_out = data_file("RU_stutter_test/test_filtering_EFMoutput_sequence_info.csv") - obs_out = str(tmp_path / "WD/test_output/test_output_evidence_ce.csv") + exp_out = data_file(f"{outputdir}test_filtering_EFMoutput_{datatype}.csv") + exp_info_out = data_file(f"{outputdir}test_filtering_EFMoutput_sequence_info.csv") + obs_out = str(tmp_path / f"WD/test_output/test_output_evidence_{datatype}.csv") obs_info_out = str(tmp_path / "WD/test_output/test_output_sequence_info.csv") - arglist = ["config", "-w", str_path, "-o", "test_output", "--efm", "--ce", "--input", "WD"] + arglist = [ + "config", + "-w", + str_path, + "-o", + "test_output", + "--efm", + "--str-type", + datatype, + "--input", + "WD", + ] lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist)) shutil.copyfile(inputfile, os.path.join(str_path, "test_output.csv")) shutil.copyfile(inputfile, os.path.join(str_path, "test_output.txt")) @@ -183,10 +197,17 @@ def test_STRmixoutput_format(outputdir, datatype, tmp_path): exp_info_out = data_file(f"{outputdir}STRmix_Files_sequence_info.csv") obs_out = str(tmp_path / f"WD/STRmix_Files/Sample1_evidence_{datatype}.csv") obs_info_out = str(tmp_path / f"WD/STRmix_Files/STRmix_Files_sequence_info.csv") - if datatype == "ngs": - arglist = ["config", "-w", str_path, "--input", "WD", "-o", "STRmix_Files"] - else: - arglist = ["config", "-w", str_path, "--input", "WD", "-o", "STRmix_Files", "--ce"] + arglist = [ + "config", + "-w", + str_path, + "--input", + "WD", + "-o", + "STRmix_Files", + "--str-type", + datatype, + ] lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist)) shutil.copyfile(inputfile, os.path.join(str_path, "STRmix_Files.csv")) shutil.copyfile(inputfile, os.path.join(str_path, "STRmix_Files.txt")) @@ -224,19 +245,30 @@ def test_flags(tmp_path): assert filecmp.cmp(exp_out, obs_out) is True -def test_efm_reference(tmp_path): +@pytest.mark.parametrize( + "outputdir, datatype", [("RU_stutter_test/", "ce"), ("LUSPlus_stutter_test/", "lusplus")] +) +def test_efm_reference(outputdir, datatype, tmp_path): str_path = str(tmp_path / "WD") inputfile = data_file("test_references.txt") - exp_out = data_file("RU_stutter_test/EFM_test_reference.csv") - obs_efm_out = str(tmp_path / "WD/lusstr_output/lusstr_output_reference_ce.csv") - arglist = ["config", "-w", str_path, "--input", "WD", "--efm", "--reference", "--ce"] + exp_out = data_file(f"{outputdir}EFM_test_reference_{datatype}.csv") + obs_efm_out = str(tmp_path / f"WD/lusstr_output/lusstr_output_reference_{datatype}.csv") + arglist = [ + "config", + "-w", + str_path, + "--input", + "WD", + "--efm", + "--reference", + "--str-type", + datatype, + ] lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist)) shutil.copyfile(inputfile, os.path.join(str_path, "lusstr_output.csv")) shutil.copyfile(inputfile, os.path.join(str_path, "lusstr_output.txt")) - print(os.listdir(str_path)) all_arglist = ["strs", "all", "-w", str_path] lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(all_arglist)) - print(os.listdir(f"{str_path}/lusstr_output")) assert filecmp.cmp(exp_out, obs_efm_out) is True @@ -248,20 +280,18 @@ def test_strmix_reference(outputdir, datatype, tmp_path): inputfile = data_file("test_references.txt") exp_out = data_file(f"{outputdir}Positive_Control_reference_{datatype}.csv") obs_out = str(tmp_path / f"WD/STRmix_Files/Positive_Control_reference_{datatype}.csv") - if datatype == "ngs": - arglist = ["config", "-w", str_path, "--input", "WD", "-o", "STRmix_Files", "--reference"] - else: - arglist = [ - "config", - "-w", - str_path, - "--input", - "WD", - "-o", - "STRmix_Files", - "--ce", - "--reference", - ] + arglist = [ + "config", + "-w", + str_path, + "--input", + "WD", + "-o", + "STRmix_Files", + "--reference", + "--str-type", + datatype, + ] lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist)) shutil.copyfile(inputfile, os.path.join(str_path, "STRmix_Files.csv")) shutil.copyfile(inputfile, os.path.join(str_path, "STRmix_Files.txt")) @@ -304,6 +334,23 @@ def test_ngs_stutter(ref_bracket, quest_bracket, stutter, actual_call): assert test_stut == actual_call +@pytest.mark.parametrize( + "ref_lusp, question_lusp, stutter, actual_call", + [ + ("10_10_1_0", "9_9_1_0", -1, -1), + ("10_10_1_0", "9_10_0_0", -1, -1), + ("12_11_1_0", "11_9_2_0", -1, None), + ("19_13_1", "20_14_1", 1, 1), + ("19_13_1", "20_15_0", 1, None), + ("19_13_1", "17_11_1", -2, -2), + ("19_13_1", "17_12_0", -2, None), + ], +) +def test_lusplus_stutter(ref_lusp, question_lusp, stutter, actual_call): + test_stut = lusSTR.scripts.filter_settings.lusplus_stutter_id(ref_lusp, question_lusp, stutter) + assert test_stut == actual_call + + def test_forward_strand_orientation(tmp_path): str_path = str(tmp_path / "WD") inputfile = data_file("test_stutter.txt") @@ -326,3 +373,30 @@ def test_forward_strand_orientation(tmp_path): all_arglist = ["strs", "all", "-w", str_path] lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(all_arglist)) assert filecmp.cmp(exp_out, obs_out) is True + + +def test_lusplus_sequence_info(tmp_path): + str_path = str(tmp_path / "WD") + inputfile = data_file("test_stutter_lusp.txt") + exp_out = data_file("LUSPlus_stutter_test/LUSPlus_sequence_info.csv") + obs_out = str(tmp_path / f"WD/LUSPlus/LUSPlus_sequence_info.csv") + arglist = [ + "config", + "-w", + str_path, + "--input", + "WD", + "-o", + "LUSPlus", + "--strand", + "forward", + "--str-type", + "lusplus", + "--efm", + ] + lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist)) + shutil.copyfile(inputfile, os.path.join(str_path, "LUSPlus.csv")) + shutil.copyfile(inputfile, os.path.join(str_path, "LUSPlus.txt")) + all_arglist = ["strs", "all", "-w", str_path] + lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(all_arglist)) + assert filecmp.cmp(exp_out, obs_out) is True diff --git a/lusSTR/tests/test_suite.py b/lusSTR/tests/test_suite.py index 8cfcef4c..7ce41c1e 100644 --- a/lusSTR/tests/test_suite.py +++ b/lusSTR/tests/test_suite.py @@ -254,7 +254,7 @@ def test_config(tmp_path): def test_config_settings(tmp_path): obs_config = str(tmp_path / "config.yaml") - arglist = ["config", "-w", str(tmp_path), "--straitrazor", "--reference", "--ce"] + arglist = ["config", "-w", str(tmp_path), "--straitrazor", "--reference", "--str-type", "ce"] lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist)) with open(obs_config, "r") as file: data = yaml.safe_load(file) diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index c0e828e3..790b58f3 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -77,9 +77,10 @@ def process_strs(dict_loc, datatype, seq_col): data_combine = data.groupby(["SampleID", "Locus", "CE_Allele"], as_index=False)[ "Reads" ].sum() - data_order = data_combine.sort_values(by=["CE_Allele"], ascending=False).reset_index( - drop=True - ) + elif datatype == "lusplus": + data_combine = data.groupby( + ["SampleID", "Locus", "CE_Allele", "LUS_Plus"], as_index=False + )["Reads"].sum() else: data_combine = data[ [ @@ -91,9 +92,9 @@ def process_strs(dict_loc, datatype, seq_col): "Reads", ] ] - data_order = data_combine.sort_values(by=["CE_Allele"], ascending=False).reset_index( - drop=True - ) + data_order = data_combine.sort_values(by=["CE_Allele"], ascending=False).reset_index( + drop=True + ) total_reads = data_order["Reads"].sum() locus = key[1] data_order = data_order.reindex( @@ -111,28 +112,43 @@ def process_strs(dict_loc, datatype, seq_col): ) filtered_df = filters(data_order, locus, total_reads, datatype, brack_col) final_df = final_df.append(filtered_df) - flags_df = flags_df.append(flags(filtered_df)) - final_df = final_df.astype({"CE_Allele": "float64", "Reads": "int"}) + flags_df = flags_df.append(flags(filtered_df, datatype)) + if datatype == "ce" or datatype == "ngs": + final_df = final_df.astype({"CE_Allele": "float64", "Reads": "int"}) return final_df, flags_df -def EFM_output(profile, outfile, profile_type, separate=False): +def EFM_output(profile, outfile, profile_type, data_type, separate=False): if profile_type == "reference": profile = profile[profile.allele_type == "real_allele"] else: profile = profile[profile.allele_type != "BelowAT"] - efm_profile = populate_efm_profile(profile) + efm_profile = populate_efm_profile(profile, data_type) if separate: - write_sample_specific_efm_profiles(efm_profile, profile_type, outfile) + write_sample_specific_efm_profiles(efm_profile, profile_type, data_type, outfile) else: - write_aggregate_efm_profile(efm_profile, profile_type, outfile) + write_aggregate_efm_profile(efm_profile, profile_type, data_type, outfile) -def populate_efm_profile(profile): - profile = profile.sort_values(by=["SampleID", "Locus", "CE_Allele"]) +def populate_efm_profile(profile, data_type): + if data_type == "ce": + prof_col = "CE_Allele" + elif data_type == "lusplus": + prof_col = "LUS_Plus" + else: + message = ( + f"Incorrect data type {data_type} specified for EFM. Please choose either " + "'ce' or 'lusplus'." + ) + raise ValueError(message) + profile = profile.sort_values(by=["SampleID", "Locus", prof_col]) + profile = profile.rename(columns={prof_col: "Allele"}) allele_heights = defaultdict(lambda: defaultdict(dict)) for i, row in profile.iterrows(): - allele_heights[row.SampleID][row.Locus][float(row.CE_Allele)] = int(row.Reads) + if data_type == "ce": + allele_heights[row.SampleID][row.Locus][float(row.Allele)] = int(row.Reads) + else: + allele_heights[row.SampleID][row.Locus][row.Allele] = int(row.Reads) max_num_alleles = determine_max_num_alleles(allele_heights) reformatted_profile = list() for sampleid, loci in allele_heights.items(): @@ -161,13 +177,13 @@ def populate_efm_profile(profile): return efm_profile -def write_sample_specific_efm_profiles(efm_profile, profile_type, outdir): +def write_sample_specific_efm_profiles(efm_profile, profile_type, data_type, outdir): Path(outdir).mkdir(parents=True, exist_ok=True) for sample in efm_profile.SampleName: sample_profile = efm_profile[efm_profile.SampleName == sample].reset_index(drop=True) sample_profile.dropna(axis=1, how="all", inplace=True) if profile_type == "evidence": - sample_profile.to_csv(f"{outdir}/{sample}_evidence_ce.csv", index=False) + sample_profile.to_csv(f"{outdir}/{sample}_evidence_{data_type}.csv", index=False) else: num_alleles = (len(sample_profile.columns) - 2) / 2 if num_alleles > 2: @@ -180,19 +196,21 @@ def write_sample_specific_efm_profiles(efm_profile, profile_type, outdir): for i in range(len(sample_profile)): if pd.isna(sample_profile.loc[i, "Allele2"]): sample_profile.loc[i, "Allele2"] = sample_profile.loc[i, "Allele1"] - sample_profile.iloc[:, :4].to_csv(f"{outdir}/{sample}_reference_ce.csv", index=False) + sample_profile.iloc[:, :4].to_csv( + f"{outdir}/{sample}_reference_{data_type}.csv", index=False + ) -def write_aggregate_efm_profile(efm_profile, profile_type, outfile): +def write_aggregate_efm_profile(efm_profile, profile_type, data_type, outfile): Path(outfile).mkdir(parents=True, exist_ok=True) name = os.path.basename(outfile) if profile_type == "evidence": - efm_profile.to_csv(f"{outfile}/{name}_evidence_ce.csv", index=False) + efm_profile.to_csv(f"{outfile}/{name}_evidence_{data_type}.csv", index=False) else: for i in range(len(efm_profile)): if pd.isna(efm_profile.loc[i, "Allele2"]): efm_profile.loc[i, "Allele2"] = efm_profile.loc[i, "Allele1"] - efm_profile.iloc[:, :4].to_csv(f"{outfile}/{name}_reference_ce.csv", index=False) + efm_profile.iloc[:, :4].to_csv(f"{outfile}/{name}_reference_{data_type}.csv", index=False) def determine_max_num_alleles(allele_heights): @@ -212,6 +230,12 @@ def STRmix_output(profile, outdir, profile_type, data_type, seq_col): filtered_df = profile[profile.allele_type != "BelowAT"] if data_type == "ce": strmix_profile = strmix_ce_processing(filtered_df) + elif data_type == "lusplus": + error_message = ( + "LUSPlus specified as the data type. STRmix does not accept LUSPlus! Please rerun with " + "'ce' or 'ngs' specified." + ) + raise ValueError(error_message) else: strmix_profile = filtered_df.loc[:, ["SampleID", "Locus", "CE_Allele", seq_col, "Reads"]] strmix_profile.rename( @@ -302,7 +326,7 @@ def main( input = str(input) if profile_type not in ("evidence", "reference"): raise ValueError(f"unknown profile type '{profile_type}'") - if data_type not in ("ce", "ngs"): + if data_type not in ("ce", "ngs", "lusplus"): raise ValueError(f"unknown data type '{data_type}'") if output_type not in ("efm", "strmix"): raise ValueError(f"unknown output type '{output_type}'") @@ -315,14 +339,14 @@ def main( if nofilters: full_df["allele_type"] = "real_allele" if output_type == "efm": - EFM_output(full_df, outpath, profile_type, separate) + EFM_output(full_df, outpath, profile_type, data_type, separate) else: STRmix_output(full_df, outpath, profile_type, data_type, seq_col) else: dict_loc = {k: v for k, v in full_df.groupby(["SampleID", "Locus"])} final_df, flags_df = process_strs(dict_loc, data_type, seq_col) if output_type == "efm": - EFM_output(final_df, outpath, profile_type, separate) + EFM_output(final_df, outpath, profile_type, data_type, separate) else: STRmix_output(final_df, outpath, profile_type, data_type, seq_col) if info: