Skip to content

Commit

Permalink
Add LUS+ as possible STR data type (#57)
Browse files Browse the repository at this point in the history
  • Loading branch information
rnmitchell committed Nov 22, 2023
1 parent 24fc3cb commit 03b8979
Show file tree
Hide file tree
Showing 12 changed files with 378 additions and 87 deletions.
10 changes: 7 additions & 3 deletions lusSTR/cli/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ def edit_str_config(config, args):
data["info"] = False
if args.reference:
data["profile_type"] = "reference"
if args.ce:
data["data_type"] = "ce"
if args.datatype:
data["data_type"] = args.datatype
if args.efm:
data["output_type"] = "efm"
if args.strand:
Expand Down Expand Up @@ -127,7 +127,11 @@ def subparser(subparsers):
help="Use for creating Reference profiles for STR workflow"
)
p.add_argument("--efm", action="store_true",help="Use to create EuroForMix profiles")
p.add_argument("--ce", action="store_true", help="Use for CE data")
p.add_argument(
"--str-type", choices=["ce", "ngs", "lusplus"], default="ngs",
dest="datatype", help="Data type for STRs. Options are: CE allele ('ce'), sequence "
"('ngs'), or LUS+ allele ('lusplus'). Default is 'ngs'.",
)
p.add_argument(
"--noinfo", action="store_true",
help="Use to not create the Sequence Information File in the 'filter' step"
Expand Down
118 changes: 88 additions & 30 deletions lusSTR/scripts/filter_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ def filters(locus_allele_info, locus, locus_reads, datatype, brack_col):
locus_allele_info = ce_filtering(
locus_allele_info, locus_reads, metadata, datatype, brack_col
)
if datatype == "ngs":
locus_allele_info = same_size_filter(locus_allele_info, metadata)
if datatype != "ce":
locus_allele_info = same_size_filter(locus_allele_info, metadata, datatype)
return locus_allele_info


Expand Down Expand Up @@ -146,14 +146,19 @@ def allele_ident(
locus_allele_info, init_type_all, metadata, ref_allele_reads, i, j, datatype, brack_col
):
quest_al_reads = locus_allele_info.loc[j, "Reads"]
ref_allele = float(locus_allele_info.loc[i, "CE_Allele"])
question_allele = float(locus_allele_info.loc[j, "CE_Allele"])
if datatype == "ngs":
ref_bracket = locus_allele_info.loc[i, brack_col]
question_bracket = locus_allele_info.loc[j, brack_col]
if datatype == "lusplus":
ref_allele = float(locus_allele_info.loc[i, "LUS_Plus"].split("_")[0])
question_allele = float(locus_allele_info.loc[j, "LUS_Plus"].split("_")[0])
ref_altformat = locus_allele_info.loc[i, "LUS_Plus"]
question_altformat = locus_allele_info.loc[j, "LUS_Plus"]
else:
ref_bracket = None
question_bracket = None
ref_allele = float(locus_allele_info.loc[i, "CE_Allele"])
question_allele = float(locus_allele_info.loc[j, "CE_Allele"])
ref_altformat = None
question_altformat = None
if datatype == "ngs":
ref_altformat = locus_allele_info.loc[i, brack_col]
question_altformat = locus_allele_info.loc[j, brack_col]
locus_allele_info.loc[j, ["allele_type", "perc_stutter"]] = allele_type(
ref_allele,
question_allele,
Expand All @@ -163,22 +168,22 @@ def allele_ident(
ref_allele_reads,
locus_allele_info.loc[j, "allele1_ref_reads"],
locus_allele_info,
ref_bracket,
question_bracket,
ref_altformat,
question_altformat,
datatype,
brack_col,
)
if "stutter" in locus_allele_info.loc[j, "allele_type"]:
if "/" in locus_allele_info.loc[j, "allele_type"] and pd.isnull(
locus_allele_info.loc[j, "parent_allele2"]
):
stut_allele2 = ref_allele if datatype == "ce" else ref_bracket
stut_allele2 = ref_allele if datatype == "ce" else ref_altformat
locus_allele_info.loc[j, ["parent_allele2", "allele2_ref_reads"]] = [
stut_allele2,
ref_allele_reads,
]
elif pd.isnull(locus_allele_info.loc[j, "parent_allele1"]):
stut_allele1 = ref_allele if datatype == "ce" else ref_bracket
stut_allele1 = ref_allele if datatype == "ce" else ref_altformat
locus_allele_info.loc[j, ["parent_allele1", "allele1_ref_reads"]] = [
stut_allele1,
ref_allele_reads,
Expand Down Expand Up @@ -263,8 +268,8 @@ def allele_type(
ref_reads,
al1_ref_reads,
all_type_df,
ref_bracket,
question_bracket,
ref_altformat,
question_altformat,
datatype,
brack_col,
):
Expand All @@ -275,8 +280,16 @@ def allele_type(
allele_diff = round(ref - ce, 1)
if allele_diff == 1 and ref_reads > quest_al_reads: # -1 stutter
if (
datatype == "ngs" and bracketed_stutter_id(ref_bracket, question_bracket, -1) == -1
) or datatype == "ce":
(
datatype == "ngs"
and bracketed_stutter_id(ref_altformat, question_altformat, -1) == -1
)
or datatype == "ce"
or (
datatype == "lusplus"
and lusplus_stutter_id(ref_altformat, question_altformat, -1) == -1
)
):
all_type, stut_perc = minus1_stutter(
all_type,
stutter_thresh,
Expand All @@ -287,11 +300,19 @@ def allele_type(
quest_al_reads,
)
elif allele_diff == 2 and ref_reads > quest_al_reads: # -2 stutter
allele = ce if datatype == "ce" else question_bracket
allele = ce if datatype == "ce" else question_altformat
if check_2stutter(all_type_df, datatype, allele, brack_col)[0] is True:
if (
datatype == "ngs" and bracketed_stutter_id(ref_bracket, question_bracket, -2) == -2
) or datatype == "ce":
(
datatype == "ngs"
and bracketed_stutter_id(ref_altformat, question_altformat, -2) == -2
)
or datatype == "ce"
or (
datatype == "lusplus"
and lusplus_stutter_id(ref_altformat, question_altformat, -2) == -2
)
):
ref_reads = check_2stutter(all_type_df, datatype, allele, brack_col)[1]
stutter_thresh_reads = stutter_thresh * ref_reads
all_type, stut_perc = minus2_stutter(
Expand All @@ -305,8 +326,13 @@ def allele_type(
)
elif allele_diff == -1 and ref_reads > quest_al_reads: # +1 stutter
if (
datatype == "ngs" and bracketed_stutter_id(ref_bracket, question_bracket, 1) == 1
) or datatype == "ce":
(datatype == "ngs" and bracketed_stutter_id(ref_altformat, question_altformat, 1) == 1)
or datatype == "ce"
or (
datatype == "lusplus"
and lusplus_stutter_id(ref_altformat, question_altformat, 1) == 1
)
):
all_type, stut_perc = plus1_stutter(
all_type, stutter_thresh, forward_thresh, ref_reads, al1_ref_reads, quest_al_reads
)
Expand Down Expand Up @@ -342,9 +368,14 @@ def check_2stutter(stutter_df, allele_des, allele, brack_col):
break
else:
for k, row in stutter_df.iterrows():
bracket_test = stutter_df.loc[k, brack_col]
if bracketed_stutter_id(bracket_test, allele, -1) == -1:
is_true, reads = True, stutter_df.loc[k, "Reads"]
if allele_des == "ngs":
bracket_test = stutter_df.loc[k, brack_col]
if bracketed_stutter_id(bracket_test, allele, -1) == -1:
is_true, reads = True, stutter_df.loc[k, "Reads"]
else:
lusp_test = stutter_df.loc[k, "LUS_Plus"]
if lusplus_stutter_id(lusp_test, allele, -1) == -1:
is_true, reads = True, stutter_df.loc[k, "Reads"]
return is_true, reads


Expand Down Expand Up @@ -409,6 +440,26 @@ def bracketed_stutter_id(ref_bracket, quest_bracket, stutter_id):
return stutter


def lusplus_stutter_id(ref_lusp, question_lusp, stutter_id):
diffcount = 0
stutter = None
for j in range(1, len(ref_lusp.split("_"))):
diff = float(question_lusp.split("_")[j]) - round(float(ref_lusp.split("_")[j]))
if diff == 0:
continue
elif diff == stutter_id:
if diffcount == 0:
diffcount = diff
stutter = stutter_id
else:
stutter = None
break
else:
stutter = None
break
return stutter


def allele_imbalance_check(allele_df):
imbalance_df = pd.DataFrame(columns=["SampleID", "Locus", "Flags"])
try:
Expand All @@ -430,10 +481,13 @@ def allele_imbalance_check(allele_df):
return imbalance_df


def check_D7(allele_df):
def check_D7(allele_df, datatype):
D7_df = pd.DataFrame(columns=["SampleID", "Locus", "Flags"])
for i in range(len(allele_df)):
al = allele_df.loc[i, "CE_Allele"]
if datatype == "lusplus":
al = allele_df.loc[i, "LUS_Plus"].split("_")[0]
else:
al = allele_df.loc[i, "CE_Allele"]
try:
if str(al).split(".")[1] == "1" and allele_df.loc[i, "allele_type"] != "BelowAT":
print("D7 microvariants detected! Check flagged file for details.")
Expand All @@ -447,16 +501,18 @@ def check_D7(allele_df):
return D7_df


def flags(allele_df):
def flags(allele_df, datatype):
flags_df = pd.DataFrame(columns=["SampleID", "Locus", "Flags"])
flags_df = flags_df.append(allele_counts(allele_df))
flags_df = flags_df.append(allele_imbalance_check(allele_df))
flags_df = flags_df.append(check_D7(allele_df))
flags_df = flags_df.append(check_D7(allele_df, datatype))
return flags_df


def same_size_filter(df, metadata):
def same_size_filter(df, metadata, datatype):
final_df = pd.DataFrame()
if datatype == "lusplus":
df["CE_Allele"] = df["LUS_Plus"].apply(lambda x: x.split("_")[0])
al_list = df["CE_Allele"].unique()
for ce_allele in al_list:
df_filt = (
Expand All @@ -475,4 +531,6 @@ def same_size_filter(df, metadata):
else:
final_df = final_df.append(df_filt)
final_df = final_df.reset_index(drop=True)
if datatype == "lusplus":
final_df = final_df.drop("CE_Allele", axis=1)
return final_df
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
SampleName,Marker,Allele1,Allele2
Positive_Control,CSF1PO,12_12_0,12_12_0
Positive_Control,D10S1248,13_13,15_15
Positive_Control,D12S391,18_11_6_0,23_14_9_0
Positive_Control,D13S317,11_12_3_1,9_9_3_1
Positive_Control,D16S539,13_13_0,9_9_0
Positive_Control,D17S1301,11_11,12_12
Positive_Control,D18S51,16_16_1,18_18_1
Positive_Control,D19S433,13_11_1_0,14_12_1_0
Positive_Control,D1S1656,12_11_1_0,13_13_0_0
Positive_Control,D20S482,14_14,15_15
Positive_Control,D21S11,29_11_4_6,31.2_11_5_6
Positive_Control,D22S1045,16_13,16_13
Positive_Control,D2S1338,22_12_1_7,25_15_1_7
Positive_Control,D2S441,10_10_0,14_11_0
Positive_Control,D3S1358,17_13_3,18_14_3
Positive_Control,D4S2408,9_9_0,9_9_0
Positive_Control,D5S818,12_12,12_12
Positive_Control,D6S1043,12_12_0,20_14_1
Positive_Control,D7S820,11_11_1_0,8_8_1_0
Positive_Control,D8S1179,14_12_1_0,15_12_1_0
Positive_Control,D9S1122,12_10,12_12
Positive_Control,FGA,20_12_3_0,23_15_3_0
Positive_Control,PENTA D,12_12,13_13
Positive_Control,PENTA E,14_14,7_7
Positive_Control,TH01,6_6,9.3_6
Positive_Control,TPOX,11_11,11_11
Positive_Control,VWA,16_12_3_1,19_14_4_1
24 changes: 24 additions & 0 deletions lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
SampleID,Locus,LUS_Plus,Reads,allele_type,parent_allele1,parent_allele2,allele1_ref_reads,allele2_ref_reads,perc_noise,perc_stutter,CE_Allele
Sample1,D4S2408,10_10_0,1022,real_allele,,,,,,,
Sample1,D4S2408,9_9_0,116,-1_stutter/+1_stutter,10_10_0,8_8_0,1022.0,1050.0,,,
Sample1,D4S2408,8_8_0,1050,real_allele,,,,,,,
Sample1,D8S1179,14_12_1_0,869,real_allele,,,,,,,
Sample1,D8S1179,13_11_1_0,184,-1_stutter,14_12_1_0,,869.0,,,0.212,
Sample1,D8S1179,12_10_1_0,37,-2_stutter,14_12_1_0,,869.0,,,0.201,
Sample1,D9S1122,13_11,948,real_allele,,,,,,,
Sample1,D9S1122,12_10,108,-1_stutter,13_11,,948.0,,,0.114,
Sample1,D9S1122,11_11,991,real_allele,,,,,,,
Sample1,D9S1122,10_10,87,-1_stutter,11_11,,991.0,,,0.088,
Sample1,FGA,23_15_3_0,1436,real_allele,,,,,,,
Sample1,FGA,22_14_3_0,262,-1_stutter,23_15_3_0,,1436.0,,,0.182,
Sample1,FGA,21_13_3_0,48,BelowAT,,,,,0.013,,
Sample1,FGA,20_12_3_0,1750,real_allele,,,,,,,
Sample1,FGA,18_10_3_0,181,real_allele,,,,,,,
Sample1,FGA,17_9_3_0,15,BelowAT,,,,,0.004,,
Sample1,PENTA D,15_15,50,real_allele,,,,,,,
Sample1,PENTA D,13_13,1000,real_allele,,,,,,,
Sample1,PENTA E,7_7,505,real_allele,,,,,,,7.0
Sample1,TH01,7_7,2197,real_allele,,,,,,,
Sample1,TH01,6_6,1632,real_allele,,,,,,,
Sample1,TH01,5_5,66,BelowAT,,,,,0.017,,
Sample1,TPOX,11_11,15,BelowAT,,,,,1.0,,11.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
SampleName,Marker,Allele1,Allele2,Allele3,Allele4,Height1,Height2,Height3,Height4
Sample1,CSF1PO,,,,,,,,
Sample1,D10S1248,,,,,,,,
Sample1,D12S391,,,,,,,,
Sample1,D13S317,,,,,,,,
Sample1,D16S539,,,,,,,,
Sample1,D17S1301,,,,,,,,
Sample1,D18S51,,,,,,,,
Sample1,D19S433,,,,,,,,
Sample1,D1S1656,,,,,,,,
Sample1,D20S482,,,,,,,,
Sample1,D21S11,,,,,,,,
Sample1,D22S1045,,,,,,,,
Sample1,D2S1338,,,,,,,,
Sample1,D2S441,,,,,,,,
Sample1,D3S1358,,,,,,,,
Sample1,D4S2408,10_10_0,8_8_0,9_9_0,,900,1000,1357,
Sample1,D5S818,,,,,,,,
Sample1,D6S1043,,,,,,,,
Sample1,D7S820,,,,,,,,
Sample1,D8S1179,12_9_1_0,13_10_1_0,13_11_1_0,14_11_1_0,26,89,95,739
Sample1,D9S1122,10_10,11_11,12_10,13_11,87,991,108,948
Sample1,FGA,18_10_3_0,20_12_3_0,22_14_3_0,23_15_3_0,181,1750,262,1436
Sample1,PENTA D,13_13,15_15,,,1000,50,,
Sample1,PENTA E,7_7,,,,505,,,
Sample1,TH01,6_6,7_7,,,1632,2197,,
Sample1,TPOX,,,,,,,,
Sample1,VWA,,,,,,,,
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
SampleID,Locus,LUS_Plus,Reads,allele_type,parent_allele1,parent_allele2,allele1_ref_reads,allele2_ref_reads,perc_noise,perc_stutter,CE_Allele
Sample1,D4S2408,10_10_0,900.0,real_allele,,,,,,,
Sample1,D4S2408,9_9_0,1357.0,real_allele,,,,,,,
Sample1,D4S2408,8_8_0,1000.0,real_allele,,,,,,,
Sample1,D8S1179,14_11_1_0,739.0,real_allele,,,,,,,
Sample1,D8S1179,13_11_1_0,95.0,real_allele,,,,,,,
Sample1,D8S1179,13_10_1_0,89.0,-1_stutter,14_11_1_0,,739.0,,,0.12,
Sample1,D8S1179,12_9_1_0,26.0,real_allele,,,,,,,
Sample1,D8S1179,12_10_1_0,11.0,BelowAT,,,,,0.01,,
Sample1,D9S1122,13_11,948.0,real_allele,,,,,,,
Sample1,D9S1122,12_10,108.0,-1_stutter,13_11,,948.0,,,0.114,
Sample1,D9S1122,11_11,991.0,real_allele,,,,,,,
Sample1,D9S1122,10_10,87.0,-1_stutter,11_11,,991.0,,,0.088,
Sample1,FGA,23_15_3_0,1436.0,real_allele,,,,,,,
Sample1,FGA,22_14_3_0,262.0,-1_stutter,23_15_3_0,,1436.0,,,0.182,
Sample1,FGA,21_13_3_0,48.0,BelowAT,,,,,0.013,,
Sample1,FGA,20_12_3_0,1750.0,real_allele,,,,,,,
Sample1,FGA,18_10_3_0,181.0,real_allele,,,,,,,
Sample1,FGA,17_9_3_0,15.0,BelowAT,,,,,0.004,,
Sample1,PENTA D,15_15,50.0,real_allele,,,,,,,
Sample1,PENTA D,13_13,1000.0,real_allele,,,,,,,
Sample1,PENTA E,7_7,505.0,real_allele,,,,,,,7.0
Sample1,TH01,7_7,2197.0,real_allele,,,,,,,
Sample1,TH01,6_6,1632.0,real_allele,,,,,,,
Sample1,TH01,5_5,66.0,BelowAT,,,,,0.017,,
Sample1,TPOX,11_11,15.0,BelowAT,,,,,1.0,,11.0
Loading

0 comments on commit 03b8979

Please sign in to comment.