Skip to content

Commit

Permalink
added strand orientation option to STRs
Browse files Browse the repository at this point in the history
  • Loading branch information
rnmitchell committed Jul 19, 2023
1 parent c99f319 commit 9a25dcf
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 17 deletions.
9 changes: 9 additions & 0 deletions lusSTR/cli/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ def edit_snp_config(config, args):
data["references"] = args.ref
else:
data["references"] = None
if args.strand:
data["strand"] = args.strand
return data


Expand Down Expand Up @@ -85,6 +87,8 @@ def edit_str_config(config, args):
data["data_type"] = "ce"
if args.efm:
data["output_type"] = "efm"
if args.strand:
data["strand"] = args.strand
return data


Expand Down Expand Up @@ -152,3 +156,8 @@ def subparser(subparsers):
"--snp-reference", dest="ref",
help="Specify any references for SNP data for use in EFM."
)
p.add_argument(
"--strand", choices=["uas", "forward"],
help="Specify the strand orientation for the final output files. UAS orientation is "
"default for STRs; forward strand is default for SNPs."
)
6 changes: 1 addition & 5 deletions lusSTR/data/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,4 @@ data_type: "ngs" ## ce/ngs
info: True ## True/False; create allele information file
separate: False ##True/False; for EFM only: if True, creates a separate file for each sample; if False, creates one file containing all samples
nofilters: False ##True/False; if True, skips all filtering steps but still creates EFM/STRmix output files





strand: uas ##uas/forward; strand orientation to report
1 change: 1 addition & 0 deletions lusSTR/workflows/strs.smk
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ rule filter:
info=config["info"],
separate=config["separate"],
filters=config["nofilters"],
strand=config["strand"]
script:
lusSTR.wrapper("filter")

32 changes: 20 additions & 12 deletions lusSTR/wrappers/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,14 @@ def get_filter_metadata_file():
filter_marker_data = json.load(fh)


def process_strs(dict_loc, datatype):
def process_strs(dict_loc, datatype, seq_col):
final_df = pd.DataFrame()
flags_df = pd.DataFrame()
brack_col = (
"UAS_Output_Bracketed_Notation"
if seq_col == "UAS_Output_Sequence"
else "Forward_Strand_Bracketed_Notation"
)
for key, value in dict_loc.items():
data = dict_loc[key].reset_index(drop=True)
if datatype == "ce":
Expand All @@ -80,9 +85,9 @@ def process_strs(dict_loc, datatype):
[
"SampleID",
"Locus",
"UAS_Output_Sequence",
seq_col,
"CE_Allele",
"UAS_Output_Bracketed_Notation",
brack_col,
"Reads",
]
]
Expand Down Expand Up @@ -199,7 +204,7 @@ def determine_max_num_alleles(allele_heights):
return max_num_alleles


def STRmix_output(profile, outdir, profile_type, data_type):
def STRmix_output(profile, outdir, profile_type, data_type, seq_col):
Path(outdir).mkdir(parents=True, exist_ok=True)
if profile_type == "reference":
filtered_df = profile[profile.allele_type == "real_allele"]
Expand All @@ -208,13 +213,12 @@ def STRmix_output(profile, outdir, profile_type, data_type):
if data_type == "ce":
strmix_profile = strmix_ce_processing(filtered_df)
else:
strmix_profile = filtered_df.loc[
:, ["SampleID", "Locus", "CE_Allele", "UAS_Output_Sequence", "Reads"]
]
strmix_profile = filtered_df.loc[:, ["SampleID", "Locus", "CE_Allele", seq_col, "Reads"]]
strmix_profile.rename(
{"CE_Allele": "CE Allele", "UAS_Output_Sequence": "Allele Seq"}, axis=1, inplace=True
{"CE_Allele": "CE Allele", seq_col: "Allele Seq"}, axis=1, inplace=True
)
strmix_profile = strmix_profile.sort_values(by=["SampleID", "Locus", "CE Allele"])
print(strmix_profile)
strmix_profile.replace(
{"Locus": {"VWA": "vWA", "PENTA D": "PentaD", "PENTA E": "PentaE"}}, inplace=True
)
Expand Down Expand Up @@ -293,7 +297,9 @@ def format_ref_table(new_rows, sample_data, datatype):
return sort_df


def main(input, output_type, profile_type, data_type, output_dir, info, separate, nofilters):
def main(
input, output_type, profile_type, data_type, output_dir, info, separate, nofilters, strand
):
input = str(input)
if profile_type not in ("evidence", "reference"):
raise ValueError(f"unknown profile type '{profile_type}'")
Expand All @@ -306,19 +312,20 @@ def main(input, output_type, profile_type, data_type, output_dir, info, separate
raise ValueError("No output specified using --out.")
else:
outpath = output_dir
seq_col = "UAS_Output_Sequence" if strand == "uas" else "Forward_Strand_Sequence"
if nofilters:
full_df["allele_type"] = "real_allele"
if output_type == "efm":
EFM_output(full_df, outpath, profile_type, separate)
else:
STRmix_output(full_df, outpath, profile_type, data_type)
STRmix_output(full_df, outpath, profile_type, data_type, seq_col)
else:
dict_loc = {k: v for k, v in full_df.groupby(["SampleID", "Locus"])}
final_df, flags_df = process_strs(dict_loc, data_type)
final_df, flags_df = process_strs(dict_loc, data_type, seq_col)
if output_type == "efm":
EFM_output(final_df, outpath, profile_type, separate)
else:
STRmix_output(final_df, outpath, profile_type, data_type)
STRmix_output(final_df, outpath, profile_type, data_type, seq_col)
if info:
name = os.path.basename(outpath)
final_df.to_csv(f"{outpath}/{name}_sequence_info.csv", index=False)
Expand All @@ -336,4 +343,5 @@ def main(input, output_type, profile_type, data_type, output_dir, info, separate
info=snakemake.params.info,
separate=snakemake.params.separate,
nofilters=snakemake.params.filters,
strand=snakemake.params.strand,
)

0 comments on commit 9a25dcf

Please sign in to comment.