From 9a25dcf430c2e45c49096d6dbf7e1f1592c8c6dc Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 19 Jul 2023 06:56:17 -0400 Subject: [PATCH] added strand orientation option to STRs --- lusSTR/cli/config.py | 9 +++++++++ lusSTR/data/config.yaml | 6 +----- lusSTR/workflows/strs.smk | 1 + lusSTR/wrappers/filter.py | 32 ++++++++++++++++++++------------ 4 files changed, 31 insertions(+), 17 deletions(-) diff --git a/lusSTR/cli/config.py b/lusSTR/cli/config.py index ef005422..ae0dd49e 100644 --- a/lusSTR/cli/config.py +++ b/lusSTR/cli/config.py @@ -53,6 +53,8 @@ def edit_snp_config(config, args): data["references"] = args.ref else: data["references"] = None + if args.strand: + data["strand"] = args.strand return data @@ -85,6 +87,8 @@ def edit_str_config(config, args): data["data_type"] = "ce" if args.efm: data["output_type"] = "efm" + if args.strand: + data["strand"] = args.strand return data @@ -152,3 +156,8 @@ def subparser(subparsers): "--snp-reference", dest="ref", help="Specify any references for SNP data for use in EFM." ) + p.add_argument( + "--strand", choices=["uas", "forward"], + help="Specify the strand orientation for the final output files. UAS orientation is " + "default for STRs; forward strand is default for SNPs." + ) diff --git a/lusSTR/data/config.yaml b/lusSTR/data/config.yaml index c35a3c8d..0ad3e7e8 100644 --- a/lusSTR/data/config.yaml +++ b/lusSTR/data/config.yaml @@ -18,8 +18,4 @@ data_type: "ngs" ## ce/ngs info: True ## True/False; create allele information file separate: False ##True/False; for EFM only, if True will create individual files for samples; if False, will create one file with all samples nofilters: False ##True/False; skip all filtering steps but still creates EFM/STRmix output files - - - - - +strand: uas ##uas/forward; strand orientation to report \ No newline at end of file diff --git a/lusSTR/workflows/strs.smk b/lusSTR/workflows/strs.smk index fd14c43e..c9955579 100644 --- a/lusSTR/workflows/strs.smk +++ b/lusSTR/workflows/strs.smk @@ -125,6 +125,7 @@ rule filter: info=config["info"], separate=config["separate"], filters=config["nofilters"], + strand=config["strand"] script: lusSTR.wrapper("filter") \ No newline at end of file diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 88c676f3..1cb5040e 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -63,9 +63,14 @@ def get_filter_metadata_file(): filter_marker_data = json.load(fh) -def process_strs(dict_loc, datatype): +def process_strs(dict_loc, datatype, seq_col): final_df = pd.DataFrame() flags_df = pd.DataFrame() + brack_col = ( + "UAS_Output_Bracketed_Notation" + if seq_col == "UAS_Output_Sequence" + else "Forward_Strand_Bracketed_Notation" + ) for key, value in dict_loc.items(): data = dict_loc[key].reset_index(drop=True) if datatype == "ce": @@ -80,9 +85,9 @@ def process_strs(dict_loc, datatype): [ "SampleID", "Locus", - "UAS_Output_Sequence", + seq_col, "CE_Allele", - "UAS_Output_Bracketed_Notation", + brack_col, "Reads", ] ] @@ -199,7 +204,7 @@ def determine_max_num_alleles(allele_heights): return max_num_alleles -def STRmix_output(profile, outdir, profile_type, data_type): +def STRmix_output(profile, outdir, profile_type, data_type, seq_col): Path(outdir).mkdir(parents=True, exist_ok=True) if profile_type == "reference": filtered_df = profile[profile.allele_type == "real_allele"] @@ -208,13 +213,12 @@ def STRmix_output(profile, outdir, profile_type, data_type): if data_type == "ce": strmix_profile = strmix_ce_processing(filtered_df) else: - strmix_profile = filtered_df.loc[ - :, ["SampleID", "Locus", "CE_Allele", "UAS_Output_Sequence", "Reads"] - ] + strmix_profile = filtered_df.loc[:, ["SampleID", "Locus", "CE_Allele", seq_col, "Reads"]] strmix_profile.rename( - {"CE_Allele": "CE Allele", "UAS_Output_Sequence": "Allele Seq"}, axis=1, inplace=True + {"CE_Allele": "CE Allele", seq_col: "Allele Seq"}, axis=1, inplace=True ) strmix_profile = strmix_profile.sort_values(by=["SampleID", "Locus", "CE Allele"]) + print(strmix_profile) strmix_profile.replace( {"Locus": {"VWA": "vWA", "PENTA D": "PentaD", "PENTA E": "PentaE"}}, inplace=True ) @@ -293,7 +297,9 @@ def format_ref_table(new_rows, sample_data, datatype): return sort_df -def main(input, output_type, profile_type, data_type, output_dir, info, separate, nofilters): +def main( + input, output_type, profile_type, data_type, output_dir, info, separate, nofilters, strand +): input = str(input) if profile_type not in ("evidence", "reference"): raise ValueError(f"unknown profile type '{profile_type}'") @@ -306,19 +312,20 @@ def main(input, output_type, profile_type, data_type, output_dir, info, separate raise ValueError("No output specified using --out.") else: outpath = output_dir + seq_col = "UAS_Output_Sequence" if strand == "uas" else "Forward_Strand_Sequence" if nofilters: full_df["allele_type"] = "real_allele" if output_type == "efm": EFM_output(full_df, outpath, profile_type, separate) else: - STRmix_output(full_df, outpath, profile_type, data_type) + STRmix_output(full_df, outpath, profile_type, data_type, seq_col) else: dict_loc = {k: v for k, v in full_df.groupby(["SampleID", "Locus"])} - final_df, flags_df = process_strs(dict_loc, data_type) + final_df, flags_df = process_strs(dict_loc, data_type, seq_col) if output_type == "efm": EFM_output(final_df, outpath, profile_type, separate) else: - STRmix_output(final_df, outpath, profile_type, data_type) + STRmix_output(final_df, outpath, profile_type, data_type, seq_col) if info: name = os.path.basename(outpath) final_df.to_csv(f"{outpath}/{name}_sequence_info.csv", index=False) @@ -336,4 +343,5 @@ def main(input, output_type, profile_type, data_type, output_dir, info, separate info=snakemake.params.info, separate=snakemake.params.separate, nofilters=snakemake.params.filters, + strand=snakemake.params.strand, )