#!/usr/bin/env python3
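"""Derive a sample sheet from a directory of FASTQ files.

Filenames are matched against DEFAULT_FILENAME_PATTERNS (most specific
first) to recover sample ids and read pairing; the result is printed as
CSV with the columns sample,fastq_1,fastq_2.
"""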

import os
import re
import sys

import pandas as pd

# Filename patterns, tried in order: the most specific layout first, then
# generic Illumina variants, then catch-all fallbacks. "ambig" lists the
# named groups (lane, running number) that do not identify the sample on
# their own; "sep" overrides the default "_" separator when joining groups.
DEFAULT_FILENAME_PATTERNS = [
    ("illumina_some_institute_1", {
        "regex": r"(?P<date>\d+)_(?P<lab_id>\d{2}-\d{4,5}(-[^_]+)?)_(?P<sample_id>.+)"
                 r"_(?P<snum>S\d+)_(?P<lane>L\d{3})_(?P<read>R[12])"
                 r"_(?P<running>\d{3})",
        "ambig": ["lane", "running"]
    }),
    ("illumina1", {
        "regex": r"(?P<sample_id>.+)_(?P<snum>S\d+)_(?P<lane>L\d{3})_"
                 r"(?P<read>R[12])_(?P<running>\d{3})",
        "ambig": ["lane", "running"]
    }),
    ("illumina2", {
        "regex": r"(?P<sample_id>.+)_(?P<snum>S\d+)_"
                 r"(?P<read>R[12])_(?P<running>\d{3})",
        "ambig": ["running"]
    }),
    ("illumina3", {
        "regex": r"(?P<sample_id>.+)_(?P<snum>S\d+)_(?P<lane>L\d{3})_"
                 r"(?P<read>R[12])",
        "ambig": ["lane"]
    }),
    ("illumina4", {
        "regex": r"(?P<sample_id>.+)_S(?P<snum>\d+)_"
                 r"(?P<read>R[12])",
        "ambig": []
    }),
    ("illumina_fallback", {
        "regex": r"(?P<sample_id>.+)_"
                 r"(?P<read>R[12])(?P<residual>_.+)?",
        "ambig": []
    }),
    ("SRA", {
        "regex": r"(?P<sample_id>SR.+)_(?P<read>[12])",
        "ambig": []
    }),
    ("fallback1", {
        "regex": r"(?P<sample_id>.+)_(?P<read>[A-Za-z0-9]+)",
        "ambig": []
    }),
    ("fallback2", {
        "regex": r"(?P<sample_id>.+)\.(?P<read>[A-Za-z0-9]+)",
        "ambig": [],
        "sep": "."
    }),
]
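
# For illustration with a hypothetical file "Sample1_S1_L001_R1_001.fastq.gz":
# "illumina1" is the first pattern to match, yielding sample_id="Sample1",
# snum="S1", lane="L001", read="R1", running="001"; lane and running are
# listed as ambiguous, so the R1/R2 mates still group under one sample.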

# Map read-direction tokens (upper-cased before lookup) to mate number:
# 1 = forward/first mate, 2 = reverse/second mate.
PAIRED_READ_REF = {
    "1": 1,
    "2": 2,
    "R1": 1,
    "R2": 2,
    "FORWARD": 1,
    "REVERSE": 2,
    "FWD": 1,
    "REV": 2,
    "F": 1,
    "R": 2,
    "P": 1,
    "M": 2,
    "PLUS": 1,
    "MINUS": 2,
    "SENSE": 1,
    "ANTI": 2,
}
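# e.g. PAIRED_READ_REF["R1"] == 1 and PAIRED_READ_REF["rev".upper()] == 2.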

def check_and_add_fastq(_files, res, pdir=None, sample_names=None, alt_ids=None):
    """Match FASTQ filenames against the known patterns and append one row
    per file to res["sample_data"], keyed by (path, unambig_id, ambig_id, read).
    """
    single_mode = len(_files) == 1 and sample_names is not None
    patterns = dict(DEFAULT_FILENAME_PATTERNS)
    patterns_in_order = [key for key, _ in DEFAULT_FILENAME_PATTERNS]
    # Anchored suffix: .fq/.fnq/.fastq with optional .gz at the end of the name.
    end = r'\.(fq|fnq|fastq)(\.gz)?$'
    for _file in (fl for fl in _files if re.search(end, fl)):
        if pdir is None:
            path = os.path.dirname(os.path.realpath(_file))
        else:
            path = os.path.realpath(pdir)
        _file = os.path.basename(_file)
        for pnum, pattern in enumerate(patterns_in_order):
            regex_string = f'{patterns[pattern]["regex"]}{end}'
            match = re.match(regex_string, _file)
            if match is None:
                # Only report a file once every pattern has failed.
                if pnum == len(patterns_in_order) - 1:
                    print(f"FastQ does not meet any known spec: file: {_file}")
                continue
            sep = default_if_not("sep", patterns[pattern], "_")
            ambig_keys = default_if_not("ambig", patterns[pattern], [])
            match_dict = match.groupdict()
            # Join the ambiguous groups (lane, running number) and the
            # sample-identifying groups into separate key components.
            ambig = sep.join(match_dict[a] for a in sorted(ambig_keys))
            nonambig = sep.join(match_dict[na]
                                for na in sorted(match_dict.keys())
                                if na not in ambig_keys + ["read"]
                                and match_dict[na] is not None)
            sub_sample_id = sep.join(
                val if val is not None else ""
                for (_id, val) in match_dict.items()
                if _id not in ["read"])
            if pattern.startswith("illumina_some_institute"):
                sub_sample_id = "{sid}_{lid}".format(
                    sid=match_dict["sample_id"],
                    lid=match_dict["lab_id"])

            read = match_dict.get("read", "R1")
            try:
                read_id = PAIRED_READ_REF[read.upper()]
            except KeyError:
                read_id = 1
                print(f'Warning: Read name "{read}" not known')
                print(f"  Using Regex-pattern: {pattern} {regex_string}"
                      f" File: {_file}")

            key = (path, nonambig, ambig, read)
            if key in res["sample_data"].index:
                break
            new_entry = pd.DataFrame(
                [[path, nonambig, ambig, read, read_id,
                  "PLACE_HOLDER",
                  sample_names[0] if single_mode else sub_sample_id,
                  "PLACE_HOLDER",
                  alt_ids[0] if single_mode and alt_ids is not None
                  else "PLACE_HOLDER",
                  _file, pattern, regex_string, "new"]],
                columns=res["sample_data"].columns)
            new_entry.set_index(["path", "unambig_id", "ambig_id", "read"],
                                inplace=True, drop=False)
            # DataFrame.append was removed in pandas 2.0; concatenate instead.
            res["sample_data"] = pd.concat([res["sample_data"], new_entry])
            break

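# For illustration with hypothetical files "A_S1_L001_R1_001.fastq.gz" and
# "A_S1_L001_R2_001.fastq.gz": check_and_add_fastq adds two rows sharing the
# (path, "A_S1", "L001_001") key prefix, one for read "R1" and one for "R2".
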
def default_if_not(key, _dict, default):
    """Return _dict[key], or default if the key is missing (dict.get)."""
    return _dict.get(key, default)

def resolve_sample_id_conflicts(sample_data):
    """Make sub_sample_ids unique across directories; duplicates get a
    Duplicate_<n>_ prefix."""
    generate_alternative_ids(sample_data)
    prelim_groups = sample_data.groupby(["sub_sample_id"]).groups
    for sub_id in prelim_groups:
        paths_subgroup = sample_data[
            sample_data.sub_sample_id == sub_id].groupby(level="path").groups
        num_subids = len(paths_subgroup)
        if num_subids == 1:
            continue
        print(f"Warning: Found multiple samples ({num_subids}) with sample id"
              f" {sub_id}! Duplicate_<n>_ will be prepended to duplicates")
        for (_id, path) in enumerate(paths_subgroup):
            if _id == 0:  # Skip the first occurrence
                continue
            sample_data.loc[((sample_data.sub_sample_id == sub_id) &
                             (sample_data.path == path)),
                            "sub_sample_id"] = "Duplicate_{_id}_{samp}".format(
                                _id=_id, samp=sub_id)

def generate_alternative_ids(sample_data):
    """Assign a stable alt_sub_sample_id (Sample_<n> or Sample_<n>.<m>) to
    every row still carrying the PLACE_HOLDER marker, reusing ids from rows
    that already have one."""
    if not (sample_data["alt_sub_sample_id"] == "PLACE_HOLDER").any():
        # All samples are known and will just be passed on
        return
    old_samples = sample_data.loc[
        sample_data.alt_sub_sample_id != "PLACE_HOLDER"].copy()
    old_sample_groups = old_samples.groupby(
        level=["path", "unambig_id"]).groups
    num_old_samples = len(old_sample_groups)
    new_samples = sample_data.loc[
        sample_data.alt_sub_sample_id == "PLACE_HOLDER"].copy()
    new_sample_groups = new_samples.groupby(
        level=["path", "unambig_id"]).groups
    # Reuse alternative ids for samples that were seen before.
    for key in (key
                for key in new_sample_groups if key in old_sample_groups):
        new_sample_data = sample_data.loc[new_sample_groups[key]].copy()
        old_sample_data = sample_data.loc[old_sample_groups[key]].copy()
        old_sample_sub_groups = old_sample_data.groupby(
            level=["ambig_id"]).groups
        new_sample_sub_groups = new_sample_data.groupby(
            level=["ambig_id"]).groups
        for sub_key in (s_key
                        for s_key in new_sample_sub_groups
                        if s_key in old_sample_sub_groups):
            old_alt_id = old_sample_data.loc[
                old_sample_sub_groups[sub_key]].alt_sub_sample_id.iloc[0]
            sample_data.loc[
                new_sample_sub_groups[sub_key],
                "alt_sub_sample_id"] = old_alt_id

        for sub_key in (skey
                        for skey in new_sample_sub_groups
                        if skey not in old_sample_sub_groups):
            raise RuntimeError(
                "Use of given sample names and conflict resolution through"
                " longest common prefix is not implemented yet!!!")

    new_samples = sample_data.loc[
        sample_data.alt_sub_sample_id == "PLACE_HOLDER"].copy()
    new_sample_groups = new_samples.groupby(
        level=["path", "unambig_id"]).groups
    for (_id, (path, unambig)) in enumerate(new_sample_groups):
        main_name = "Sample_{nid}".format(nid=_id + 1 + num_old_samples)
        sample_data.loc[
            new_sample_groups[(path, unambig)],
            "alt_sample_id"] = main_name
        sub_groups = sample_data[
            sample_data.alt_sample_id == main_name].groupby(
                level=["ambig_id"]).groups
        if len(sub_groups) == 1:
            # Easy case: the sample is not split between lanes or spread
            # over multiple files with differing running numbers
            sample_data.loc[sample_data.alt_sample_id == main_name,
                            "alt_sub_sample_id"] = main_name
        else:
            for (sub_nid, ambig) in enumerate(sub_groups):
                sample_data.loc[((sample_data.alt_sample_id == main_name) &
                                 (sample_data.ambig_id == ambig)),
                                "alt_sub_sample_id"] = "{sid}.{nid}".format(
                                    sid=main_name, nid=sub_nid + 1)
    if (sample_data["alt_sub_sample_id"] == "PLACE_HOLDER").any():
        raise RuntimeError("Some unique ids could not be assigned")

def generate_sample_config(sample_data):
    """Build {sub_sample_id: {"read1": path, "read2": path, "alt_id": ...}}
    from the collected sample table."""
    samples = {
        key: {
            f"read{read_id}": os.path.join(
                sample_data[(sample_data.sub_sample_id == key) &
                            (sample_data.read_id == read_id)].path.iloc[0],
                sample_data[(sample_data.sub_sample_id == key) &
                            (sample_data.read_id == read_id)].file.iloc[0])
            for read_id in sample_data[
                sample_data.sub_sample_id == key].read_id}
        for key in sample_data.sub_sample_id}
    for key in samples:
        samples[key]["alt_id"] = sample_data[
            sample_data.sub_sample_id == key].alt_sub_sample_id.iloc[0]
    return samples

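# Illustrative result for one paired-end sample (hypothetical paths):
# {"A_S1_L001_001": {"read1": "/data/A_S1_L001_R1_001.fastq.gz",
#                    "read2": "/data/A_S1_L001_R2_001.fastq.gz",
#                    "alt_id": "Sample_1"}}
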
def main(fastq_dir):
    assert os.path.isdir(fastq_dir), f"{fastq_dir} does not exist"
    fastq_all = [f for f in os.listdir(fastq_dir)
                 if re.search(r'\.(fq|fnq|fastq)(\.gz)?$', f)]

    res = {"sample_data": pd.DataFrame(
               columns=["path", "unambig_id", "ambig_id", "read", "read_id",
                        "sample_id", "sub_sample_id",
                        "alt_sample_id", "alt_sub_sample_id",
                        "file", "match", "regex", "state"]),
           "delta_files": []}
    res["sample_data"].set_index(["path", "unambig_id", "ambig_id", "read"],
                                 inplace=True, drop=False)

    check_and_add_fastq(fastq_all, res, fastq_dir)
    samp_data = res["sample_data"]
    resolve_sample_id_conflicts(samp_data)
    samples = generate_sample_config(samp_data)

    print("sample,fastq_1,fastq_2")
    for sample in samples:
        # Single-end samples have no read2 entry; leave the field empty.
        print(f"{sample},{samples[sample].get('read1', '')},"
              f"{samples[sample].get('read2', '')}")

if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit(f"Usage: {sys.argv[0]} <fastq_dir>")
    main(sys.argv[1])