diff --git a/3rd-party-tools/build-indices/Dockerfile b/3rd-party-tools/build-indices/Dockerfile index 7f8d7075..410e1de9 100644 --- a/3rd-party-tools/build-indices/Dockerfile +++ b/3rd-party-tools/build-indices/Dockerfile @@ -36,6 +36,9 @@ ENV PATH /usr/local/STAR-${STAR_VERSION}/bin/Linux_x86_64_static/:$PATH WORKDIR /script COPY add-introns-to-gtf.py . COPY modify_gtf.py . +COPY modify_gtf_marmoset.py . +COPY create_marmoset_header_mt_genes.py . + # Add all scripts to PATH ENV PATH /script/:$PATH diff --git a/3rd-party-tools/build-indices/create_marmoset_header_mt_genes.py b/3rd-party-tools/build-indices/create_marmoset_header_mt_genes.py new file mode 100644 index 00000000..7e23088a --- /dev/null +++ b/3rd-party-tools/build-indices/create_marmoset_header_mt_genes.py @@ -0,0 +1,92 @@ +import sys + +def add_gene_entries(gtf_file): + + genes = {} + + # First collect ALL features for each gene + for line in open(gtf_file): + fields = line.strip().split('\t') + if len(fields) != 9: + continue + + attrs = {} + for attr in fields[8].split('; '): + if attr: + key, value = attr.split(' ', 1) + attrs[key] = value.strip('"') + + if 'gene_id' in attrs and attrs['gene_id'].startswith('MT-'): + gene_id = attrs['gene_id'] + # Track coordinates regardless of feature type + if gene_id not in genes: + genes[gene_id] = { + 'chrom': fields[0], + 'source': fields[1], + 'min_start': int(fields[3]), + 'max_end': int(fields[4]), + 'strand': fields[6], + 'attrs': attrs, + 'features': set([fields[2]]) # Track feature types + } + else: + genes[gene_id]['min_start'] = min(genes[gene_id]['min_start'], int(fields[3])) + genes[gene_id]['max_end'] = max(genes[gene_id]['max_end'], int(fields[4])) + genes[gene_id]['features'].add(fields[2]) + + gene_entries = [] + for gene_id, info in sorted(genes.items()): + attrs_str = f'gene_id "{gene_id}"; ' + if 'gene' in info['attrs']: + attrs_str += f'gene "{info["attrs"]["gene"]}"; ' + if 'transcript_biotype' in info['attrs']: + attrs_str += f'transcript_biotype "{info["attrs"]["transcript_biotype"]}"; ' + if 'gene_version' in info['attrs']: + attrs_str += f'gene_version "{info["attrs"]["gene_version"]}"; ' + if 'gene_name' in info['attrs']: + attrs_str += f'gene_name "{info["attrs"]["gene_name"]}"' + + gene_entry = [ + info['chrom'], + info['source'], + 'gene', + str(info['min_start']), + str(info['max_end']), + '.', + info['strand'], + '.', + attrs_str + ] + if gene_entry[-1].endswith(';"; '): + gene_entry[-1] = gene_entry[-1][:-3] + gene_entries.append('\t'.join(gene_entry)) + + + + + with open(gtf_file) as f: + + # First print the new header + new_header = """#gtf-version 2.2 +#!genome-build mCalJa1.2.pat.X +#!genome-build-accession NCBI_Assembly:GCF_011100555.1 +#!annotation-date 03/02/2023 +#!annotation-source NCBI RefSeq GCF_011100555.1-RS_2023_03 +##gff-version 2 +##source-version rtracklayer 1.58.0 +##date 2023-09-11""" + print(new_header) + + for line in f: + if not line.startswith("#"): + print(line.strip()) + for entry in gene_entries: + print(entry) + +if __name__ == '__main__': + if len(sys.argv) != 2: + print("Usage: python script.py input.gtf") + sys.exit(1) + add_gene_entries(sys.argv[1]) + + diff --git a/3rd-party-tools/build-indices/modify_gtf_marmoset.py b/3rd-party-tools/build-indices/modify_gtf_marmoset.py new file mode 100644 index 00000000..64fb5b41 --- /dev/null +++ b/3rd-party-tools/build-indices/modify_gtf_marmoset.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 + +import argparse +import gzip +import re + +def get_ref_source(input_gtf): + """Determine if the GTF is from RefSeq or GENCODE.""" + reference_source = "" + with open(input_gtf, 'r') as fh: + for line_num, line in enumerate(fh): + if ('gene_type' in line) or ('transcript_type' in line): + reference_source = "Gencode" + print('Gencode Reference') + break + elif ('gene_biotype' in line) or ('transcript_biotype' in line): + reference_source = "Refseq" + print('Refseq reference') + break + return reference_source + +def is_valid_chromosome_region(fields, species): + """ + Check if the feature is in a valid chromosome region. + Only filters PAR regions for human. + """ + if species != "human": + return True + + chromosome = fields[0] + if chromosome != "chrY": + return True + + start_pos = int(fields[3]) + # For human chrY, check if it's in the valid region (2752083-56887903) + return 2752083 <= start_pos < 56887903 + +def is_allowable_biotype(biotype, ref_source, features_dic=None): + """Check if biotype should be included based on reference source.""" + # Handle mitochondrial genes first + if features_dic and (features_dic.get('gene', '').startswith('MT-') or + features_dic.get('gene_name', '').startswith('MT-')): + return True + + if ref_source == "Refseq": + allowed_types = { + 'mRNA', + 'tRNA', + 'rRNA', + 'C_gene_segment', + 'V_gene_segment', + 'lnc_RNA', + } + else: # Gencode/Ensembl + allowed_types = { + 'protein_coding', + 'protein_coding_LoF', + 'lncRNA', + 'tRNA', + 'rRNA', + 'mRNA', + 'IG_C_gene', + 'IG_D_gene', + 'IG_J_gene', + 'IG_LV_gene', + 'IG_V_gene', + 'IG_V_pseudogene', + 'IG_J_pseudogene', + 'IG_C_pseudogene', + 'TR_C_gene', + 'TR_D_gene', + 'TR_J_gene', + 'TR_V_gene', + 'TR_V_pseudogene', + 'TR_J_pseudogene' + } + + # Handle cases where biotype is missing but it's an MT gene + if biotype is None and features_dic and features_dic.get('gene', '').startswith('MT-'): + return True + + return biotype in allowed_types + +def get_features(features): + """Parse GTF attributes into a dictionary.""" + features_dict = {} + key_value_pairs = features.split(';') + for pair in key_value_pairs: + key_value = pair.strip().split(' ', 1) + if len(key_value) == 2: + key, value = key_value + key = key.strip() + value = value.strip(' "') + if key: + features_dict[key] = value + elif len(key_value) == 1: + key = key_value[0].strip() + features_dict[key] = "" + return features_dict + +def modify_attr(features_dic): + """Modify attribute fields to version format.""" + for key in features_dic.copy(): + if key in ["exon_id", "gene_id", "transcript_id"]: + version_key = key.replace("_id", "_version") + if '.' in features_dic[key]: + features_dic[key], version = features_dic[key].split(".", 1) + else: + features_dic[key] = features_dic[key] + version = 0 + + if version_key not in features_dic: + features_dic[version_key] = version + if key == 'gene' and "gene_name" not in features_dic.keys(): + features_dic['gene_name'] = features_dic['gene'] + + modified_features = "; ".join([f'{key} "{value}"' for key, value in features_dic.items() if key and value is not None]) + return modified_features + +def get_gene_ids_Refseq(input_gtf, species): + """Collect gene IDs from RefSeq GTF that meet filtering criteria.""" + gene_ids = [] + with open(input_gtf, 'r') as input_file: + for line in input_file: + if not line.startswith('#'): + fields = [x.strip() for x in line.strip().split('\t')] + if not is_valid_chromosome_region(fields, species): + continue + + if fields[2] == 'transcript': + features = re.sub('"', '', line.strip().split('\t')[8].strip()) + features_dic = get_features(features) + + # Special handling for mitochondrial genes + if fields[0] == "chrM" or features_dic.get('gene', '').startswith('MT-'): + gene = features_dic['gene_id'].split('.', 1)[0] + if gene not in gene_ids: + gene_ids.append(gene) + continue + + biotype = features_dic.get('transcript_biotype') or features_dic.get('gene_biotype') + if biotype and is_allowable_biotype(biotype, "Refseq", features_dic): + gene = features_dic['gene_id'].split('.', 1)[0] + if gene not in gene_ids: + gene_ids.append(gene) + elif features_dic.get('gene', '').startswith('MT-'): + # Handle MT genes without biotype + gene = features_dic['gene_id'].split('.', 1)[0] + if gene not in gene_ids: + gene_ids.append(gene) + return gene_ids + +def get_gene_ids_Gencode(input_gtf, species): + """Collect gene IDs from GENCODE GTF that meet filtering criteria.""" + gene_ids = set() + with open(input_gtf, 'r') as input_file: + print("open successful") + for line in input_file: + if line.startswith('#'): + continue + fields = [x.strip() for x in line.strip().split('\t')] + + if not is_valid_chromosome_region(fields, species): + continue + + if fields[2] != 'transcript': + continue + + features = re.sub('"', '', line.strip().split('\t')[8].strip()) + features_dic = get_features(features) + + # Special handling for mitochondrial genes + if fields[0] == "chrM" or features_dic.get('gene', '').startswith('MT-'): + gene = features_dic['gene_id'] + if gene not in gene_ids: + gene_ids.add(gene) + continue + + gene_type = features_dic.get('gene_type') + transcript_type = features_dic.get('transcript_type') or features_dic.get('transcript_biotype') + + # Handle MT genes without biotype + if features_dic.get('gene', '').startswith('MT-'): + gene = features_dic['gene_id'] + if gene not in gene_ids: + gene_ids.add(gene) + continue + + if (gene_type and is_allowable_biotype(gene_type, "Gencode", features_dic) and + (transcript_type is None or is_allowable_biotype(transcript_type, "Gencode", features_dic))): + gene = features_dic['gene_id'] + if gene not in gene_ids: + gene_ids.add(gene) + + return list(gene_ids) + +def filter_gtf(input_gtf, output_gtf, species, gene_ids): + """Filter and write the GTF file.""" + with open(input_gtf, 'r') as input_file: + with open(output_gtf, 'w') as output_gtf: + for line in input_file: + if line.startswith("#"): + output_gtf.write(line) + continue + + fields = [x.strip() for x in line.strip().split("\t")] + + if not is_valid_chromosome_region(fields, species): + continue + + features = re.sub('"', '', line.strip().split('\t')[8].strip()) + if features.endswith(';'): + features = features[:-1] + features_dic = get_features(features) + + # For human, skip XGY2 gene explicitly + if species == "human" and features_dic.get('gene_id') == "ENSG00000290840": + continue + + if features_dic['gene_id'] in gene_ids: + modified_fields = fields.copy() + modified_fields[8] = modify_attr(features_dic) + output_gtf.write("{}".format("\t".join(modified_fields)+ "\n")) + +def main(): + """Main function to process GTF files.""" + parser = argparse.ArgumentParser(description="Filter GTF files based on biotypes and format specifications") + parser.add_argument( + "--input-gtf", + "-i", + dest="input_gtf", + required=True, + help="input GTF file" + ) + parser.add_argument( + "--output-gtf", + "-o", + dest="output_gtf", + required=True, + help="output GTF file" + ) + parser.add_argument( + "--species", + "-s", + dest="species", + required=True, + help="Species name (e.g., human, mouse, etc.)" + ) + args = parser.parse_args() + + print(f"Processing {args.species} GTF file...") + + print("Determining reference source...") + ref_source = get_ref_source(input_gtf=args.input_gtf) + + print("Collecting gene IDs...") + if ref_source == "Refseq": + gene_ids = get_gene_ids_Refseq(input_gtf=args.input_gtf, species=args.species) + elif ref_source == "Gencode": + gene_ids = get_gene_ids_Gencode(input_gtf=args.input_gtf, species=args.species) + else: + print("Error: The input GTF file format is not recognized. Must be either GENCODE or RefSeq format.") + exit(1) + + print(f"Found {len(gene_ids)} genes matching criteria") + print("Writing filtered GTF file...") + + filter_gtf(args.input_gtf, args.output_gtf, args.species, gene_ids) + print("Done! Filtered GTF file has been written to:", args.output_gtf) + +if __name__ == "__main__": + main() \ No newline at end of file