added new scripts to BuildIndices docker for marmoset (#155)

Added scripts for handling custom marmoset GTF
broadinstitute · Jan 17, 2025 · 9b82dec · 9b82dec
1 parent 05157c7
commit 9b82dec
Show file tree

Hide file tree

Showing 3 changed files with 367 additions and 0 deletions.
diff --git a/3rd-party-tools/build-indices/Dockerfile b/3rd-party-tools/build-indices/Dockerfile
@@ -36,6 +36,9 @@ ENV PATH /usr/local/STAR-${STAR_VERSION}/bin/Linux_x86_64_static/:$PATH
 WORKDIR /script
 COPY add-introns-to-gtf.py .
 COPY modify_gtf.py .
+COPY modify_gtf_marmoset.py .
+COPY create_marmoset_header_mt_genes.py .
+
 # Add all scripts to PATH
 ENV PATH /script/:$PATH
 

diff --git a/3rd-party-tools/build-indices/create_marmoset_header_mt_genes.py b/3rd-party-tools/build-indices/create_marmoset_header_mt_genes.py
@@ -0,0 +1,92 @@
+import sys
+
+def add_gene_entries(gtf_file):
+
+    genes = {}
+
+    # First collect ALL features for each gene
+    for line in open(gtf_file):
+        fields = line.strip().split('\t')
+        if len(fields) != 9:
+            continue
+
+        attrs = {}
+        for attr in fields[8].split('; '):
+            if attr:
+                key, value = attr.split(' ', 1)
+                attrs[key] = value.strip('"')
+
+        if 'gene_id' in attrs and attrs['gene_id'].startswith('MT-'):
+            gene_id = attrs['gene_id']
+            # Track coordinates regardless of feature type
+            if gene_id not in genes:
+                genes[gene_id] = {
+                    'chrom': fields[0],
+                    'source': fields[1],
+                    'min_start': int(fields[3]),
+                    'max_end': int(fields[4]),
+                    'strand': fields[6],
+                    'attrs': attrs,
+                    'features': set([fields[2]])  # Track feature types
+                }
+            else:
+                genes[gene_id]['min_start'] = min(genes[gene_id]['min_start'], int(fields[3]))
+                genes[gene_id]['max_end'] = max(genes[gene_id]['max_end'], int(fields[4]))
+                genes[gene_id]['features'].add(fields[2])
+
+    gene_entries = []
+    for gene_id, info in sorted(genes.items()):
+        attrs_str = f'gene_id "{gene_id}"; '
+        if 'gene' in info['attrs']:
+            attrs_str += f'gene "{info["attrs"]["gene"]}"; '
+        if 'transcript_biotype' in info['attrs']:
+            attrs_str += f'transcript_biotype "{info["attrs"]["transcript_biotype"]}"; '
+        if 'gene_version' in info['attrs']:
+            attrs_str += f'gene_version "{info["attrs"]["gene_version"]}"; '
+        if 'gene_name' in info['attrs']:
+            attrs_str += f'gene_name "{info["attrs"]["gene_name"]}"'
+
+        gene_entry = [
+            info['chrom'],
+            info['source'],
+            'gene',
+            str(info['min_start']),
+            str(info['max_end']),
+            '.',
+            info['strand'],
+            '.',
+            attrs_str
+        ]
+        if gene_entry[-1].endswith(';"; '):
+            gene_entry[-1] = gene_entry[-1][:-3]
+        gene_entries.append('\t'.join(gene_entry))
+
+
+
+
+    with open(gtf_file) as f:
+
+        # First print the new header
+        new_header = """#gtf-version 2.2
+#!genome-build mCalJa1.2.pat.X
+#!genome-build-accession NCBI_Assembly:GCF_011100555.1
+#!annotation-date 03/02/2023
+#!annotation-source NCBI RefSeq GCF_011100555.1-RS_2023_03
+##gff-version 2
+##source-version rtracklayer 1.58.0
+##date 2023-09-11"""
+        print(new_header)
+
+        for line in f:
+            if not line.startswith("#"):
+                print(line.strip())
+    for entry in gene_entries:
+        print(entry)
+
+if __name__ == '__main__':
+    if len(sys.argv) != 2:
+        print("Usage: python script.py input.gtf")
+        sys.exit(1)
+    add_gene_entries(sys.argv[1])
+
+
diff --git a/3rd-party-tools/build-indices/modify_gtf_marmoset.py b/3rd-party-tools/build-indices/modify_gtf_marmoset.py
@@ -0,0 +1,272 @@
+#!/usr/bin/env python3
+
+import argparse
+import gzip
+import re
+
+def get_ref_source(input_gtf):
+    """Determine if the GTF is from RefSeq or GENCODE."""
+    reference_source = ""
+    with open(input_gtf, 'r') as fh:
+        for line_num, line in enumerate(fh):
+            if ('gene_type' in line) or ('transcript_type' in line):
+                reference_source = "Gencode"
+                print('Gencode Reference')
+                break
+            elif ('gene_biotype' in line) or ('transcript_biotype' in line):
+                reference_source = "Refseq"
+                print('Refseq reference')
+                break
+    return reference_source 
+
+def is_valid_chromosome_region(fields, species):
+    """
+    Check if the feature is in a valid chromosome region.
+    Only filters PAR regions for human.
+    """
+    if species != "human":
+        return True
+
+    chromosome = fields[0]
+    if chromosome != "chrY":
+        return True
+
+    start_pos = int(fields[3])
+    # For human chrY, check if it's in the valid region (2752083-56887903)
+    return 2752083 <= start_pos < 56887903
+
+def is_allowable_biotype(biotype, ref_source, features_dic=None):
+    """Check if biotype should be included based on reference source."""
+    # Handle mitochondrial genes first
+    if features_dic and (features_dic.get('gene', '').startswith('MT-') or 
+                        features_dic.get('gene_name', '').startswith('MT-')):
+        return True
+
+    if ref_source == "Refseq":
+        allowed_types = {
+            'mRNA',           
+            'tRNA',
+            'rRNA',
+            'C_gene_segment',
+            'V_gene_segment',
+            'lnc_RNA',      
+        }
+    else:  # Gencode/Ensembl
+        allowed_types = {
+            'protein_coding',
+            'protein_coding_LoF',
+            'lncRNA',
+            'tRNA',
+            'rRNA',
+            'mRNA',
+            'IG_C_gene',
+            'IG_D_gene',
+            'IG_J_gene',
+            'IG_LV_gene',
+            'IG_V_gene',
+            'IG_V_pseudogene',
+            'IG_J_pseudogene',
+            'IG_C_pseudogene',
+            'TR_C_gene',
+            'TR_D_gene',
+            'TR_J_gene',
+            'TR_V_gene',
+            'TR_V_pseudogene',
+            'TR_J_pseudogene'
+        }
+
+    # Handle cases where biotype is missing but it's an MT gene
+    if biotype is None and features_dic and features_dic.get('gene', '').startswith('MT-'):
+        return True
+
+    return biotype in allowed_types
+
+def get_features(features):
+    """Parse GTF attributes into a dictionary."""
+    features_dict = {}
+    key_value_pairs = features.split(';')
+    for pair in key_value_pairs:
+        key_value = pair.strip().split(' ', 1)
+        if len(key_value) == 2:
+            key, value = key_value
+            key = key.strip()
+            value = value.strip(' "')
+            if key:
+                features_dict[key] = value
+        elif len(key_value) == 1:
+            key = key_value[0].strip()
+            features_dict[key] = ""
+    return features_dict
+
+def modify_attr(features_dic):
+    """Modify attribute fields to version format."""
+    for key in features_dic.copy():
+        if key in ["exon_id", "gene_id", "transcript_id"]:
+            version_key = key.replace("_id", "_version")
+            if '.' in features_dic[key]:
+                features_dic[key], version = features_dic[key].split(".", 1)
+            else:
+                features_dic[key] = features_dic[key]
+                version = 0
+
+            if version_key not in features_dic:
+                features_dic[version_key] = version
+        if key == 'gene' and "gene_name" not in features_dic.keys():
+            features_dic['gene_name'] = features_dic['gene']
+
+    modified_features = "; ".join([f'{key} "{value}"' for key, value in features_dic.items() if key and value is not None])
+    return modified_features
+
+def get_gene_ids_Refseq(input_gtf, species):
+    """Collect gene IDs from RefSeq GTF that meet filtering criteria."""
+    gene_ids = []
+    with open(input_gtf, 'r') as input_file:
+        for line in input_file:
+            if not line.startswith('#'):
+                fields = [x.strip() for x in line.strip().split('\t')]
+                if not is_valid_chromosome_region(fields, species):
+                    continue
+
+                if fields[2] == 'transcript':
+                    features = re.sub('"', '', line.strip().split('\t')[8].strip())
+                    features_dic = get_features(features)
+
+                    # Special handling for mitochondrial genes
+                    if fields[0] == "chrM" or features_dic.get('gene', '').startswith('MT-'):
+                        gene = features_dic['gene_id'].split('.', 1)[0]
+                        if gene not in gene_ids:
+                            gene_ids.append(gene)
+                        continue
+
+                    biotype = features_dic.get('transcript_biotype') or features_dic.get('gene_biotype')
+                    if biotype and is_allowable_biotype(biotype, "Refseq", features_dic):
+                        gene = features_dic['gene_id'].split('.', 1)[0]
+                        if gene not in gene_ids:
+                            gene_ids.append(gene)
+                    elif features_dic.get('gene', '').startswith('MT-'):
+                        # Handle MT genes without biotype
+                        gene = features_dic['gene_id'].split('.', 1)[0]
+                        if gene not in gene_ids:
+                            gene_ids.append(gene)
+    return gene_ids
+
+def get_gene_ids_Gencode(input_gtf, species):
+    """Collect gene IDs from GENCODE GTF that meet filtering criteria."""
+    gene_ids = set()
+    with open(input_gtf, 'r') as input_file:
+        print("open successful")
+        for line in input_file:
+            if line.startswith('#'):
+                continue
+            fields = [x.strip() for x in line.strip().split('\t')]
+
+            if not is_valid_chromosome_region(fields, species):
+                continue
+
+            if fields[2] != 'transcript':
+                continue
+
+            features = re.sub('"', '', line.strip().split('\t')[8].strip())
+            features_dic = get_features(features)
+
+            # Special handling for mitochondrial genes
+            if fields[0] == "chrM" or features_dic.get('gene', '').startswith('MT-'):
+                gene = features_dic['gene_id']
+                if gene not in gene_ids:
+                    gene_ids.add(gene)
+                continue
+
+            gene_type = features_dic.get('gene_type')
+            transcript_type = features_dic.get('transcript_type') or features_dic.get('transcript_biotype')
+
+            # Handle MT genes without biotype
+            if features_dic.get('gene', '').startswith('MT-'):
+                gene = features_dic['gene_id']
+                if gene not in gene_ids:
+                    gene_ids.add(gene)
+                continue
+
+            if (gene_type and is_allowable_biotype(gene_type, "Gencode", features_dic) and
+                (transcript_type is None or is_allowable_biotype(transcript_type, "Gencode", features_dic))):
+                gene = features_dic['gene_id']
+                if gene not in gene_ids:
+                    gene_ids.add(gene)
+
+    return list(gene_ids)
+
+def filter_gtf(input_gtf, output_gtf, species, gene_ids):
+    """Filter and write the GTF file."""
+    with open(input_gtf, 'r') as input_file:
+        with open(output_gtf, 'w') as output_gtf:
+            for line in input_file:
+                if line.startswith("#"):
+                    output_gtf.write(line)
+                    continue
+
+                fields = [x.strip() for x in line.strip().split("\t")]
+
+                if not is_valid_chromosome_region(fields, species):
+                    continue
+
+                features = re.sub('"', '', line.strip().split('\t')[8].strip())
+                if features.endswith(';'):
+                    features = features[:-1]                     
+                features_dic = get_features(features)
+
+                # For human, skip XGY2 gene explicitly
+                if species == "human" and features_dic.get('gene_id') == "ENSG00000290840":
+                    continue
+
+                if features_dic['gene_id'] in gene_ids:
+                    modified_fields = fields.copy()
+                    modified_fields[8] = modify_attr(features_dic)
+                    output_gtf.write("{}".format("\t".join(modified_fields)+ "\n"))
+
+def main():
+    """Main function to process GTF files."""
+    parser = argparse.ArgumentParser(description="Filter GTF files based on biotypes and format specifications")
+    parser.add_argument(
+        "--input-gtf",
+        "-i",
+        dest="input_gtf",
+        required=True,
+        help="input GTF file"
+    )
+    parser.add_argument(
+        "--output-gtf",
+        "-o",
+        dest="output_gtf",
+        required=True,
+        help="output GTF file"
+    )
+    parser.add_argument(
+        "--species",
+        "-s",
+        dest="species",
+        required=True,
+        help="Species name (e.g., human, mouse, etc.)"
+    )
+    args = parser.parse_args()
+
+    print(f"Processing {args.species} GTF file...")
+
+    print("Determining reference source...")
+    ref_source = get_ref_source(input_gtf=args.input_gtf)
+
+    print("Collecting gene IDs...")
+    if ref_source == "Refseq":
+        gene_ids = get_gene_ids_Refseq(input_gtf=args.input_gtf, species=args.species)
+    elif ref_source == "Gencode":
+        gene_ids = get_gene_ids_Gencode(input_gtf=args.input_gtf, species=args.species)
+    else:
+        print("Error: The input GTF file format is not recognized. Must be either GENCODE or RefSeq format.")
+        exit(1)
+
+    print(f"Found {len(gene_ids)} genes matching criteria")
+    print("Writing filtered GTF file...")
+
+    filter_gtf(args.input_gtf, args.output_gtf, args.species, gene_ids)
+    print("Done! Filtered GTF file has been written to:", args.output_gtf)
+
+if __name__ == "__main__":
+    main()