Skip to content

Commit

Permalink
added new scripts to BuildIndices docker for marmoset (#155)
Browse files Browse the repository at this point in the history
Added scripts for handling custom marmoset GTF
  • Loading branch information
ekiernan authored Jan 17, 2025
1 parent 05157c7 commit 9b82dec
Show file tree
Hide file tree
Showing 3 changed files with 367 additions and 0 deletions.
3 changes: 3 additions & 0 deletions 3rd-party-tools/build-indices/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ ENV PATH /usr/local/STAR-${STAR_VERSION}/bin/Linux_x86_64_static/:$PATH
WORKDIR /script
COPY add-introns-to-gtf.py .
COPY modify_gtf.py .
COPY modify_gtf_marmoset.py .
COPY create_marmoset_header_mt_genes.py .

# Add all scripts to PATH
ENV PATH /script/:$PATH

Expand Down
92 changes: 92 additions & 0 deletions 3rd-party-tools/build-indices/create_marmoset_header_mt_genes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import sys

def add_gene_entries(gtf_file):

genes = {}

# First collect ALL features for each gene
for line in open(gtf_file):
fields = line.strip().split('\t')
if len(fields) != 9:
continue

attrs = {}
for attr in fields[8].split('; '):
if attr:
key, value = attr.split(' ', 1)
attrs[key] = value.strip('"')

if 'gene_id' in attrs and attrs['gene_id'].startswith('MT-'):
gene_id = attrs['gene_id']
# Track coordinates regardless of feature type
if gene_id not in genes:
genes[gene_id] = {
'chrom': fields[0],
'source': fields[1],
'min_start': int(fields[3]),
'max_end': int(fields[4]),
'strand': fields[6],
'attrs': attrs,
'features': set([fields[2]]) # Track feature types
}
else:
genes[gene_id]['min_start'] = min(genes[gene_id]['min_start'], int(fields[3]))
genes[gene_id]['max_end'] = max(genes[gene_id]['max_end'], int(fields[4]))
genes[gene_id]['features'].add(fields[2])

gene_entries = []
for gene_id, info in sorted(genes.items()):
attrs_str = f'gene_id "{gene_id}"; '
if 'gene' in info['attrs']:
attrs_str += f'gene "{info["attrs"]["gene"]}"; '
if 'transcript_biotype' in info['attrs']:
attrs_str += f'transcript_biotype "{info["attrs"]["transcript_biotype"]}"; '
if 'gene_version' in info['attrs']:
attrs_str += f'gene_version "{info["attrs"]["gene_version"]}"; '
if 'gene_name' in info['attrs']:
attrs_str += f'gene_name "{info["attrs"]["gene_name"]}"'

gene_entry = [
info['chrom'],
info['source'],
'gene',
str(info['min_start']),
str(info['max_end']),
'.',
info['strand'],
'.',
attrs_str
]
if gene_entry[-1].endswith(';"; '):
gene_entry[-1] = gene_entry[-1][:-3]
gene_entries.append('\t'.join(gene_entry))




with open(gtf_file) as f:

# First print the new header
new_header = """#gtf-version 2.2
#!genome-build mCalJa1.2.pat.X
#!genome-build-accession NCBI_Assembly:GCF_011100555.1
#!annotation-date 03/02/2023
#!annotation-source NCBI RefSeq GCF_011100555.1-RS_2023_03
##gff-version 2
##source-version rtracklayer 1.58.0
##date 2023-09-11"""
print(new_header)

for line in f:
if not line.startswith("#"):
print(line.strip())
for entry in gene_entries:
print(entry)

if __name__ == '__main__':
if len(sys.argv) != 2:
print("Usage: python script.py input.gtf")
sys.exit(1)
add_gene_entries(sys.argv[1])


272 changes: 272 additions & 0 deletions 3rd-party-tools/build-indices/modify_gtf_marmoset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
#!/usr/bin/env python3

import argparse
import gzip
import re

def get_ref_source(input_gtf):
"""Determine if the GTF is from RefSeq or GENCODE."""
reference_source = ""
with open(input_gtf, 'r') as fh:
for line_num, line in enumerate(fh):
if ('gene_type' in line) or ('transcript_type' in line):
reference_source = "Gencode"
print('Gencode Reference')
break
elif ('gene_biotype' in line) or ('transcript_biotype' in line):
reference_source = "Refseq"
print('Refseq reference')
break
return reference_source

def is_valid_chromosome_region(fields, species):
"""
Check if the feature is in a valid chromosome region.
Only filters PAR regions for human.
"""
if species != "human":
return True

chromosome = fields[0]
if chromosome != "chrY":
return True

start_pos = int(fields[3])
# For human chrY, check if it's in the valid region (2752083-56887903)
return 2752083 <= start_pos < 56887903

def is_allowable_biotype(biotype, ref_source, features_dic=None):
"""Check if biotype should be included based on reference source."""
# Handle mitochondrial genes first
if features_dic and (features_dic.get('gene', '').startswith('MT-') or
features_dic.get('gene_name', '').startswith('MT-')):
return True

if ref_source == "Refseq":
allowed_types = {
'mRNA',
'tRNA',
'rRNA',
'C_gene_segment',
'V_gene_segment',
'lnc_RNA',
}
else: # Gencode/Ensembl
allowed_types = {
'protein_coding',
'protein_coding_LoF',
'lncRNA',
'tRNA',
'rRNA',
'mRNA',
'IG_C_gene',
'IG_D_gene',
'IG_J_gene',
'IG_LV_gene',
'IG_V_gene',
'IG_V_pseudogene',
'IG_J_pseudogene',
'IG_C_pseudogene',
'TR_C_gene',
'TR_D_gene',
'TR_J_gene',
'TR_V_gene',
'TR_V_pseudogene',
'TR_J_pseudogene'
}

# Handle cases where biotype is missing but it's an MT gene
if biotype is None and features_dic and features_dic.get('gene', '').startswith('MT-'):
return True

return biotype in allowed_types

def get_features(features):
"""Parse GTF attributes into a dictionary."""
features_dict = {}
key_value_pairs = features.split(';')
for pair in key_value_pairs:
key_value = pair.strip().split(' ', 1)
if len(key_value) == 2:
key, value = key_value
key = key.strip()
value = value.strip(' "')
if key:
features_dict[key] = value
elif len(key_value) == 1:
key = key_value[0].strip()
features_dict[key] = ""
return features_dict

def modify_attr(features_dic):
"""Modify attribute fields to version format."""
for key in features_dic.copy():
if key in ["exon_id", "gene_id", "transcript_id"]:
version_key = key.replace("_id", "_version")
if '.' in features_dic[key]:
features_dic[key], version = features_dic[key].split(".", 1)
else:
features_dic[key] = features_dic[key]
version = 0

if version_key not in features_dic:
features_dic[version_key] = version
if key == 'gene' and "gene_name" not in features_dic.keys():
features_dic['gene_name'] = features_dic['gene']

modified_features = "; ".join([f'{key} "{value}"' for key, value in features_dic.items() if key and value is not None])
return modified_features

def get_gene_ids_Refseq(input_gtf, species):
"""Collect gene IDs from RefSeq GTF that meet filtering criteria."""
gene_ids = []
with open(input_gtf, 'r') as input_file:
for line in input_file:
if not line.startswith('#'):
fields = [x.strip() for x in line.strip().split('\t')]
if not is_valid_chromosome_region(fields, species):
continue

if fields[2] == 'transcript':
features = re.sub('"', '', line.strip().split('\t')[8].strip())
features_dic = get_features(features)

# Special handling for mitochondrial genes
if fields[0] == "chrM" or features_dic.get('gene', '').startswith('MT-'):
gene = features_dic['gene_id'].split('.', 1)[0]
if gene not in gene_ids:
gene_ids.append(gene)
continue

biotype = features_dic.get('transcript_biotype') or features_dic.get('gene_biotype')
if biotype and is_allowable_biotype(biotype, "Refseq", features_dic):
gene = features_dic['gene_id'].split('.', 1)[0]
if gene not in gene_ids:
gene_ids.append(gene)
elif features_dic.get('gene', '').startswith('MT-'):
# Handle MT genes without biotype
gene = features_dic['gene_id'].split('.', 1)[0]
if gene not in gene_ids:
gene_ids.append(gene)
return gene_ids

def get_gene_ids_Gencode(input_gtf, species):
"""Collect gene IDs from GENCODE GTF that meet filtering criteria."""
gene_ids = set()
with open(input_gtf, 'r') as input_file:
print("open successful")
for line in input_file:
if line.startswith('#'):
continue
fields = [x.strip() for x in line.strip().split('\t')]

if not is_valid_chromosome_region(fields, species):
continue

if fields[2] != 'transcript':
continue

features = re.sub('"', '', line.strip().split('\t')[8].strip())
features_dic = get_features(features)

# Special handling for mitochondrial genes
if fields[0] == "chrM" or features_dic.get('gene', '').startswith('MT-'):
gene = features_dic['gene_id']
if gene not in gene_ids:
gene_ids.add(gene)
continue

gene_type = features_dic.get('gene_type')
transcript_type = features_dic.get('transcript_type') or features_dic.get('transcript_biotype')

# Handle MT genes without biotype
if features_dic.get('gene', '').startswith('MT-'):
gene = features_dic['gene_id']
if gene not in gene_ids:
gene_ids.add(gene)
continue

if (gene_type and is_allowable_biotype(gene_type, "Gencode", features_dic) and
(transcript_type is None or is_allowable_biotype(transcript_type, "Gencode", features_dic))):
gene = features_dic['gene_id']
if gene not in gene_ids:
gene_ids.add(gene)

return list(gene_ids)

def filter_gtf(input_gtf, output_gtf, species, gene_ids):
"""Filter and write the GTF file."""
with open(input_gtf, 'r') as input_file:
with open(output_gtf, 'w') as output_gtf:
for line in input_file:
if line.startswith("#"):
output_gtf.write(line)
continue

fields = [x.strip() for x in line.strip().split("\t")]

if not is_valid_chromosome_region(fields, species):
continue

features = re.sub('"', '', line.strip().split('\t')[8].strip())
if features.endswith(';'):
features = features[:-1]
features_dic = get_features(features)

# For human, skip XGY2 gene explicitly
if species == "human" and features_dic.get('gene_id') == "ENSG00000290840":
continue

if features_dic['gene_id'] in gene_ids:
modified_fields = fields.copy()
modified_fields[8] = modify_attr(features_dic)
output_gtf.write("{}".format("\t".join(modified_fields)+ "\n"))

def main():
"""Main function to process GTF files."""
parser = argparse.ArgumentParser(description="Filter GTF files based on biotypes and format specifications")
parser.add_argument(
"--input-gtf",
"-i",
dest="input_gtf",
required=True,
help="input GTF file"
)
parser.add_argument(
"--output-gtf",
"-o",
dest="output_gtf",
required=True,
help="output GTF file"
)
parser.add_argument(
"--species",
"-s",
dest="species",
required=True,
help="Species name (e.g., human, mouse, etc.)"
)
args = parser.parse_args()

print(f"Processing {args.species} GTF file...")

print("Determining reference source...")
ref_source = get_ref_source(input_gtf=args.input_gtf)

print("Collecting gene IDs...")
if ref_source == "Refseq":
gene_ids = get_gene_ids_Refseq(input_gtf=args.input_gtf, species=args.species)
elif ref_source == "Gencode":
gene_ids = get_gene_ids_Gencode(input_gtf=args.input_gtf, species=args.species)
else:
print("Error: The input GTF file format is not recognized. Must be either GENCODE or RefSeq format.")
exit(1)

print(f"Found {len(gene_ids)} genes matching criteria")
print("Writing filtered GTF file...")

filter_gtf(args.input_gtf, args.output_gtf, args.species, gene_ids)
print("Done! Filtered GTF file has been written to:", args.output_gtf)

if __name__ == "__main__":
main()

0 comments on commit 9b82dec

Please sign in to comment.