MoTrPAC · archanaraja · Oct 12, 2023 · Oct 12, 2023 · Oct 12, 2023 · Oct 12, 2023
diff --git a/README.md b/README.md
@@ -297,6 +297,7 @@ elif [[ "${GENOME}" == "motrpac_rn6" ]]; then
   TSS="http://mitra.stanford.edu/montgomery/projects/motrpac/atac/SCG/motrpac_references/rn6_release96/Rattus_norvegicus.Rnor_6.0.96_protein_coding.tss.bed.gz"
   BLACKLIST=
 ```
+**NOTE:** The TSS reference file was generated from the Ensembl GTF using [this script](src/tss_from_gtf.py).
 
 Now run the script to build the custom genome database. Specify a destination directory and install the MoTrPAC rn6 reference with the following command. We recommend not to run this installer on a login node of your cluster. It will take >8GB memory and >2h time. 
 ```bash

diff --git a/src/tss_from_gtf.py b/src/tss_from_gtf.py
@@ -0,0 +1,54 @@
+"""
+Extract transcription start sites from an Ensembl GTF. Used to create the TSS reference file used in 
+~/ATAC_PIPELINE/atac-seq-pipeline/scripts/build_genome_data.sh
+
+Usage: 
+    python tss_from_gtf.py <gtf.gz> <tss_outfile.gz>
+"""
+import gzip
+import re
+import sys
+
+in_gtf = sys.argv[1]
+out_tss = sys.argv[2]
+
+# make TSS annotation file from GTF
+# chr1	69089	69090	ENSG00000186092.4	0	+
+# chr1    HAVANA  gene    11869   14409   .       +       .       gene_id "ENSG00000223972.5";
+
+with gzip.open(in_gtf, 'rb') as gtf, gzip.open(out_tss, 'wb') as out:
+
+	for line in gtf:
+
+		if line.startswith('#'):
+			continue
+
+		l = line.strip().split('\t')
+
+		if not l[2] == 'gene':
+			continue
+
+		strand = l[6]
+
+		if strand == '+':
+			tss = l[3]
+		elif strand == '-':
+			tss = l[4]
+		else:
+			print 'Strand not recognized'
+			break
+
+		chrom = l[0]
+
+		gene_id = l[8].split(';')[0].split(' ')[1]
+		gene_id = re.sub("\"", "", gene_id)
+
+		gene_type = l[8].split(';')[2].split(' ')[1]
+		if not gene_type == 'protein_coding':
+			continue
+
+		end = tss
+		start = int(tss) - 1
+
+		out.write('\t'.join([chrom, str(start), end, gene_id, '0', strand])+'\n')
+