update parse_gtf to parse and classify UTR features in gencode annnot…

…ation types
comprna · Apr 10, 2024 · ed3826d · ed3826d
1 parent b80066c
commit ed3826d
Show file tree

Hide file tree

Showing 6 changed files with 274 additions and 19 deletions.
diff --git a/README.md b/README.md
@@ -166,16 +166,17 @@ R2Dtool is designed to work with tab-delimited, plain text BED-like files (optio
 - column 5, representing strand, is assumed as being (+)
 - columns 4, and any column greater than 5 are preserved during annotation or liftover, and may be used to hold additional information, e.g. predictions of RNA modification stoichiometry, probability, etc.
 
-The GTF annotation provided must contain identical gene structures used to generate the transcriptome, including identical transcript names in the FASTA header. One option is to use genomes, transcriptomes and gene structures from the same genome release. Another option is for users to generate their own transcriptome using a genome and gene structure file, e.g. using gffread.  
 
 # Gene annotation requirements 
 
 R2Dtool is designed to work with GTF version 2 annotations
 
-- Compatible feature types (col3) include 'transcript'/'mRNA', 'exon', 'CDS', 'five_prime_utr'/'5UTR' and 'three_prime_utr'/'3UTR' 
+- Compatible feature types (col3) include 'transcript'/'mRNA', 'exon', 'CDS', 'five_prime_utr'/'5UTR'/'UTR' and 'three_prime_utr'/'3UTR'/'UTR'
 - 'exon', 'transcript', and 'UTR' feature types are __required__ for liftover and annotation 
 - Column 9 should contain 'transcript_id', 'gene_id', 'gene_name', and some variety of biotype attribute (e.g. 'transcript_biotype', 'transcript_type', 'gene_type', or 'gene_biotype')
 
+The GTF annotation provided must contain identical gene structures used to generate the transcriptome, including identical transcript names in the FASTA header. One option is to use genomes, transcriptomes and gene structures from the same genome release. Another option is for users to generate their own transcriptome using a genome and gene structure file, e.g. using gffread.  
+
 # Utilities
 
 ### Convert CHEUI model II output to a BED-like input

diff --git a/benchmark.sh b/benchmark.sh
@@ -157,13 +157,14 @@ cargo build --release
 ##########
 
 # compile from source and run liftover
-cd R2Dtool 
+cd ~/R2Dtool 
 cargo build --release 
 export PATH="$PATH:$(pwd)/target/release/"
 
 # liftover transcriptomic sites to genomic coordinates
 mkdir ./test/outputs/ 2>/dev/null
 time r2d liftover -H -g ./test/GRCm39_subset.gtf -i ./test/out_CHEUI_modelII.bed > ./test/outputs/liftover.bed
+head ./test/outputs/liftover.bed
 
 # make a 6-col bedfile without header for bedtools
 tail -n +2 ./test/outputs/liftover.bed | cut -f1-6 > ./test/outputs/liftover.bed6
@@ -198,4 +199,16 @@ time r2d annotate -H -g ./test/GRCm39_subset.gtf -i ./test/out_CHEUI_modelII.bed
 grep -A 20 -B 20 "ENSMUST00000124002.2" ./test/outputs/annotate_all.txt | more
 
 # no gene info for 
-ENSMUST00000124002.2
+ENSMUST00000124002.2
+
+
+##########
+
+# test annotation parsing for GENCODE gtf files
+mkdir ~/R2Dtool/test/temp
+export annotation="/g/data/lf10/as7425/genomes/human_genome/gencode/gencode_v38/gencode.v38.annotation.gtf"
+cat $annotation | grep "ENST00000579823.1" > ~/R2Dtool/test/temp/gecode_test.gtf
+
+# see how the transcript is parsed 
+cd ~/R2Dtool 
+cargo test test_print_all_transcript_info -- --nocapture 
diff --git a/manuscript_demo/process_data.sh b/manuscript_demo/process_data.sh
@@ -1,17 +1,26 @@
 #!/bin/bash
 
 # Written by AJ Sethi on 2022-09-08
-# Last modified on ...
+# Last modified on 2024-04-09 using R2Dtool v2.0.0 
 
 ##################################################
 
-# add files to environment
-export methylationCalls="/g/data/xc17/pm1122/xpore_hek293/results/HEK293T-WT_CHEUI_predictions_site_level_WT.txt"
+# R2Dtool rust implementation 
+export PATH="${PATH}:/home/150/as7425/R2Dtool/target/release/"
+
+# Input data corresponds to HEK293 m6A predictions, calculated against the gencode V38 transcriptome 
+export methylation_calls="/g/data/xc17/pm1122/xpore_hek293/results/HEK293T-WT_CHEUI_predictions_site_level_WT.txt"
 export annotation="/g/data/lf10/as7425/genomes/human_genome/gencode/gencode_v38/gencode.v38.annotation.gtf"
-export wd="/scratch/lf10/as7425/R2_example"; mkdir -p ${wd}
+export wd="/g/data/lf10/as7425/R2DTool_demo/"; mkdir -p ${wd}
 
 # convert cheui data to bed-like
-bash ~/R2Dtool/scripts/cheui_to_bed.sh ${methylationCalls} "${wd}/methylationCalls.bed"
+bash ~/R2Dtool/scripts/cheui_to_bed.sh ${methylation_calls} "${wd}/methylation_calls.bed"
+
+# annotate the methylation calls against gencode v38 
+r2d annotate -i "${wd}/methylation_calls.bed" -g ${annotation} -H > "${wd}/methylation_calls_annotated.bed"
+
+# liftover the annotated called to genomic coordinates 
+r2d liftover -i "${wd}/methylation_calls.bed" -g ${annotation} -H > "${wd}/methylation_calls_annotated.bed"
 
 # annotate methylation calls using Gencode v38
 time Rscript ~/R2Dtool/scripts/R2_annotate.R "${wd}/methylationCalls.bed" "${annotation}" "${wd}/methylationCalls_annotated.bed"

diff --git a/src/annotate.rs b/src/annotate.rs
@@ -99,7 +99,7 @@ fn splice_site_distances(tx_coord: u64, splice_sites: &[SpliceSite]) -> (Option<
 
 pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(), Box<dyn Error>> {
 
-    eprintln!("Running the annotate functionality...");
+    // eprintln!("Running the annotate functionality...");
 
     let gtf_file: String = matches.get_one::<String>("gtf").unwrap().to_string();
     let input_file: String = matches.get_one::<String>("input").unwrap().to_string();
@@ -213,11 +213,11 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(),
 }
 
 pub fn preview_annotations(annotations: &HashMap<String, Transcript>) {
-    println!("Number of annotations: {}", annotations.len()); // Add this line
+    eprintln!("Number of annotations: {}", annotations.len()); 
     for (key, transcript) in annotations {
-        println!("Annotations start");
-        println!("Transcript ID: {}", key);
-        println!("{:#?}", transcript);
-        println!("Annotations end");
+        eprintln!("Annotations start");
+        eprintln!("Transcript ID: {}", key);
+        eprintln!("{:#?}", transcript);
+        eprintln!("Annotations end");
     }
 }