-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPreparing_files_for_Alevin_bash.sh
65 lines (35 loc) · 2.76 KB
/
Preparing_files_for_Alevin_bash.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
###############
# ====================================
# Author:Manu Singh 20 November 2020
# =====================================
# Set Enviornment variables as per the .bash_profiles on the system
# =====================================
#first we are dealing with TE coordinates of human genome
# the file name is "Human_class2_TE.bed"
#Human_class2_TE.bed file looks like following
#chr1 41379 42285 ERVL-E-int 1118.000000 - hg19_rmsk exon . gene_id "ERVL-E-int"; transcript_id "ERVL-E-int_dup3";
# these bed files are already curated based on the 3 step filtering process mentioned in manuscript
# this file was generated and further modified for the analysis of manuscripts viz. Nature 516 (7531), 405-409, Bioessays 38 (1), 109-117, Circulation 136 (19), 1824-1839, JCI insight 5 (7), Viruses 12 (11), 1303, bioRxiv, 318329
# the full bed files are available on request
#Here we are going to make a combined gtf file containing these TEs and gene from human hg19 sequences
# First step to create an ID for each sequences, which we could fetch later for the integrative kinds of analysis
cut -f 2 Human_class2_TE.bed -d ";" | cut -f3 -d " " | sed 's/"//g' > Human_class2_TE_transID.txt
## Here, we add the prefix "TELONG", so that we can fetch TEs from mixed data later
paste Human_class2_TE.bed Human_class2_TE_transID.txt | awk '{print $1,$2,$3,"TELONG-"$14,$5,$6}' OFS="\t" > Human_class2_TE_ID.bed
awk '{print $1,$2,$3,$4"-"$1"-"$2"-"$3,$5,$6}' OFS="\t" Human_class2_TE_ID.bed > Human_class2_TE_IDv2.bed
bedtools getfasta -fi hg19.fa -bed Human_class2_TE_IDv2.bed -s -name | awk '{ print toupper($0) }' | sed 's/(+)\|(-)//g' > Human_class2_TE_ID.fa
bedtools maskfasta -fi hg19.fa -bed Human_class2_TE_ID.bed -fo hg19_masked.fa
#above commands have given the fasta files of TEs , where the headers can be split to gain any information from TEs as given above
# Now let's deal with genes
#This is mart export file, downlaoded from UCSC, with following Genes coordinates
#file name is "mart_export.txt"
#ENSG00000229483 13 23743974 23744736 -1 LINC00362
awk 'NR>1{if($5=="1") print "chr"$2,$3,$4,$6"_"$1,$1,"+"; else print "chr"$2,$3,$4,$6"_"$1,$1,"-"}' OFS="\t" mart_export.txt > hg19_Genes.bed
bedtools getfasta -fi hg19_masked.fa -bed hg19_Genes.bed -s -name | awk '{ print toupper($0) }' | sed 's/(+)\|(-)//g' > hg19_Genes.fa
cat hg19_Genes.fa Human_class2_TE_ID.fa > hg19_Genes_long_TE_ID.fa
grep ">" hg19_Genes_long_TE_ID.fa | sed 's/>//g' | awk '{print $1,$1}' OFS="\t" > txp2gene.tsv
/programs/salmon-1.2.1/bin/salmon index -t hg19_Genes_long_TE_ID.fa -i Genes_TEindex -k 31
### The required files to run Alevin is here
/workdir/Manu/Homo_Genome/Genome/bed_files/Alevin/TE_long/Genes_TEindex
/workdir/Manu/Homo_Genome/Genome/bed_files/Alevin/TE_long/txp2gene.tsv
########