diff --git a/Dockerfile b/Dockerfile index 6a74d2d..c2de092 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,14 +1,14 @@ -FROM ubuntu:latest as bioinformatics_base +FROM ubuntu:20.04 as bioinformatics_base #===============================# # Docker Image Configuration # #===============================# -LABEL vendor="Englander Institute for Precision Medicine" \ - description="ERVmap" \ - maintainer="ans2077@med.cornell.edu" \ - base_image="ubuntu" \ - base_image_version="latest" \ - base_image_SHA256="sha256:fc04b2781f41f76c5b126ec26c0c0c26c7fc047318347a2112856253a88bb01d" +LABEL org.opencontainers.image.source='https://github.com/eipm/ERVmap' \ + vendor="Englander Institute for Precision Medicine" \ + description="ERVmap" \ + maintainer="ans2077@med.cornell.edu" \ + base_image="ubuntu" \ + base_image_version="20.04" ENV APP_NAME="ERVmap" \ TZ='US/Eastern' \ @@ -19,7 +19,7 @@ RUN apt-get update \ && apt-get upgrade -y --fix-missing \ && apt-get install build-essential -y \ && apt-get install -y \ - vim \ + vim \ emacs \ bedtools \ wget \ @@ -58,7 +58,6 @@ RUN wget -O STAR-${STAR_VERSION}.tar.gz https://github.com/alexdobin/STAR/archiv && make STAR RUN ln -s ${star_dir}/source/STAR /usr/local/bin/ - #===========================# # Production layer # #===========================# diff --git a/README.md b/README.md index ae227d1..7f09a5f 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,30 @@ # **ERVmap** + ERVmap is one part curated database of human proviral ERV loci and one part a stringent algorithm to determine which ERVs are transcribed in their RNA seq data. -## Citation +[](https://github.com/eipm/ERVmap/actions) [](https://github.com/eipm/ERVmap) [](https://hub.docker.com/repository/docker/eipm/ervmap) [](https://github.com/orgs/eipm/packages/container/package/ervmap) + +## Citation + Tokuyama M. et. al., ERVmap analysis reveals genome-wide transcription of human endogenous retroviruses. Proc Natl Acad Sci USA 2018 Dec 11;115(50):12565-12572. 
[doi: 10.1073/pnas.1814589115](https://doi.org/10.1073/pnas.1814589115). ## **How to use it** ### Install + This version of the tool consists of 2 steps: 1. alignment to the human genome (GRCh38) and 2. quantification of the ERV regions. To download and install ERVmap latest version provided as docker image, simply type: -``` + +```bash docker pull eipm/ervmap:latest ``` + **NOTE**: for a specific version replace `latest` with the release version. ### **How to run ERVmap** + To run ERVmap, you'd need: 1. an indexed genome reference for STAR; 2. A bed file with the curated ERV regions on the human genome (see `ERVmap.bed`); 3. the input FASTQ data (gzipped). Assuming that your sample is called `SAMPLE`, and has 2 FASTQ files (one per read) in the folder `/path/to/input/data`; the reference genome is in `/path/to/genome` and the ERV bed file is in `/path/to/erv/file` here is the command: -``` + +```bash docker run --rm \ -u $(id -u):$(id -g) \ -v /path/to/input/data:/data:ro \ @@ -28,8 +37,10 @@ docker run --rm \ --output SAMPLE/SAMPLE. \ --mode ALL ``` + This command will generate the alignment files (BAMs) in the `/path/to/output/SAMPLE/` folder and all files will have the prefix `SAMPLE.`. The generated files will be: -``` + +```bash SAMPLE.Aligned.sortedByCoord.out.bam SAMPLE.Aligned.sortedByCoord.out.bam.bai SAMPLE.ERVresults.txt @@ -38,9 +49,11 @@ SAMPLE.Log.out SAMPLE.Log.progress.out SAMPLE.SJ.out.tab ``` + (See [STAR documentation](https://github.com/alexdobin/STAR) for the description of the output files of the STAR aligner ). The results of ERV quantification will be in the `SAMPLE.ERVresults.txt` file. This is a tab-delimited file with 7 columns from [bedtools](https://bedtools.readthedocs.io/en/latest/). For example: -``` + +```bash 1 896176 898458 5803 500 + 70 1 1412251 1418852 5804 500 + 36 1 3801730 3806808 5807 500 + 6 @@ -48,32 +61,34 @@ The results of ERV quantification will be in the `SAMPLE.ERVresults.txt` file. 
T ``` ## The **`--mode`** option + This option can only have 3 values: { `ALL`, `STAR`, `BED` }: + * `ALL` to run both the STAR aligner and the ERV quantification from start to finish; * `STAR` to only perform the alignment; * `BED` to only run the ERV quantification. - ### <a id='optparam'></a>Optional parameters (*recommended*) There are a few parameters that can be added to the ERVmap image to make the process more efficient. + * `--cpus 20`: if you have a multi-core system (and you should have one), you can specify the number of CPUs to use (e.g. 20); -* `--limit-ram 48000000000`: this limits the amount of RAM used to avoid overusing the resources +* `--limit-ram 48000000000`: this limits the amount of RAM used to avoid overusing the resources You can see the full set of parameters by typing: `docker run --rm ervmap`. -There are also other parameters from Docker that should be included before `ervmap` in the command line, e.g. -``` +There are also other parameters from Docker that should be included before `ervmap` in the command line, e.g. + +```bash --memory 50G \ --memory-swap 100G -``` - ---- +``` -# Nextflow version +## Nextflow version To run this pipeline using [Nextflow](https://www.nextflow.io/), simply run the following: `nextflow -C nextflow.config run main.nf` where `nextflow.config` includes the minimum set of parameters to run ERVmap within the docker container. 
Specifically: -``` + +```bash params { genome='/path/to/genome' # external path to the indexed genome for the STAR aligner inputDir='path/to/input/folder' # external path of the input data @@ -92,14 +107,15 @@ params { ---- -# Published version +## Published version Please note that the instructions hereafter refer to the original published version (see [ERVmap on GitHub](https://github.com/mtokuyama/ERVmap)) ## **Installing** ### Install dependencies -``` + +```bash bedtools2 cufflinks bwa-0.7.17 @@ -112,7 +128,8 @@ trim (http://graphics.med.yale.edu/trim/) ``` ### Install .pl and r files -``` + +```bash erv_genome.pl interleaved.pl run_clean_htseq.pl @@ -126,19 +143,22 @@ normalize_deseq.r This step will yield raw counts for cellular genes and ERVmap loci as separate files. -### For single-end sequences: -``` +### For single-end sequences + +```bash erv_genome.pl -stage 1 -stage2 6 -fastq /${i}_SS.fastq.gz ``` -### For pair-end sequences: -``` +### For pair-end sequences + +```bash interleaved.pl --read1 ${i}_R1.fastq.gz --read2 ${i}_R2.fastq.gz > ${i}.fastq.gz erv_genome.pl -stage 1 -stage2 6 -fastq /${i}.fastq.gz ``` ### Store output files -``` + +```bash mkdir -p output mv ./sample/herv_coverage_GRCh38_genome.txt ./output/erv/${i}.e mv ./sample/GRCh38/htseq.cnt ./output/cellular/${i}.c @@ -146,10 +166,10 @@ mv ./sample/GRCh38/htseq.cnt ./output/cellular/${i}.c ## **Clean up data, merge, and normalize** -These steps will yield normalized ERV read counts based on size factors obtained through DESeq2 analysis. -Use the output files from above. +These steps will yield normalized ERV read counts based on size factors obtained through DESeq2 analysis. +Use the output files from above. 
-``` +```bash run_clean_htseq.pl ./output/cellular c c2 __ merge_count.pl 3 6 e ./output/erv > ./output/erv/merged_erv.txt merge_count.pl 0 1 c2 ./output/cellular > ./output/cellular/merged_cellular.txt @@ -161,6 +181,3 @@ normalize_with_file.pl ./output/cellular/normalized_factors ./output/erv/merged_ * Maria Tokuyama * Yong Kong - - -