diff --git a/.gitignore b/.gitignore index 39e5657..3b490e2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,27 @@ +# General +*.pyc +.~lock* + +# Tests +.cache/ +*peppro_test* + +# MkDocs files +site/ + +# Jekyll files +jekyll/ +_site +.DS_store +.jekyll +.bundle +.sass-cache +_site/ +/_site/ +.sass-cache/ +.jekyll-metadata + +# Annotation files # ignore local annotation files anno/hg19_annotations.bed.gz anno/hg19_annotations.bed @@ -7,3 +31,4 @@ anno/mm10_annotations.bed.gz anno/mm10_annotations.bed anno/mm9_annotations.bed.gz anno/mm9_annotations.bed + diff --git a/BiocProject/PEPPRO_BiocProject.Rmd b/BiocProject/PEPPRO_BiocProject.Rmd new file mode 100644 index 0000000..9654da8 --- /dev/null +++ b/BiocProject/PEPPRO_BiocProject.Rmd @@ -0,0 +1,55 @@ +--- +title: "PEPPRO BiocProject" +author: "Jason Smith" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{PEPPRO BiocProject} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +# Introduction + +Before you start, see the [Getting started with `BiocProject` vignette](http://code.databio.org/BiocProject/articles/vignette1getStarted.html) for the basic `BiocProject` information and installation instructions and the [`PEPPRO` website](http://peppro.databio.org) for information regarding this nascent RNA profiling pipeline. + +`BiocProject` provides a straightforward method to read in pipeline outputs as listed in the `outputs` section of its [pipeline interface](http://code.databio.org/looper/pipeline-interface/). + +__With a single line of code you can read all the indicated results and your project metadata.__ + +# Read the results of a `PEPPRO` run + +The function shown below reads in the gene count `BED` files from the `outputs` section specified in the [`PEPPRO` pipeline interface](https://github.com/databio/peppro/blob/master/pipeline_interface.yaml). 
+ +The way the output files are read is defined in a [function](https://github.com/databio/peppro/blob/master/BiocProject/readPepproGeneCounts.R) supplied by the `PEPPRO` developers. The function listed in the `bioconductor` section of the `PEPPRO` pipeline interface file is identified by the `BiocProject` function, sourced, and automatically executed on samples matching the protocols bound to the pipeline specified as an argument in the [`outputsByPipeline`](http://code.databio.org/BiocProject/reference/outputsByPipeline.html) function. + +## Get the project config + +```{r echo=T, message=FALSE} +library(BiocProject) +ProjectConfig = "peppro_da.yaml" +``` + +## Run the `BiocProject` function + +```{r} +bp = BiocProject(ProjectConfig) +``` + +As you can see in the message above, the `readPepproGeneCounts` function was sourced from the file indicated in the `PEPPRO` pipeline interface. + +## Browse the results + +The read data is conveniently stored in a `List` object with a [`pepr::Project`](http://code.databio.org/pepr/reference/Project-class.html) object in its metadata slot: + +```{r} +bp +``` \ No newline at end of file diff --git a/BiocProject/peppro_da.csv b/BiocProject/peppro_da.csv new file mode 100644 index 0000000..9d55d3e --- /dev/null +++ b/BiocProject/peppro_da.csv @@ -0,0 +1,7 @@ +sample_name,toggle,protocol,organism,read_type,umi_status,umi_length,data_source,read1,read2,srr,experiment,geo,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Cell_Line,Cell_type,Center Name,Consent,DATASTORE filetype,DATASTORE provider,DATASTORE region,Instrument,LibraryLayout,LibrarySelection,LibrarySource,Organism,Platform,ReleaseDate,Sample Name,source_name,SRA Study,treatment +H9_DMSO_rep1,1,PRO,human,PAIRED,true_8,8,SRA,PE1,PE2,SRR10669536,SRX7348011,GSM4214080,OTHER,83,4066110724,PRJNA594951,SAMN13541464,1576539216,H9,embryonic stem cells,GEO,public,"fastq,sra","s3,ncbi,gs","s3.us-east-1,gs.US,ncbi.public",NextSeq 500,PAIRED,other,TRANSCRIPTOMIC,Homo 
sapiens,ILLUMINA,2019-12-13T00:00:00Z,GSM4214080,H9 cells,SRP236879,control +H9_DMSO_rep2,1,PRO,human,PAIRED,true_8,8,SRA,PE1,PE2,SRR10669537,SRX7348012,GSM4214081,OTHER,83,4824187397,PRJNA594951,SAMN13541463,1851708346,H9,embryonic stem cells,GEO,public,"sra,fastq","gs,s3,ncbi","gs.US,s3.us-east-1,ncbi.public",NextSeq 500,PAIRED,other,TRANSCRIPTOMIC,Homo sapiens,ILLUMINA,2019-12-13T00:00:00Z,GSM4214081,H9 cells,SRP236879,control +H9_DMSO_rep3,1,PRO,human,PAIRED,true_8,8,SRA,PE1,PE2,SRR10669538,SRX7348013,GSM4214082,OTHER,83,3857336412,PRJNA594951,SAMN13541462,1508923353,H9,embryonic stem cells,GEO,public,"sra,fastq","s3,ncbi,gs","gs.US,s3.us-east-1,ncbi.public",NextSeq 500,PAIRED,other,TRANSCRIPTOMIC,Homo sapiens,ILLUMINA,2019-12-13T00:00:00Z,GSM4214082,H9 cells,SRP236879,control +H9_200nM_romidepsin_rep1,1,PRO,human,PAIRED,true_8,8,SRA,PE1,PE2,SRR10669539,SRX7348014,GSM4214083,OTHER,83,4636791999,PRJNA594951,SAMN13541471,1798846852,H9,embryonic stem cells,GEO,public,"fastq,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",NextSeq 500,PAIRED,other,TRANSCRIPTOMIC,Homo sapiens,ILLUMINA,2019-12-13T00:00:00Z,GSM4214083,H9 cells,SRP236879,60 minutes 200nM romidepsin +H9_200nM_romidepsin_rep2,1,PRO,human,PAIRED,true_8,8,SRA,PE1,PE2,SRR10669540,SRX7348015,GSM4214084,OTHER,82,4730726832,PRJNA594951,SAMN13541470,1833437275,H9,embryonic stem cells,GEO,public,"fastq,sra","ncbi,gs,s3","gs.US,ncbi.public,s3.us-east-1",NextSeq 500,PAIRED,other,TRANSCRIPTOMIC,Homo sapiens,ILLUMINA,2019-12-13T00:00:00Z,GSM4214084,H9 cells,SRP236879,60 minutes 200nM romidepsin +H9_200nM_romidepsin_rep3,1,PRO,human,PAIRED,true_8,8,SRA,PE1,PE2,SRR10669541,SRX7348016,GSM4214085,OTHER,83,5015008131,PRJNA594951,SAMN13541469,1922230177,H9,embryonic stem cells,GEO,public,"fastq,sra","s3,gs,ncbi","ncbi.public,gs.US,s3.us-east-1",NextSeq 500,PAIRED,other,TRANSCRIPTOMIC,Homo sapiens,ILLUMINA,2019-12-13T00:00:00Z,GSM4214085,H9 cells,SRP236879,60 minutes 200nM romidepsin diff --git 
a/BiocProject/peppro_da.yaml b/BiocProject/peppro_da.yaml new file mode 100644 index 0000000..3431f57 --- /dev/null +++ b/BiocProject/peppro_da.yaml @@ -0,0 +1,27 @@ +# Run PEPPRO paper differential analysis samples through PEPPRO +name: PEPPRO + +metadata: + sample_table: "peppro_da.csv" + output_dir: "$PROCESSED/peppro/paper/da" + pipeline_interfaces: "$CODE/peppro/pipeline_interface.yaml" +bioconductor: + readFunName: readPepproGeneCounts + readFunPath: readPepproGeneCounts.R + +derived_columns: [read1, read2] + +data_sources: + PE1: "${SRAFQ}/{srr}_1.fastq.gz" + PE2: "${SRAFQ}/{srr}_2.fastq.gz" + +implied_columns: + organism: + human: + genome: hg38 + prealignments: human_rDNA + max_len: -1 + umi_status: + true_8: + umi_len: 8 + diff --git a/BiocProject/readPepproGeneCounts.R b/BiocProject/readPepproGeneCounts.R new file mode 100644 index 0000000..acddf46 --- /dev/null +++ b/BiocProject/readPepproGeneCounts.R @@ -0,0 +1,31 @@ +# Read each sample's PEPPRO gene coverage BED file into a GRangesList. +# Samples whose coverage file does not exist are left out of the result. +readPepproGeneCounts <- function(project) { + project_dir <- pepr::config(project)$metadata$output_dir + sample_names <- pepr::samples(project)$sample_name + genomes <- as.list(pepr::samples(project)$genome) + names(genomes) <- sample_names + paths <- vector("list", length(sample_names)) + names(paths) <- sample_names + + for (sample in sample_names) { + paths[[sample]] <- paste(project_dir, 'results_pipeline', sample, + paste0('signal_', genomes[[sample]]), + paste0(sample, "_gene_coverage.bed"), sep="/") + } + + result <- lapply(paths, function(x){ + #message(paste0("x: ", x)) + if (file.exists(x)) { + df <- data.table::fread(x) + colnames(df) <- c('chr', 'start', 'end', 'geneName', + 'score', 'strand', 'count') + gr <- GenomicRanges::GRanges(df) + } else { + gr <- GenomicRanges::GRanges() + } + }) + + #names(result) <- sample_names + return(GenomicRanges::GRangesList(Filter(length, result))) +} diff --git a/PEPPROr/DESCRIPTION b/PEPPROr/DESCRIPTION index 46f4b32..c5a1a05 100644 --- a/PEPPROr/DESCRIPTION +++ 
b/PEPPROr/DESCRIPTION @@ -1,6 +1,6 @@ Package: PEPPROr Title: Functions and libraries to analyze pro-seq (or gro-seq) data -Version: 0.0.1.0000 +Version: 0.0.2.0000 Authors@R: person("Jason", "Smith", email = "jasonsmith@virginia.edu", role = c("aut", "cre")) Maintainer: Jason Smith Description: Installs required libraries to calculate the fraction of reads in features, to plot library complexity curves, TSS enrichments, and fragment length distributions. diff --git a/PEPPROr/NAMESPACE b/PEPPROr/NAMESPACE index 220ca80..1743893 100644 --- a/PEPPROr/NAMESPACE +++ b/PEPPROr/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(plotCutadapt) +export(plotAdapt) export(plotPI) export(mRNAcontamination) export(plotFRiF) diff --git a/PEPPROr/R/PEPPROr.R b/PEPPROr/R/PEPPROr.R index 3a502f8..577f448 100644 --- a/PEPPROr/R/PEPPROr.R +++ b/PEPPROr/R/PEPPROr.R @@ -14,6 +14,29 @@ NULL ################################################################################ # FUNCTIONS +#' A standardized ggplot theme for PEPPRO plots +#' @param base_family Base font family handed to theme_classic(), together with any extra arguments in `...` +#' @keywords ggplot2 theme +#' @examples +#' theme_PEPPRO() +theme_PEPPRO <- function(base_family = "sans", ...){ + theme_classic(base_family = base_family, base_size = 14, ...) 
+ + theme( + axis.line = element_line(size = 0.5), + axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.background = element_rect(fill = "transparent"), + plot.background = element_rect(fill = "transparent", color = NA), + legend.background = element_rect(fill = "transparent", color = NA), + legend.box.background = element_rect(fill = "transparent", color = NA), + aspect.ratio = 1, + legend.position = "none", + plot.title = element_text(hjust = 0.5), + panel.border = element_rect(colour = "black", fill=NA, size=0.5) + ) +} + #' Plot library complexity curves #' @@ -75,7 +98,7 @@ plotComplexityCurves <- function(ccurves, rcDT$name <- basename(rc_file$V1) rcDT$total <- as.integer(rc_file$V2) if (ncol(rc_file) == 3 && !ignore_unique) { - rcDT$unique <- as.integer(rc_file$V3) + rcDT$unique <- as.integer(rc_file$V2) } else { rcDT$unique <- NA } @@ -84,9 +107,9 @@ plotComplexityCurves <- function(ccurves, for (rc in 2:length(real_counts_path)) { rc_file <- real_counts_path[rc] rcDT$name[rc] <- basename(rc_file$V1) - rcDT$total[rc] <- as.integer(rc_file$V2) + rcDT$total[rc] <- as.integer(rc_file$V3) if (ncol(rc_file) == 3 && !ignore_unique) { - rcDT$unique[rc] <- as.integer(rc_file$V3) + rcDT$unique[rc] <- as.integer(rc_file$V2) } else { rcDT$unique[rc] <- NA } @@ -98,9 +121,9 @@ plotComplexityCurves <- function(ccurves, if (file.exists(real_counts_path[rc]) && info$size != 0) { rc_file <- fread(real_counts_path[rc]) rcDT$name[rc] <- basename(rc_file$V1) - rcDT$total[rc] <- as.integer(rc_file$V2) + rcDT$total[rc] <- as.integer(rc_file$V3) if (ncol(rc_file) == 3 && !ignore_unique) { - rcDT$unique[rc] <- as.integer(rc_file$V3) + rcDT$unique[rc] <- as.integer(rc_file$V2) } else { rcDT$unique[rc] <- NA } @@ -243,6 +266,15 @@ plotComplexityCurves <- function(ccurves, } } + # Plot by millions of reads + plottingFactor <- 1000000 + + df$total_reads <- df$total_reads/plottingFactor + 
df$expected_distinct <- df$expected_distinct/plottingFactor + rcDT$total <- rcDT$total/plottingFactor + rcDT$unique <- rcDT$unique/plottingFactor + x_max <- x_max/plottingFactor + # Plot the curve fig <- ggplot(df, aes(total_reads, expected_distinct, @@ -259,8 +291,8 @@ plotComplexityCurves <- function(ccurves, aes(total, unique, col=color), shape=23, size=3) message(paste0("INFO: Found real counts for ", - paste(rcDT$name, sep=","), " - Total: ", - rcDT$total, " Unique: ", + paste(rcDT$name, sep=","), " - Total (M): ", + rcDT$total, " Unique (M): ", rcDT$unique, "\n")) } else if (any(rcDT$total > 0)) { if (max(rcDT$total) > max(df$total_reads)) { @@ -273,8 +305,8 @@ plotComplexityCurves <- function(ccurves, aes(total, interp, col=color), shape=23, size=3) message(paste0("INFO: Found real counts for ", - paste(rcDT$name, sep=","), " - Total: ", - rcDT$total, " (preseq unique reads: ", + paste(rcDT$name, sep=","), " - Total (M): ", + rcDT$total, " (preseq unique reads (M): ", interp, ")\n")) } } else { @@ -308,6 +340,11 @@ plotComplexityCurves <- function(ccurves, if (coverage > 0) { default_ylim <- as.numeric(default_ylim) / coverage } + + # Adjust limits by plottingFactor + default_ylim <- default_ylim/plottingFactor + preseq_ymax <- preseq_ymax/plottingFactor + fig <- fig + coord_cartesian(xlim=c(x_min, x_max), ylim = c(default_ylim, @@ -324,52 +361,48 @@ plotComplexityCurves <- function(ccurves, if (coverage > 0) { if (!any(is.na(rcDT$unique)) && any(rcDT$unique > 0)) { fig <- fig + - xlab(paste0("Total Coverage (incl. duplicates)\n", - "Points show read count versus deduplicated ", - "read counts (externally calculated)")) + labs(x = paste0("total coverage (incl. duplicates)"), + caption = paste0("Points show read count versus ", + "deduplicated read counts ", + "(externally calculated)")) } else if (any(rcDT$total > 0)) { fig <- fig + - xlab(paste0("Total Coverage (incl. 
duplicates)\n", - "Points show read count versus projected unique ", - "read counts on the curves")) + labs(x = "total coverage (incl. duplicates)", + caption = paste0("Points show read count versus projected ", + "unique read counts on the curves")) } else { fig <- fig + - xlab(paste0("Total Coverage (incl. duplicates)")) + labs(x = "total coverage (incl. duplicates)") } fig <- fig + - ylab("Unique Coverage") + - ggtitle("Complexity Curve: preseq") + labs(y = "unique coverage") + #ggtitle("Complexity Curve: preseq") } else { if (!any(is.na(rcDT$unique)) && any(rcDT$unique > 0)) { fig <- fig + - xlab(paste0("Total Reads (incl. duplicates)\n", - "Points show read count versus deduplicated ", - "read counts (externally calculated)")) + labs(x = "total reads (M) (incl. duplicates)", + caption = paste0("Points show read count versus deduplicated ", + "read counts (externally calculated)")) } else if (any(rcDT$total > 0)) { fig <- fig + - xlab(paste0("Total Reads (incl. duplicates)\n", - "Points show externally calculated read ", - "counts on the curves")) + labs(x = "total reads (M) (incl. duplicates)", + caption = paste0("Points show externally calculated read ", + "counts on the curves")) } else { fig <- fig + - xlab(paste0("Total Reads (incl. duplicates)")) + labs(x = "total reads (M) (incl. 
duplicates)") } fig <- fig + - ylab("Unique Reads") + - ggtitle("Complexity Curve: preseq") + labs(y = "unique reads (M)") + #ggtitle("Complexity Curve: preseq") } fig <- fig + labs(col = "") + scale_color_discrete(labels=c(clist$SAMPLE_NAME)) + - theme_classic(base_size=14) + - theme(axis.line = element_line(size = 0.5)) + - theme(panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - aspect.ratio = 1, - panel.border = element_rect(colour = "black", - fill=NA, size=0.5)) + - theme(plot.title = element_text(hjust = 0.5)) + theme_PEPPRO() + + theme(legend.position = "right", + plot.caption = element_text(size = 8, face = "italic")) # inset zoom plot zoom_theme <- theme(legend.position = "none", @@ -383,7 +416,7 @@ plotComplexityCurves <- function(ccurves, panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_rect(color='black'), - plot.margin = unit(c(0,0,-6,-6),"mm")) + plot.margin = unit(c(0.1,0.1,-6,-6),"mm")) if (!any(is.na(rcDT$unique)) && any(rcDT$unique > 0)) { zoom_fig <- ggplot(df, aes(total_reads, @@ -408,7 +441,7 @@ plotComplexityCurves <- function(ccurves, annotation_custom(grob = g, xmin = x_max / 2, xmax = x_max, - ymin = 0, + ymin = 10, ymax = max(preseq_ymax, max_unique)/2) } else if (any(rcDT$total > 0)) { interp <- approx(df$total_reads, df$expected_distinct, rcDT$total)$y @@ -434,13 +467,19 @@ plotComplexityCurves <- function(ccurves, annotation_custom(grob = g, xmin = x_max / 2, xmax = x_max, - ymin = 0, + ymin = 10, ymax = max(preseq_ymax, max_unique)/2) } - return(p) + # Don't include legend for single sample plots + if (length(ccurves) == 1) { + fig <- fig + theme(legend.position = "none") + } + + return(fig) } + #' Compute the axis value limit #' #' This function returns the index of ccurve_TOTAL_READS containing the @@ -480,6 +519,7 @@ computeLimit <- function(value, ccurve_TOTAL_READS) { return(first_point) } + #' Calculate the Fraction of Reads in Features (FRiF) #' #' This 
function calculates the fraction of reads in a feature and returns @@ -488,23 +528,38 @@ computeLimit <- function(value, ccurve_TOTAL_READS) { #' number of total features. #' #' @param bedFile A BED format file -#' @param reads Number of aligned reads +#' @param total Number of aligned reads (or number of aligned bases) +#' @param reads If TRUE, we're working with read counts. +#' If FALSE, we're working with absolute number of bases #' @keywords FRiF #' @examples #' calcFRiF() -calcFRiF <- function(bedFile, reads) { - colnames(bedFile) <- c("chromosome","start","end","count") +calcFRiF <- function(bedFile, total, reads) { + colnames(bedFile) <- c("chromosome", "start", "end", + "count", "bases", "width", "fraction") grObj <- makeGRangesFromDataFrame(bedFile) grObj <- reduce(grObj) redBed <- data.frame(chromosome=seqnames(grObj), start=start(grObj), end=end(grObj)) bedFile <- merge(redBed, bedFile, by=c("chromosome","start","end")) bedFile <- cbind(bedFile, size=(bedFile$end-bedFile$start)) - bedFile <- bedFile[order(-bedFile$count),] + + if (reads) { + bedFile <- bedFile[order(-bedFile$count),] + } else { + bedFile <- bedFile[order(-bedFile$bases),] + } + bedFile <- bedFile[apply(bedFile != 0, 1, all),] - bedFile <- cbind(bedFile, cumsum=cumsum(bedFile$count)) + + if (reads) { + bedFile <- cbind(bedFile, cumsum=cumsum(bedFile$count)) + } else { + bedFile <- cbind(bedFile, cumsum=cumsum(bedFile$bases)) + } + bedFile <- cbind(bedFile, cumSize=cumsum(bedFile$size)) - bedFile <- cbind(bedFile, frip=bedFile$cumsum/as.numeric(reads)) + bedFile <- cbind(bedFile, frip=bedFile$cumsum/as.numeric(total)) bedFile <- cbind(bedFile, numfeats=as.numeric(1:nrow(bedFile))) return(bedFile) } @@ -517,6 +572,7 @@ calcFRiF <- function(bedFile, reads) { #' #' @param sample_name Name of sample #' @param num_reads Number of aligned reads in sample +#' @param genome_size Size of genome in bp #' @param output_name Output file name #' @param bedFile A BED format file #' @keywords FRiP FRiF 
BED @@ -533,10 +589,15 @@ calcFRiF <- function(bedFile, reads) { #' bedFile = c("promoter", "promoter_flanking", "exon", #' "intron", "utr3", "utr5")) #' @export -plotFRiF <- function(sample_name, num_reads, output_name, bedFile) { +plotFRiF <- function(sample_name, num_reads, genome_size, + type = c("frif", "prif", "both"), + reads=TRUE, output_name, bedFile) { labels <- data.frame(xPos=numeric(), yPos=numeric(), name=character(), val=numeric(), color=character(), stringsAsFactors=FALSE) + feature_dist <- data.frame(feature=character(), numfeats=numeric(), + numbases=numeric(), expected=numeric(), + stringsAsFactors=FALSE) palette <- colorRampPalette(c("#999999", "#FFC107", "#27C6AB", "#004D40", "#B97BC8", "#009E73", "#C92404", "#E3E550", "#372B4C", "#E3DAC7", "#27CAE6", "#B361BC", @@ -549,22 +610,32 @@ plotFRiF <- function(sample_name, num_reads, output_name, bedFile) { if (exists(bedFile[1])) { bed <- get(bedFile[1]) - bedCov <- calcFRiF(bed, num_reads) + bedCov <- calcFRiF(bed, num_reads, reads) name <- bedFile[1] labels[1,] <- c(0.95*max(log10(bedCov$cumSize)), max(bedCov$frip)+0.001, name, round(max(bedCov$frip),2), "#FF0703") + feature_dist[1,] <- c(name, nrow(bed), + as.numeric(sum(abs(bed$V3-bed$V2))), + as.numeric((sum(abs(bed$V3-bed$V2))/genome_size))) bedCov$feature <- name } else if (file.exists(file.path(bedFile[1])) && info$size != 0) { - bed <- read.table(file.path(bedFile[1])) - bedCov <- calcFRiF(bed, num_reads) - name <- basename(tools::file_path_sans_ext(bedFile[1])) - name <- gsub(sample_name, "", name) - name <- gsub("^.*?_", "", name) - numFields <- 2 - for(i in 1:numFields) name <- gsub("_[^_]*$", "", name) - labels[1,] <- c(0.95*max(log10(bedCov$cumSize)), max(bedCov$frip)+0.001, - name, round(max(bedCov$frip),2), "#FF0703") - bedCov$feature <- name + bed <- read.table(file.path(bedFile[1])) + if (nrow(bed[which(bed$V5 != 0),]) == 0) { + message(paste0(name, " has no covered features")) + } else { + bedCov <- calcFRiF(bed, num_reads, reads) 
+ name <- basename(tools::file_path_sans_ext(bedFile[1])) + name <- gsub(sample_name, "", name) + name <- gsub("^.*?_", "", name) + numFields <- 2 + for(i in 1:numFields) name <- gsub("_[^_]*$", "", name) + labels[1,] <- c(0.95*max(log10(bedCov$cumSize)), max(bedCov$frip)+0.001, + name, round(max(bedCov$frip),2), "#FF0703") + feature_dist[1,] <- c(name, nrow(bed), + as.numeric(sum(abs(bed$V3-bed$V2))), + as.numeric((sum(abs(bed$V3-bed$V2))/genome_size))) + bedCov$feature <- name + } } else { if (is.na(info[1])) { message(paste0(name, " coverage file is missing")) @@ -577,6 +648,8 @@ plotFRiF <- function(sample_name, num_reads, output_name, bedFile) { if (exists("bedCov")) { covDF <- bedCov + } else { + return(ggplot()) } if (length(bedFile) > 1) { @@ -604,48 +677,188 @@ plotFRiF <- function(sample_name, num_reads, output_name, bedFile) { if (max(bed[,4] > 0)) { if (exists("covDF")) { - covFile <- calcFRiF(bed, num_reads) + covFile <- calcFRiF(bed, num_reads, reads) covFile$feature <- name covDF <- rbind(covDF, covFile) labels <- rbind(labels, c(0.95*max(log10(covFile$cumSize)), max(covFile$frip)+0.001, name, round(max(covFile$frip),2), plotColors[i])) + feature_dist <- rbind(feature_dist, + c(name, nrow(bed), as.numeric(sum(abs(bed$V3-bed$V2))), + as.numeric((sum(abs(bed$V3-bed$V2))/genome_size)))) } else { - covDF <- calcFRiF(bed, num_reads) + covDF <- calcFRiF(bed, num_reads, reads) covDF$feature <- name labels <- rbind(labels, c(0.95*max(log10(covDF$cumSize)), max(covDF$frip)+0.001, name, round(max(covDF$frip),2), plotColors[i])) + feature_dist <- rbind(feature_dist, + c(name, nrow(bed), as.numeric(sum(abs(bed$V3-bed$V2))), + as.numeric((sum(abs(bed$V3-bed$V2))/genome_size)))) } } } } - # Reorder by labels + # Reorder by labels (ensures plotting matches up labels and colors) if (exists("covDF")) { covDF$feature <- factor(covDF$feature, levels=(labels$name)) } + feature_dist$numbases <- as.numeric(feature_dist$numbases) + feature_dist$expected <- 
as.numeric(feature_dist$expected) + feature_dist$observed <- as.numeric(labels$val) + feature_dist$logOE <- log10(feature_dist$observed/feature_dist$expected) + feature_dist$logOE <- ifelse(feature_dist$logOE < 0, 0, feature_dist$logOE) + feature_dist <- merge(feature_dist, labels, by.x="feature", by.y="name") + #feature_dist <- feature_dist[order(feature_dist$logOE, decreasing=TRUE),] + feature_dist <- feature_dist[order(feature_dist$logOE),] + rownames(feature_dist) <- NULL + feature_dist$feature <- factor(feature_dist$feature, + levels=feature_dist$feature) + feature_dist$color <- factor(feature_dist$color, + levels=feature_dist$color) + if (!is.null(bedFile)) { - # Produce plot with bed files - p <- ggplot(covDF, aes(x=log10(cumSize), y=frip, - group=feature, color=feature)) + - geom_line() + - labs(x="log(number of bases)", y="FRiF") + - theme_classic() + - theme(panel.border = element_rect(colour = "black", - fill=NA, - size=0.5)) - - # Recolor and reposition legend - p <- p + scale_color_manual(labels=paste0(labels$name, ": ", - labels$val), - values=labels$color) + - theme(legend.position=c(0.05,0.95), - legend.justification=c(0.1,0.9)) + + if (tolower(type) == "both") { + # Produce plot with bed files + # take minimum quantile (only works if everything is above that value) + #p <- ggplot(covDF[which(covDF$frip > min(density(covDF$frip)$y)),], + # aes(x=log10(cumSize), y=frip, + # group=feature, color=feature)) + + p <- ggplot(covDF, aes(x=log10(cumSize), y=frip, + group=feature, color=feature)) + + #geom_line(aes(linetype=feature), size=2, alpha=0.5) + + geom_line(size=2, alpha=0.5) + + guides(linetype = FALSE) + + labs(x="log10(number of bases)", y="FRiF") + + theme_PEPPRO() + + # Recolor and reposition legend + p <- p + scale_color_manual(labels=paste0(labels$name, ": ", + labels$val), + values=labels$color) + + labs(color="FRiF") + + theme(legend.position="right", + legend.justification=c(0.1,0.9), + legend.background=element_blank(), + legend.text = 
element_text(size = rel(0.65)), + legend.key = element_blank(), + axis.text.x = element_text(angle = 0, hjust = 1, + vjust=0.5)) + + p2 <- ggplot(feature_dist, aes(x = feature, y = logOE)) + + geom_bar(stat="identity", fill=labels$color, alpha=0.5) + + geom_hline(aes(yintercept=0), linetype="dotted") + + xlab('') + + ylab('log10(Obs/Exp)') + + coord_flip() + + scale_x_discrete(position="top") + + theme_PEPPRO() + + theme(plot.background = element_rect(fill = "transparent", + color = NA,), + panel.background = element_rect(fill = "transparent"), + rect = element_rect(fill = "transparent"), + plot.margin = unit(c(0,0,-6.5,-6.5),"mm")) + + g <- ggplotGrob(p2) + min_x <- min(layer_scales(p)$x$range$range) + max_x <- max(layer_scales(p)$x$range$range) + min_y <- min(layer_scales(p)$y$range$range) + max_y <- max(layer_scales(p)$y$range$range) + + p <- p + annotation_custom(grob = g, xmin = 1.05*min_x, + xmax=min_x*2.05, ymin=max_y/2, + ymax=max_y) + } else if (tolower(type) == "frif") { + # take minimum quantile (only works if everything is above that value) + #p <- ggplot(covDF[which(covDF$frip > min(density(covDF$frip)$y)),], + # aes(x=log10(cumSize), y=frip, + # group=feature, color=feature)) + + p <- ggplot(covDF, aes(x=log10(cumSize), y=frip, + group=feature, color=feature)) + + #geom_line(aes(linetype=feature), size=2, alpha=0.5) + + geom_line(size=2, alpha=0.5) + + guides(linetype = FALSE) + + labs(x="log10(number of bases)", y="FRiF") + + theme_PEPPRO() + + # Recolor and reposition legend + p <- p + scale_color_manual(labels=paste0(labels$name, ": ", + labels$val), + values=labels$color) + + labs(color="FRiF") + + theme(legend.position=c(0.075,0.975), + legend.justification=c(0.1,0.9), + legend.title = element_blank(), + legend.text = element_text(size = rel(0.65)), + legend.background=element_blank(), + legend.key = element_blank(), + axis.text.x = element_text(angle = 0, hjust = 1, + vjust=0.5)) + } else if (tolower(type) == "prif") { + p <- 
ggplot(feature_dist, aes(x = feature, y = logOE)) + + geom_bar(stat="identity", + fill = feature_dist$color, + alpha = 0.5) + + geom_hline(aes(yintercept=0), linetype="dotted") + + xlab('') + + ylab('log10(Obs/Exp)') + + coord_flip() + + theme_PEPPRO() + } else { + #default to both + # Produce plot with bed files + p <- ggplot(covDF, + aes(x=log10(cumSize), y=frip, + group=feature, color=feature)) + + geom_line(aes(linetype=feature), size=2, alpha=0.5) + + guides(linetype = FALSE) + + labs(x="log10(number of bases)", y="FRiF") + + theme_PEPPRO() + + # Recolor and reposition legend + p <- p + scale_color_manual(labels=paste0(labels$name, ": ", + labels$val), + values=labels$color) + + labs(color="FRiF") + + theme(legend.position="right", + legend.justification=c(0.1,0.9), + legend.background=element_blank(), + legend.key = element_blank(), + axis.text.x = element_text(angle = 0, hjust = 1, + vjust=0.5)) + + p2 <- ggplot(feature_dist, aes(x = feature, y = logOE)) + + geom_bar(stat="identity", fill=labels$color, alpha=0.5) + + geom_hline(aes(yintercept=0), linetype="dotted") + + xlab('') + + ylab('log10(Obs/Exp)') + + coord_flip() + + scale_x_discrete(position="top") + + theme_PEPPRO() + + theme(plot.background = element_rect(fill = "transparent", + color = NA,), + panel.background = element_rect(fill = "transparent"), + rect = element_rect(fill = "transparent"), + plot.margin = unit(c(0,0,-6.5,-6.5),"mm")) + + g <- ggplotGrob(p2) + min_x <- min(layer_scales(p)$x$range$range) + max_x <- max(layer_scales(p)$x$range$range) + min_y <- min(layer_scales(p)$y$range$range) + max_y <- max(layer_scales(p)$y$range$range) + + p <- p + annotation_custom(grob = g, xmin = 1.05*min_x, + xmax=min_x*2.05, ymin=max_y/2, + ymax=max_y) + } + + } else { write("Unable to produce FRiF plot!\n", stdout()) } @@ -670,7 +883,7 @@ roundUpNice <- function(x, nice=c(1,2,3,4,5,6,7,8,9,10)) { #' Plot TSS enrichment #' -#' This function plots the global TSS enrichment and produces pdf/png files. 
+#' This function plots the global TSS enrichment. #' #' @param TSSfile TSS enrichment file #' @keywords TSS enrichment @@ -696,26 +909,17 @@ plotTSS <- function(TSSfile) { } } - t1 <- theme( - plot.background = element_blank(), - panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(colour = "black", fill=NA, size=0.5), - panel.background = element_blank(), - axis.line = element_blank(), - axis.text.x = element_text(face = "plain", color = "black", - size = 20, hjust = 0.5), - axis.text.y = element_text(face = "plain", color = "black", - size = 20, hjust = 0.5), - axis.title.x = element_text(face = "plain", color = "black", size = 22, - hjust = 0.5, vjust=0.5), - axis.title.y = element_text(face = "plain", color = "black", size = 22, - hjust = 0.5), - plot.title = element_text(face="bold", color = "black", size=12, - hjust=0.5), - legend.position="none", - axis.ticks.length = unit(2, "mm") - ) + t1 <- theme_classic(base_size=14) + + theme(plot.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_rect(colour = "black", + fill=NA, size=0.5), + panel.background = element_blank(), + axis.line = element_blank(), + legend.position="none", + aspect.ratio = 1, + axis.ticks.length = unit(2, "mm")) iMat <- data.table(V1 = numeric()) if (length(TSSfile) == 1) { @@ -809,16 +1013,16 @@ plotTSS <- function(TSSfile) { pre <- ggplot(normTSS, aes(x=(as.numeric(rownames(normTSS))- (nrow(normTSS)/2)), y=score, group=1, colour="black")) + - geom_hline(yintercept = 6, linetype = 2, - color = "grey", size = 0.25) + + # geom_hline(yintercept = 6, linetype = 2, + # color = "grey", size = 0.25) + geom_smooth(method="loess", span=0.02, se=FALSE, colour=lineColor) + labs(x = "Distance from TSS (bp)", y = "TSS Enrichment Score") - y_max <- max(30, roundUpNice(TSSscore)) + y_max <- roundUpNice(TSSscore) p <- pre + t1 + scale_x_continuous(expand=c(0,0)) + 
scale_y_continuous(expand=c(0,0)) + - coord_cartesian(xlim=c(-2300, 2300), ylim=c(0, y_max+2)) + coord_cartesian(xlim=c(-2300, 2300), ylim=c(0, 1.1*y_max)) if (exists("minus")) { val <- 0.025*nrow(minus) # normTSS <- (minus / mean(minus[c(1:val, @@ -846,25 +1050,25 @@ plotTSS <- function(TSSfile) { y=score, group=1, colour="black"), method="loess", span=0.02, se=FALSE, colour="blue") + - annotate("rect", xmin=1200, xmax=2300, ymin=y_max-8, - ymax=y_max+2, fill="gray95", size = 0.5) + - annotate("text", x=1750, y=y_max, label="TSS Score", - fontface = 1, size=6, hjust=0.5) + - annotate("text", x=1500, y=y_max-2, label="+", fontface = 2, - size=8, hjust=0.5, color=lineColor) + - annotate("text", x=1500, y=y_max-5, label=TSSscore, - fontface = 2, size=8, hjust=0.5, color=lineColor) + - annotate("text", x=2000, y=y_max-2, label="-", - fontface = 2, size=8, hjust=0.5, color="blue") + - annotate("text", x=2000, y=y_max-5, label=minusTSSscore, - fontface = 2, size=8, hjust=0.5, color="blue") + annotate("rect", xmin=1200, xmax=2300, ymin=0.9*y_max, + ymax=1.1*y_max, fill="gray95") + + annotate("text", x=1750, y=1.05*y_max, label="TSS Score", + fontface = 1, hjust=0.5) + + annotate("text", x=1500, y=y_max, label="+", fontface = 2, + hjust=0.5, color=lineColor) + + annotate("text", x=1500, y=0.95*y_max, label=TSSscore, + fontface = 2, hjust=0.5, color=lineColor) + + annotate("text", x=2000, y=y_max, label="-", + fontface = 2, hjust=0.5, color="blue") + + annotate("text", x=2000, y=0.95*y_max, label=minusTSSscore, + fontface = 2, hjust=0.5, color="blue") } else { - p <- p + annotate("rect", xmin=1200, xmax=2300, ymin=y_max-4, - ymax=y_max+2, fill="gray95", size = 0.5) + - annotate("text", x=1750, y=y_max+1, label="TSS Score", - fontface = 1, size=6, hjust=0.5) + - annotate("text", x=1750, y=y_max-1, label=TSSscore, - fontface = 2, size=10, hjust=0.5) + p <- p + annotate("rect", xmin=1200, xmax=2300, ymin=0.9*y_max, + ymax=1.1*y_max, fill="gray95") + + annotate("text", x=1750, 
y=1.05*y_max, label="TSS Score", + fontface = 1, hjust=0.5) + + annotate("text", x=1750, y=0.95*y_max, label=TSSscore, + fontface = 2, hjust=0.5) } return(p) @@ -878,6 +1082,7 @@ plotTSS <- function(TSSfile) { #' @param delim A delimiter for the fields splitting a path or string sampleName <- function(path, num_fields=2, delim='_') { name <- basename(tools::file_path_sans_ext(path)) + if(num_fields == 0) {return(name)} for(n in 1:num_fields) name <- gsub(paste0(delim, "[^", delim, "]*$"), "", name) return(paste(dirname(path), name, sep="/")) } @@ -898,11 +1103,12 @@ sampleName <- function(path, num_fields=2, delim='_') { #' data("frag_len") #' data("frag_len_count") #' plotFLD(fragL = "frag_len", fragL_count = "frag_len_count", -#' fragL_txt = "fragLenDistribution_example.txt") +#' fragL_txt = "fragLenDistribution_example.txt", max_fragment=200) #' @export plotFLD <- function(fragL, fragL_count, - fragL_txt="fragLenDistribution.txt") { + fragL_txt="fragLenDistribution.txt", + max_fragment = 200) { if (exists(fragL_count)) { dat <- data.table(get(fragL_count)) @@ -922,28 +1128,29 @@ plotFLD <- function(fragL, quit(save = "no", status = 1, runLast = FALSE) } - dat1 <- dat[dat$V2<=600,] + dat1 <- dat[dat$V2<=max_fragment,] tmp <- seq(1:as.numeric(dat1[1,2]-1)) dat0 <- data.table(V1=rep(0,length(tmp)),V2=tmp) dat2 <- rbind(dat0, dat1) - t1 = theme_classic(base_size=14) + - theme(axis.line = element_line(size = 0.5), - panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - legend.position = "none", - aspect.ratio = 1, - panel.border = element_rect(colour = "black", fill=NA, size=0.5), - plot.title = element_text(hjust = 0.5)) + x_min = which.min(dat1$V1[1:which.max(dat1$V1)]) - p <- ggplot(dat1, aes(x=V2, y=V1)) + + p <- ggplot(dat1[x_min:nrow(dat1),], aes(x=V2, y=V1)) + geom_point(size=1, alpha=0.25) + geom_line(alpha=0.5) + - geom_vline(xintercept = 30, linetype = "longdash", alpha=0.5) + - xlab("Fragment length") + - ylab("Number of reads") + - 
ggtitle("Insert size distribution") + - t1 + annotate("rect", xmin=-Inf, xmax=20, ymin=-Inf, ymax=Inf, + alpha=0.1, fill="#ff001e") + + annotate("text", x=25, y=(max(dat1$V1)/2), + size=theme_get()$text[["size"]]/4, + label="partial degradation", angle=90, col="#858585") + + annotate("rect", xmin=-Inf, xmax=30, ymin=-Inf, ymax=Inf, + alpha=0.1, fill="#ffee00") + + annotate("text", x=7.5, y=(max(dat1$V1)/2), + size=theme_get()$text[["size"]]/4, + label="high degradation", angle=90, col="#858585") + + xlab("fragment length") + + ylab("number of reads") + + theme_PEPPRO() summ <- data.table(Min=min(summary_table$V1), Max=max(summary_table$V1), @@ -1020,6 +1227,120 @@ fancyNumbers <- function(n){ return(textReturn) } + +#' Determine which quantile to use as cutoff. +#' +#' Modified from: GenomicDistributions (Tessa Danehy) +#' +#' @param vec A vector of numbers. +#' @param baseline A minimum quantile cutoff. +#' @keywords cutoff quantiles +#' @examples +#' calcQuantileCutoff() +calcQuantileCutoff = function(vec, baseline=1){ + n <- length(vec) # number of observations + if (n > 1000) {n = 1000} + if (n < 100) {n = 100} + q <- max(baseline, (11 - round(n/100))) # finding quantiles for flanking bins + return(q) +} + + +#' Calculate histogram binning by quantile. +#' +#' Modified from: GenomicDistributions (Tessa Danehy) +#' +#' @param vec A vector of numbers. +#' @param q The quantile to use for cutoffs. +#' @param bins The number of bins to use for a histogram. +#' @param transformed Adjust divisions if transformed +#' @keywords quantiles +#' @examples +#' calcDivisions() +calcDivisions = function(vec, q, bins=NULL, transformed=FALSE){ + q = as.numeric(q) + if(q > 50){ + message("Quantile should not be larger than 50. 
Optimal size is under 10.") + q <- 50 + } + if(!is.null(bins)){ + b <- bins + 1 + } + else { + b <- abs((30-(2*q))/q) # finding the number of bins based on the quantiles + } + quant <- unname(quantile(vec, probs = c((q/100), (1-(q/100))))) + seq_10 <- seq(quant[1], quant[2], length = b) + if (transformed) { + div <- c(-Inf, round(seq_10, 2), Inf) + } else { + div <- c(-Inf, round(seq_10), Inf) + } + + return(div) +} + + +#' Create a character vector of bin labels. +#' +#' From: GenomicDistributions (Tessa Danehy) +#' +#' @param breakPoints A vector of numbers. +#' @param digits The number of digits to round to. +#' @param collapse The character to separate paste values by. +#' @param infBins Whether to use infinite bins. +#' @keywords labels +#' @examples +#' labelCuts() +labelCuts = function(breakPoints, digits=1, collapse="-", infBins=FALSE) { + labels <- + apply(round(cbind(breakPoints[-length(breakPoints)], + breakPoints[-1]),digits), 1, paste0, + collapse=collapse) + + if (infBins) { + labels[1] <- paste0("<", breakPoints[2]) + labels[length(labels)] <- paste0(">", breakPoints[length(breakPoints)-1]) + } + return(labels) +} + + +#' Create a character vector of bin labels. +#' +#' Modified from: GenomicDistributions (Tessa Danehy) +#' +#' @param vec A vector (or list of vectors) of numbers. +#' @param divisions The break points of the distribution +#' @keywords quantiles +#' @examples +#' cutDists() +cutDists = function(vec, divisions = c(-Inf, -1e6, -1e4, -1000, -100, 0, + 100, 1000, 10000, 1e6, Inf)) { + if (is.list(vec)) { + x = lapply(vec, cutDists) + + # To accommodate multiple lists, we'll need to introduce a new 'name' + # column to distinguish them. 
+ nameList = names(vec) + if(is.null(nameList)) { + nameList = 1:length(query) # Fallback to sequential numbers + } + + # Append names + xb = rbindlist(x) + xb$name = rep(nameList, sapply(x, nrow)) + + return(xb) + } + divisions <- unique(divisions) + labels <- labelCuts(signif(divisions, 3), collapse=" to ", infBins=TRUE) + #message(paste0("breaks: ", paste0(divisions, collapse=" "))) + cuts <- cut(vec, divisions, labels) + return(as.data.frame(table(cuts))) +} + + #' Plot the distribution of genic exonRPKM/intronRPKM ratios #' #' This function plots the distribution of by gene exon RPKM divided by @@ -1028,14 +1349,21 @@ fancyNumbers <- function(n){ #' #' @param rpkm A three column TSV format file containing #' "gene", "intron RPKM", "exon RPKM" columns. +#' @param name X-axis label, typically a sample name #' @param raw Plot raw distribution +#' @param type Plot format +#' @param annotate Display mean and median values on plot #' @keywords mRNA contamination #' @export #' @examples #' data("rpkm_ratios") #' mRNAcontamination(rpkm = "rpkm_ratios") #' @export -mRNAcontamination <- function(rpkm, raw=FALSE) { +mRNAcontamination <- function(rpkm, + name='mRNA contamination ratios', + raw=TRUE, + type=c("histogram", "boxplot", "violin"), + annotate=TRUE) { if (exists(rpkm)) { RPKM <- data.table(get(rpkm)) } else if (file.exists(rpkm)) { @@ -1044,36 +1372,162 @@ mRNAcontamination <- function(rpkm, raw=FALSE) { stop(paste0("FileExistsError: ", rpkm, " could not be found.")) quit(save = "no", status = 1, runLast = FALSE) } - colnames(RPKM) <- c("gene","intron","exon") + colnames(RPKM) <- c("chr", "start", "end", "gene","ratio","strand") + + finite_rpkm <- RPKM[is.finite(RPKM$ratio),] + + if (raw) { + div <- calcDivisions(finite_rpkm$ratio, + calcQuantileCutoff(finite_rpkm$ratio, + baseline = 3)) + } else { + div <- calcDivisions(log10(finite_rpkm$ratio), + calcQuantileCutoff(log10(finite_rpkm$ratio), + baseline = 3), + transformed=TRUE) + } - finite_rpkm <- 
RPKM[is.finite(RPKM$exon/RPKM$intron),] + # ensure breaks are not duplicated + div <- unique(div) + quantLabel <- paste(calcQuantileCutoff(finite_rpkm$ratio),"%", sep='') if (raw) { - q <- ggplot(data = finite_rpkm, - aes(x="", y=(exon/intron))) + + if (type == "histogram") { + if (length(div) <= 3) { + base_plot <- ggplot(data = finite_rpkm, aes(x=ratio)) + } else { + # calculate a frequency table with the specified divisions + rpkm_table <- cutDists(finite_rpkm$ratio, divisions = div) + base_plot <- ggplot(data = rpkm_table, aes(x=cuts, y=Freq)) + } + + } else { + base_plot <- ggplot(data = finite_rpkm, aes(x="", y=(ratio))) + } + } else { + if (type == "histogram") { + if (length(div) <= 3) { + base_plot <- ggplot(data = finite_rpkm, aes(x=log10(ratio))) + } else { + # calculate a frequency table with the specified divisions + rpkm_table <- cutDists(log10(finite_rpkm$ratio), + divisions = div) + base_plot <- ggplot(data = rpkm_table, aes(x=cuts, y=Freq)) + } + } else { + base_plot <- ggplot(data = finite_rpkm, aes(x="", y=log10(ratio))) + } + } + + if (type == "histogram") { + if (raw) { + if (length(div) <= 3) { + plot <- base_plot + + geom_histogram(col="black", fill=I("transparent")) + + geom_vline(aes(xintercept=median(ratio)), + color="gray", linetype="dashed", size=1) + + annotate("text", x=median(finite_rpkm$ratio), + y=(ceiling(quantile(finite_rpkm$ratio, 0.25))), + label="median", angle=90, + color="gray", vjust=-0.5) + + geom_vline(aes(xintercept=mean(ratio)), + color="light gray", linetype="dotted", size=1) + + annotate("text", x=mean(finite_rpkm$ratio), + y=(ceiling(quantile(finite_rpkm$ratio, 0.25))), + label="mean", angle=90, + color="light gray", vjust=-0.5) + + labs(x=expression((over(exon[RPKM], intron[RPKM]))~X~Gene), + y="frequency") + + xlim(c(0, ceiling(quantile(finite_rpkm$ratio, 0.90)))) + + theme_PEPPRO() + } else { + plot <- base_plot + + geom_bar(stat="identity", + fill = c("maroon", + rep("gray", (length(div)-3)), + "maroon")) + + 
labs(x=expression((over(exon[RPKM], intron[RPKM]))~X~Gene), + y="frequency") + + geom_text(aes(label= quantLabel), + data=rpkm_table[c(1,length(rpkm_table$Freq)),], + vjust=-1) + } + } else { + if (length(div) <= 3) { + plot = base_plot + + geom_histogram(col="black", fill=I("transparent")) + + geom_vline(aes(xintercept=median(log10(ratio))), + color="gray", linetype="dashed", size=1) + + geom_vline(aes(xintercept=mean(log10(ratio))), + color="light gray", linetype="dotted", size=1) + + labs(x=expression(log[10](over(exon[RPKM], intron[RPKM]))~X~Gene)) + + scale_x_log10(limits = c(0.001, 50), + expand = expand_scale(mult = c(0, 0)), + labels=fancyNumbers, + breaks=prettyLogs) + + annotation_logticks(sides = c("rl")) + } else { + plot <- base_plot + + geom_bar(stat="identity", + fill = c("maroon", + rep("gray", (length(div)-3)), + "maroon")) + + labs(x=expression(log[10](over(exon[RPKM], intron[RPKM]))~X~Gene), + y="frequency") + + geom_text(aes(label= quantLabel), + data=rpkm_table[c(1,length(rpkm_table$Freq)),], + vjust=-1) + } + } + } else if (type == "boxplot") { + if (raw) { + plot = base_plot + + stat_boxplot(geom ='errorbar', width = 0.25) + + geom_boxplot(width = 0.25, + outlier.color='red', + outlier.shape=1) + + stat_summary(fun.y = "mean", geom = "point", + shape = 1, size = 2) + + labs(x=name, + y=expression((over(exon[RPKM], intron[RPKM]))~X~Gene)) + + ylim(c(0, ceiling(quantile(finite_rpkm$ratio, 0.90)))) + } else { + plot <- base_plot + stat_boxplot(geom ='errorbar', width = 0.25) + geom_boxplot(width = 0.25, outlier.color='red', outlier.shape=1) + + stat_summary(fun.data = n_fun, geom = "text", hjust = 0.5) + + stat_summary(fun.y = "mean", geom = "point", + shape = 1, size = 2) + + scale_y_log10(limits = c(0.001, 50), + expand = expand_scale(mult = c(0, 0)), + labels=fancyNumbers, + breaks=prettyLogs) + + annotation_logticks(sides = c("rl")) + + labs(x=name, + y=expression(log[10](over(exon[RPKM], intron[RPKM]))~X~Gene)) + } + } else if (type == 
"violin") { + if (raw) { + plot = base_plot + + #stat_boxplot(geom ='errorbar', width = 0.25) + + geom_violin(width = 0.25, draw_quantiles = c(0.25,0.75), + linetype="dashed") + + geom_violin(width=0.25, fill="transparent", + draw_quantiles = 0.5) + stat_summary(fun.y = "mean", geom = "point", shape = 1, size = 2) + labs(x=name, y=expression((over(exon[RPKM], intron[RPKM]))~X~Gene)) + - ylim(c(0, ceiling(summary(finite_rpkm$exon/finite_rpkm$intron)[5]))) + - theme_classic(base_size=14) + - theme(axis.line = element_line(size = 0.5)) + - theme(panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - aspect.ratio = 1, - panel.border = element_rect(colour = "black", - fill=NA, size=0.5)) - } else { - q <- ggplot(data = finite_rpkm, - aes(x="", y=log10(exon/intron))) + - stat_boxplot(geom ='errorbar', width = 0.25) + - geom_boxplot(width = 0.25, - outlier.color='red', - outlier.shape=1) + + ylim(c(0, ceiling(quantile(finite_rpkm$ratio, 0.90)))) + } else { + plot <- base_plot + + #stat_boxplot(geom ='errorbar', width = 0.25) + + geom_violin(width = 0.25, draw_quantiles = c(0.25,0.75), + linetype="dashed") + + geom_violin(width=0.25, fill="transparent", + draw_quantiles = 0.5) + stat_summary(fun.data = n_fun, geom = "text", hjust = 0.5) + stat_summary(fun.y = "mean", geom = "point", shape = 1, size = 2) + @@ -1083,32 +1537,100 @@ mRNAcontamination <- function(rpkm, raw=FALSE) { breaks=prettyLogs) + annotation_logticks(sides = c("rl")) + labs(x=name, - y=expression(log[10](over(exon[RPKM], intron[RPKM]))~X~Gene)) + - theme_classic(base_size=14) + - theme(axis.line = element_line(size = 0.5)) + - theme(panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - aspect.ratio = 1, - panel.border = element_rect(colour = "black", - fill=NA, size=0.5)) + y=expression(log[10](over(exon[RPKM], intron[RPKM]))~X~Gene)) + } + } else { + # Default to histogram + if (raw) { + if (length(div) <= 3) { + plot <- base_plot + + geom_histogram(col="black", 
fill=I("transparent")) + + geom_vline(aes(xintercept=median(ratio)), + color="gray", linetype="dashed", size=1) + + annotate("text", x=median(finite_rpkm$ratio), + y=(ceiling(quantile(finite_rpkm$ratio, 0.25))), + label="median", angle=90, + color="gray", vjust=-0.5) + + geom_vline(aes(xintercept=mean(ratio)), + color="light gray", linetype="dotted", size=1) + + annotate("text", x=mean(finite_rpkm$ratio), + y=(ceiling(quantile(finite_rpkm$ratio, 0.25))), + label="mean", angle=90, + color="light gray", vjust=-0.5) + + labs(x=expression((over(exon[RPKM], intron[RPKM]))~X~Gene), + y="frequency") + + xlim(c(0, ceiling(quantile(finite_rpkm$ratio, 0.90)))) + + theme_PEPPRO() + } else { + plot <- base_plot + + geom_bar(stat="identity", + fill = c("maroon", + rep("gray", (length(div)-3)), + "maroon")) + + labs(x=expression((over(exon[RPKM], intron[RPKM]))~X~Gene), + y="frequency") + + geom_text(aes(label= quantLabel), + data=rpkm_table[c(1,length(rpkm_table$Freq)),], + vjust=-1) + } + } else { + if (length(div) <= 3) { + plot = base_plot + + geom_histogram(col="black", fill=I("transparent")) + + geom_vline(aes(xintercept=median(log10(ratio))), + color="gray", linetype="dashed", size=1) + + geom_vline(aes(xintercept=mean(log10(ratio))), + color="light gray", linetype="dotted", size=1) + + labs(x=expression(log[10](over(exon[RPKM], intron[RPKM]))~X~Gene)) + + scale_x_log10(limits = c(0.001, 50), + expand = expand_scale(mult = c(0, 0)), + labels=fancyNumbers, + breaks=prettyLogs) + + annotation_logticks(sides = c("rl")) + } else { + plot <- base_plot + + geom_bar(stat="identity", + fill = c("maroon", + rep("gray", (length(div)-3)), + "maroon")) + + labs(x=expression(log[10](over(exon[RPKM], intron[RPKM]))~X~Gene), + y="frequency") + + geom_text(aes(label= quantLabel), + data=rpkm_table[c(1,length(rpkm_table$Freq)),], + vjust=-1) + } + } } - label1 <- c(paste("'median'[log[10]]", ":~", - round(median(log10((finite_rpkm$exon/finite_rpkm$intron))), 2)), - paste("'median'[raw]", 
":", - round(median(finite_rpkm$exon/finite_rpkm$intron), 2))) - - max_y <- layer_scales(q)$y$range$range[2] + label1 <- paste("'median'[log[10]]", ":~", + round(median(log10((finite_rpkm$ratio))), 2)) + label2 <- paste("'median'[raw]", ":", round(median(finite_rpkm$ratio), 2)) - if (raw) { - q <- q + annotate("text", x = 0.5, y = c(max_y, 0.95*max_y), - hjust=0, vjust=1, label = label1, parse=TRUE) + if (type == "histogram") { + max_x <- length(layer_scales(plot)$x$range$range) } else { - q <- q + annotate("text", x = 0.5, y = c(10^max_y, 10^max_y-10), - hjust=0, vjust=1, label = label1, parse=TRUE) + max_x <- suppressMessages( + suppressWarnings(layer_scales(plot)$x$range$range[2])) } + if (is.na(max_x)) {max_x <- Inf} + max_y <- suppressMessages( + suppressWarnings(layer_scales(plot)$y$range$range[2])) + + # TODO: make summary plot of these that IS boxplots + if (annotate) { + q <- plot + annotate("text", x = floor(max_x), y = floor(max_y), + hjust="right", vjust=1.05, + label = label1, parse=TRUE) + + annotate("text", x = floor(max_x), y = floor(max_y), + hjust="right", vjust=2.05, + label = label2, parse=TRUE) + + theme_PEPPRO() + } else { + q <- plot + theme_PEPPRO() + } + - return(p) + return(q) } @@ -1116,13 +1638,19 @@ mRNAcontamination <- function(rpkm, raw=FALSE) { #' #' @param pi A single column containing the ratio of TSS densities/gene body #' densities for the highest scoring TSSs +#' @param name X-axis label, typically a sample name +#' @param type Plot format +#' @param annotate Display mean and median values on plot #' @keywords pause index #' @export #' @examples #' data("pidx") #' plotPI(pi = "pidx") #' @export -plotPI <- function(pi) { +plotPI <- function(pi, name='pause indicies', + type=c("histogram", "boxplot", "violin"), + annotate=TRUE) { + # TODO: make summary plot of these that IS boxplots if (exists(pi)) { PI <- data.table(get(pi)) } else if (file.exists(pi)) { @@ -1131,55 +1659,198 @@ plotPI <- function(pi) { 
stop(paste0("FileExistsError: ", pi, " could not be found.")) quit(save = "no", status = 1, runLast = FALSE) } - colnames(PI) <- c("pi") - - q <- ggplot(data = PI, aes(x="", y=pi)) + - stat_boxplot(geom ='errorbar', width = 0.25) + - geom_boxplot(width = 0.25, - outlier.color='red', - outlier.shape=1) + - stat_summary(fun.y = "mean", geom = "point", - shape = 1, size = 2) + - labs(x=name, y="each gene's pause index") - - if (max(PI$pi) > 500) { - q <- q + scale_y_continuous(breaks = round(seq(min(PI$pi), - max(PI$pi), - by = 50), 0), - limits=c(0, max(PI$pi))) - } else if (max(PI$pi) > 100 & max(PI$pi) < 500) { - q <- q + scale_y_continuous(breaks = round(seq(min(PI$pi), - max(PI$pi), - by = 10), 0), - limits=c(0, max(PI$pi))) + colnames(PI) <- c("chr", "start", "end", "name", "pi", "strand") + + div <- calcDivisions(PI$pi, calcQuantileCutoff(PI$pi, baseline = 3)) + quantLabel <- paste(calcQuantileCutoff(PI$pi, baseline = 3),"%", sep='') + + if (type == "histogram") { + if (length(div) <= 3) { + base_plot <- ggplot(data = PI, aes(x=pi)) + } else { + # calculate a frequency table with the specified divisions + pi_table <- cutDists(PI$pi, divisions = div) + base_plot <- ggplot(data = pi_table, aes(x=cuts, y=Freq)) + } + } else { - q <- q + scale_y_continuous(breaks = round(seq(min(PI$pi), - max(PI$pi), - by = 5), 0), - limits=c(0, max(PI$pi))) + base_plot <- ggplot(data = PI, aes(x="", y=pi)) + } + + if (type == "histogram") { + if (length(div) <= 3) { + q <- base_plot + + geom_histogram(col="black", fill=I("transparent")) + + geom_vline(aes(xintercept=median(PI$pi)), + color="gray", linetype="dashed", size=1) + + annotate("text", x=median(PI$pi), + y=(ceiling(quantile(PI$pi, 0.25))), + label="median", angle=90, + color="gray", vjust=-0.5) + + geom_vline(aes(xintercept=mean(PI$pi)), + color="light gray", linetype="dotted", size=1) + + annotate("text", x=mean(PI$pi), + y=(ceiling(quantile(PI$pi, 0.25))), + label="mean", angle=90, + color="light gray", vjust=-0.5) + + 
labs(x="pause indicies", y="frequency") + + xlim(c(0, ceiling(quantile(PI$pi, 0.90)))) + + theme_PEPPRO() + } else { + q <- base_plot + + geom_bar(stat="identity", + fill = c("maroon", + rep("gray", (length(div)-3)), + "maroon")) + + labs(x="pause indicies", y="frequency") + + geom_text(aes(label= quantLabel), + data=pi_table[c(1,length(pi_table$Freq)),], + vjust=-1) + } + } else if (type == "boxplot") { + plot <- base_plot + + stat_boxplot(geom ='errorbar', width = 0.25) + + geom_boxplot(width = 0.25, + outlier.color='red', + outlier.shape=1) + + stat_summary(fun.y = "mean", geom = "point", + shape = 1, size = 2) + + labs(x=name, y="each gene's pause index") + } else if (type == "violin") { + plot <- base_plot + + geom_violin(width = 0.25, draw_quantiles = c(0.25,0.75), + linetype="dashed") + + geom_violin(width=0.25, fill="transparent", + draw_quantiles = 0.5) + + stat_summary(fun.y = "mean", geom = "point", + shape = 1, size = 2) + + labs(x=name, y="each gene's pause index") + } else { + # default to histogram + if (length(div) <= 3) { + q <- base_plot + + geom_histogram(col="black", fill=I("transparent")) + + geom_vline(aes(xintercept=median(PI$pi)), + color="gray", linetype="dashed", size=1) + + annotate("text", x=median(PI$pi), + y=(ceiling(quantile(PI$pi, 0.25))), + label="median", angle=90, + color="gray", vjust=-0.5) + + geom_vline(aes(xintercept=mean(PI$pi)), + color="light gray", linetype="dotted", size=1) + + annotate("text", x=mean(PI$pi), + y=(ceiling(quantile(PI$pi, 0.25))), + label="mean", angle=90, + color="light gray", vjust=-0.5) + + labs(x="pause indicies", y="frequency") + + xlim(c(0, ceiling(quantile(PI$pi, 0.90)))) + + theme_PEPPRO() + } else { + q <- base_plot + + geom_bar(stat="identity", + fill = c("maroon", + rep("gray", (length(div)-3)), + "maroon")) + + labs(x="pause indicies", y="frequency") + + geom_text(aes(label= quantLabel), + data=pi_table[c(1,length(pi_table$Freq)),], + vjust=-1) + } + } + + if (type != "histogram") { + if 
(max(PI$pi) > 500) { + q <- plot + scale_y_continuous(breaks = round(seq(min(PI$pi), + max(PI$pi), + by = 50), 0), + limits=c(0, max(PI$pi))) + } else if (max(PI$pi) > 100 & max(PI$pi) < 500) { + q <- plot + scale_y_continuous(breaks = round(seq(min(PI$pi), + max(PI$pi), + by = 25), 0), + limits=c(0, max(PI$pi))) + } else { + q <- plot + scale_y_continuous(breaks = round(seq(min(PI$pi), + max(PI$pi), + by = 5), 0), + limits=c(0, max(PI$pi))) + } + q <- q + coord_cartesian(ylim=c(0, ceiling(boxplot(PI$pi)$stats[5]))) + + theme_PEPPRO() + max_x <- suppressMessages( + suppressWarnings(layer_scales(q)$x$range$range[2])) + max_y <- ceiling(boxplot(PI$pi)$stats[5]) + } else { + max_x <- length(layer_scales(q)$x$range$range) + max_y <- suppressMessages( + suppressWarnings(layer_scales(q)$y$range$range[2])) + } + + if (is.na(max_x)) {max_x <- Inf} + + label1 <- paste("'median'", ":", round(median(PI$pi), 2)) + label2 <- paste("'mean'", ":", round(mean(PI$pi), 2)) + if (annotate) { + q <- q + annotate("text", x = floor(max_x), y = floor(max_y), + hjust="right", vjust=1.05, label = label1, parse=TRUE) + + annotate("text", x = floor(max_x), y = floor(max_y), + hjust="right", vjust=2.15, label = label2, parse=TRUE) + + theme_PEPPRO() + } else { + q <- q + theme_PEPPRO() } - q <- q + coord_cartesian(ylim=c(0, ceiling(boxplot(PI$pi)$stats[5]))) + - theme_classic(base_size=14) + - theme(axis.line = element_line(size = 0.5)) + - theme(panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - aspect.ratio = 1, - panel.border = element_rect(colour = "black", - fill=NA, size=0.5)) - max_y <- ceiling(boxplot(PI$pi)$stats[5]) - label1 <- c(paste("'median'", ":", round(median(PI$pi), 2)), - paste("'mean'", ":", round(mean(PI$pi), 2))) - q <- q + annotate("text", x = 0.5, y = c(max_y, 0.95*max_y), - hjust=0, vjust=1, label = label1, parse=TRUE) return(q) } +#' Calculate mode(s) of data +#' +#' From: 
https://stackoverflow.com/questions/2547402/is-there-a-built-in-function-for-finding-the-mode +#' @param x A vector of numbers or characters +#' @param return_multiple Bool to return multiple modes or first in order +#' @param na.rm Bool Remove NAs +#' +#' @keywords mode +mode <- function(x, return_multiple = TRUE, na.rm = FALSE) { + if(na.rm){ + x <- na.omit(x) + } + ux <- unique(x) + freq <- tabulate(match(x, ux)) + mode_loc <- if(return_multiple) which(freq==max(freq)) else which.max(freq) + return(ux[mode_loc]) +} -#' Plot the distribution of adapter insertions + +#' Determine the appropriate abbreviation for large numbers +#' +#' Modified From: https://stackoverflow.com/questions/28159936/formatting-large-currency-or-dollar-values-to-millions-billions +#' @param vec A vector of numbers +#' +#' @keywords abbreviation +getAbbr <- function(vec) { + div <- findInterval(as.numeric(gsub("\\,", "", vec)), c(0, 1e3, 1e6, 1e9, 1e12) ) + return(paste(c("","K","M","B","T")[mode(div)])) +} + + +#' Determine the appropriate dividing factor for large numbers +#' +#' Modified From: https://stackoverflow.com/questions/28159936/formatting-large-currency-or-dollar-values-to-millions-billions +#' @param vec A vector of numbers +#' +#' @keywords abbreviation +getFactor <- function(vec) { + div <- findInterval(as.numeric(gsub("\\,", "", vec)), c(0, 1e3, 1e6, 1e9, 1e12) ) + return(as.numeric(paste(c(1, 1e3, 1e6, 1e9, 1e12)[mode(div)]))) +} + + +#' Plot the cutadapt-based distribution of adapter insertions #' #' @param input A cutadapt report #' @param name A sample name or identifier for the plot title +#' @param umi_len The UMI length #' #' @keywords cutadapt #' @export @@ -1187,7 +1858,9 @@ plotPI <- function(pi) { #' data("cutadapt") #' plotCutadapt(input = "cutadapt") #' @export -plotCutadapt <- function(input, name='cutadapt') { +plotCutadapt <- function(input, name='cutadapt', + umi_len = 0, + count_factor = 1000000) { if (exists(input)) { report <- data.table(get(input)) } 
else if (file.exists(input)) { @@ -1197,26 +1870,158 @@ plotCutadapt <- function(input, name='cutadapt') { quit(save = "no", status = 1, runLast = FALSE) } + if (umi_len > 0) { + report <- report[-(which(report$length == (max(report$length) - umi_len))),] + } + # only keep sizes where the expected count represents less than 1% of # the actual count report <- report[which(report$expect/report$count < 0.01),] + # inverse length to get ascending order + report$length <- max(report$length)-report$length + + # don't include size 0 insertions + report <- report[which(report$length > 0),] + + abbr <- getAbbr(report$count) + if (abbr == '') { + ylabel <- "Number of reads" + } else { + ylabel <- paste0("Number of reads (", abbr, ")") + } + + count_factor <- getFactor(report$count) + + if (20 %in% report$length) { + degraded_upper <- 20 + degraded_lower <- 10 + } else { + degraded_upper <- min(report$length) + 10 + degraded_lower <- max(1, degraded_upper - 10) + } + + if (40 %in% report$length) { + intact_upper <- 40 + intact_lower <- 30 + } else { + intact_upper <- max(report$length) + intact_lower <- max(1, intact_upper - 10) + } + + degradation <- sum(report[which(report$length >= degraded_lower & + report$length <= degraded_upper),]$count) / + max(1, sum(report[which(report$length >= intact_lower & + report$length <= intact_upper),]$count)) + + q <- ggplot(report, aes(x=length, y=count/count_factor)) + + geom_point() + + geom_vline(xintercept = 20, linetype = "dotted", alpha=0.25) + + geom_vline(xintercept = 30, linetype = "longdash", alpha=0.5) + + labs(x="Size of insertion", y=ylabel) + + theme_PEPPRO() + + theme(axis.text.x = element_text(angle = 0, hjust = 0.5)) + q <- q + + annotate("rect", xmin=-Inf, xmax=20, ymin=-Inf, ymax=Inf, + alpha=0.1, fill="#ff001e") + + annotate("text", x=25, y=(max(report$count/count_factor)/2), + size=theme_get()$text[["size"]]/4, + label="partial degradation", angle=90, col="#858585") + + annotate("rect", xmin=-Inf, xmax=30, ymin=-Inf, 
ymax=Inf, + alpha=0.1, fill="#ffee00") + + annotate("text", x=7.5, y=(max(report$count/count_factor)/2), + size=theme_get()$text[["size"]]/4, + label="high degradation", angle=90, col="#858585") + + annotate("text", x=Inf, y=(max(report$count/count_factor)*0.99), + size=theme_get()$text[["size"]]/3, hjust=1.1, + label=paste0("degradation ratio: ", round(degradation, 2))) + + return(q) +} + + +#' Plot the distribution of adapter insertions +#' +#' @param input FLASH histogram output +#' @param name A sample name or identifier for the plot title +#' @param umi_len The UMI length +#' +#' @keywords cutadapt +#' @export +#' @examples +#' data("adapt") +#' plotAdapt(input = "adapt") +#' @export +plotAdapt <- function(input, name='adapt', umi_len = 0) { + if (exists(input)) { + report <- data.table(get(input)) + } else if (file.exists(input)) { + report <- fread(input) + } else { + stop(paste0("FileExistsError: ", input, " could not be found.")) + quit(save = "no", status = 1, runLast = FALSE) + } + + colnames(report) <- c("length", "count") + + if (umi_len > 0) { + report$length <- report$length - umi_len + } + # don't include size 0 insertions - report <- report[-nrow(report),] + report <- report[which(report$length > 0),] + + abbr <- getAbbr(report$count) + if (abbr == '') { + ylabel <- "Number of reads" + } else { + ylabel <- paste0("Number of reads (", abbr, ")") + } + + count_factor <- getFactor(report$count) + + if (20 %in% report$length) { + degraded_upper <- 20 + degraded_lower <- 10 + } else { + degraded_upper <- min(report$length) + 10 + degraded_lower <- max(1, degraded_upper - 10) + } + + if (40 %in% report$length) { + intact_upper <- 40 + intact_lower <- 30 + } else { + intact_upper <- max(report$length) + intact_lower <- max(1, intact_upper - 10) + } + + degradation <- sum(report[which(report$length >= degraded_lower & + report$length <= degraded_upper),]$count) / + max(1, sum(report[which(report$length >= intact_lower & + report$length <= 
intact_upper),]$count)) - q <- ggplot(report, aes(x=max(length)-length, y=count)) + + q <- ggplot(report, aes(x=length, y=count/count_factor)) + geom_point() + geom_vline(xintercept = 20, linetype = "dotted", alpha=0.25) + geom_vline(xintercept = 30, linetype = "longdash", alpha=0.5) + - labs(title=name, x="Size of insertion", y="Number of reads") + - theme_classic(base_size=14) + - theme(axis.line = element_line(size = 0.5), - plot.title = element_text(hjust = 0.5), - panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - aspect.ratio = 1, - panel.border = element_rect(colour = "black", - fill=NA, size=0.5)) + labs(x="Size of insertion", y=ylabel) + + theme_PEPPRO() + + theme(axis.text.x = element_text(angle = 0, hjust = 0.5)) + q <- q + + annotate("rect", xmin=-Inf, xmax=20, ymin=-Inf, ymax=Inf, + alpha=0.1, fill="#ff001e") + + annotate("text", x=25, y=(max(report$count/count_factor)/2), + size=theme_get()$text[["size"]]/4, + label="partial degradation", angle=90, col="#858585") + + annotate("rect", xmin=-Inf, xmax=30, ymin=-Inf, ymax=Inf, + alpha=0.1, fill="#ffee00") + + annotate("text", x=7.5, y=(max(report$count/count_factor)/2), + size=theme_get()$text[["size"]]/4, + label="high degradation", angle=90, col="#858585") + + annotate("text", x=Inf, y=(max(report$count/count_factor)*0.99), + size=theme_get()$text[["size"]]/3, hjust=1.1, + label=paste0("degradation ratio: ", round(degradation, 2))) return(q) } diff --git a/containers/peppro.Dockerfile b/containers/peppro.Dockerfile index 25a344c..cd3582d 100644 --- a/containers/peppro.Dockerfile +++ b/containers/peppro.Dockerfile @@ -115,6 +115,12 @@ RUN wget https://github.com/shenwei356/seqkit/releases/download/v0.10.1/seqkit_l tar -zxvf seqkit_linux_amd64.tar.gz && \ ln -s /home/src/seqkit /usr/bin/ +# Install flash +WORKDIR /home/src/ +RUN wget http://ccb.jhu.edu/software/FLASH/FLASH-1.2.11-Linux-x86_64.tar.gz && \ + tar xvfz FLASH-1.2.11-Linux-x86_64.tar.gz && \ + ln -s 
/home/src/FLASH-1.2.11-Linux-x86_64/flash /usr/bin/ + # Install fastp WORKDIR /home/src/ RUN git clone https://github.com/OpenGene/fastp.git && \ diff --git a/docs/README.md b/docs/README.md index fe63db4..cae84f1 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,22 +1,18 @@ -# PEPPRO +# PEPPRO
[![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io) -`PEPPRO` is a pipeline designed to process PRO-seq data. It is optimized on unique features of PRO-seq to be fast and accurate. It performs adapter removal, including UMI of variable length, read deduplication, trimming, mapping, and signal tracks (bigWig) for plus and minus strands using scaled (based on mappability information) or unscaled read count patterns. +`PEPPRO` is a pipeline for PRO-seq nascent RNA sequencing data. It is optimized on unique features of PRO-seq to be fast and accurate. It performs variable-length UMI adapter removal, read deduplication, trimming, mapping, QC, and signal tracks (bigWig) for plus and minus strands using mappability-scaled or unscaled read counts. ## Outputs -`PEPPRO` produces quality control plots, summary statistics, and several data formats to set the stage for project-specific analysis. +`PEPPRO` produces quality control plots, statistics, and data formats to set the stage for project-specific analysis. We have produced an [interactive display of the output folder structure](browse_output/), which includes: -- PEPPRO produces an easily-navigable HTML report when used with [`Looper`](http://looper.databio.org/en/latest/): View this [HTML Summary report demo](files/examples/tutorial/tutorial_summary.html) -- We have produced an [interactive display of the output folder structure](browse_output/), which includes: - - [Easily parsable summary statistics file](files/examples/tutorial/results_pipeline/tutorial/stats.tsv) - - BigWig signal tracks (plus and minus stranded): - - nucleotide-resolution, exact RNA polymerase position signal - - smoothed signal - - nucleotide-resolution signal corrected for enzymatic sequence bias +- **HTML report**: an easily-navigable HTML report with pretty plots: [HTML summary report demo](files/examples/tutorial/tutorial_summary.html). 
+- **Stats**: An easily parsable stats file: [Summary statistics demo file](files/examples/tutorial/results_pipeline/tutorial/stats.tsv). +- **Processed data**: Several bigWig signal tracks (plus and minus stranded), with options to produce: smoothed signal; exact (nucleotide-resolution) RNA polymerase position signal; or nucleotide-resolution signal corrected for enzymatic sequence bias. ## User interface diff --git a/docs/annotation.md b/docs/annotation.md new file mode 100644 index 0000000..1615a13 --- /dev/null +++ b/docs/annotation.md @@ -0,0 +1,70 @@ +# Custom reference data + +The pipeline uses reference data at various stages, such as for alignment, calculating TSS enrichments, and other QC scores. If you're using a common genome assembly, these resources are pre-built and can be easily downloaded using `refgenie pull`, as described in the setup instructions. If the resources are not available, you'll have to build them. This document outlines how we created the reference data, so you can recreate it if you need to. The easiest way to do this is use `refgenie build`. All you need to do is: + +## 1: Build the fasta asset + +You need a FASTA file for your genome. You can insert this file into refgenie like this: +```console +refgenie build -g GENOME -a fasta --files fasta=/path/to/file.fa +``` + +## 2. Build the bowtie2_index + +To build a bowtie2_index and have it managed by `refgenie` you'll, of course, need [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) already installed. You will also need the requisite FASTA file, which you just added in step 1. +```console +refgenie build -g GENOME -a bowtie2_index +``` + +## 3: Build the ensembl_gtf asset + +The ensembl_gtf asset includes several related assets (*e.g.* pause index gene bodies and TSS's) the pipeline will employ. To build an ensembl_gtf asset, you need an Ensembl GTF file (or equivalent) for your genome. 
You can have refgenie build and manage this file as follows: + +```console +refgenie build -g GENOME -a ensembl-gtf --files ensembl_gtf=/path/to/Homo_sapiens.GRCh38.97.gtf.gz +``` + +## 4: Build the refgene_anno asset + +The refgene_anno asset actually includes several related assets that we'll need (*e.g.* TSS and premature mRNA annotations). To build these, for example for hg38, you will need to [download a refGene annotation](http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/refGene.txt.gz). Build it for any genome like so: + +```console +refgenie build -g GENOME -a refgene_anno --files refgene=/path/to/refGene.txt.gz +``` + +## 5: Build the feat_annotation asset +The `feat_annotation` asset includes feature annotations used to calculate the [FRiF](glossary.md) and [PRiF](glossary.md). `Refgenie` can automatically build this after you have the above assets installed: + +```console +refgenie build -g GENOME -a feat_annotation +``` + +That's it! These assets will be automatically detected by PEPPRO if you build them like this with `refgenie`. + +### Create a custom feature annotation file + +The pipeline will calculate the fraction (and proportion) of reads in genomic features using the feat_annotation asset, but you can also construct this file yourself. + +This annotation file is really just a modified `BED` file, with the chromosomal coordinates and type of feature included. For example, the [downloadable `hg38_annotations.bed.gz` file](http://big.databio.org/peppro/hg38_annotations.bed.gz) looks like so: + +``` +chr1 28200 30001 Promoter . * +chr1 198800 200201 Promoter . * +chr1 778000 780001 Promoter . * +chr1 817400 817601 Promoter . * +chr1 826200 828801 Promoter . * +chr1 904200 905201 Promoter . * +chr1 923800 924601 Promoter . * +chr1 925000 925601 Promoter . * +chr1 941800 942201 Promoter . * +chr1 958400 961401 Promoter . * +``` + +Just like a standard `BED` file, the first three fields are: +1. **chrom** - the name of the chromosome +2.
**chromStart** - the starting position of the feature +3. **chromEnd** - the ending position of the feature + +Column four is the **name** column, in our case the name of our feature of interest. The fifth column is the **score**, which would determine how darkly an item would be displayed in a genome browser if you chose to set that or if the information in your file of interest has ascribed a score to the features. The final, sixth, column is the **strand** column. + +After creating your `BED` file, you can point the pipeline to it using the `--anno-name` option followed with the path to your file. The pipeline will then use that file to determine the fractions of reads that cover those features. diff --git a/docs/browse_output.md b/docs/browse_output.md index 3a4747e..2484d97 100644 --- a/docs/browse_output.md +++ b/docs/browse_output.md @@ -10,6 +10,7 @@ This is an interactive display of exactly what results you'll get as output from * :fa-file-code-o: [plus_frif.html](../files/examples/tutorial/reports/plus_frif.html) * :fa-file-code-o: [mrna_contamination.html](../files/examples/tutorial/reports/mrna_contamination.html) * :fa-file-code-o: [objects.html](../files/examples/tutorial/reports/objects.html) + * :fa-file-code-o: [pause_index.html](../files/examples/tutorial/reports/pause_index.html) * :fa-file-code-o: [samples.html](../files/examples/tutorial/reports/samples.html) * :fa-file-code-o: [status.html](../files/examples/tutorial/reports/status.html) * :fa-file-code-o: [tss_enrichment.html](../files/examples/tutorial/reports/tss_enrichment.html) @@ -24,9 +25,13 @@ This is an interactive display of exactly what results you'll get as output from * :fa-file-o: tutorial_unmap.bam * ???+ danger ":fa-folder-open-o: fastq/" * ??? 
danger ":fa-folder-open-o: fastqc/" - * [:fa-file-code-o: tutorial_rmAdapter.html](../files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_rmAdapter.html) - * [:fa-file-code-o: tutorial_rmAdapter.json](../files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_rmAdapter.json) - * [:fa-file-text-o: tutorial_rmAdapter.txt](../files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_rmAdapter.txt) + * [:fa-file-code-o: tutorial_R1_rmAdapter.html](../files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.html) + * [:fa-file-code-o: tutorial_R1_rmAdapter.json](../files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.json) + * [:fa-file-text-o: tutorial_R1_rmAdapter.txt](../files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.txt) + * [:fa-file-code-o: tutorial_R2_rmAdapter.html](../files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R2_rmAdapter.html) + * [:fa-file-code-o: tutorial_R2_rmAdapter.json](../files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R2_rmAdapter.json) + * [:fa-file-text-o: tutorial_R2_rmAdapter.txt](../files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R2_rmAdapter.txt) + * [:fa-file-text-o: tutorial_R1_processed_fastqc.html](../files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_processed_fastqc.html) * ??? danger ":fa-folder-open-o: prealignments/" * :fa-file-text-o: tutorial_rCRSd_3k_bt_aln_summary.log * :fa-file-archive-o: tutorial_rCRSd_3k_unmap_R1.fq.gz @@ -76,7 +81,7 @@ This is an interactive display of exactly what results you'll get as output from * :fa-file-code-o: tutorial.yaml * :fa-file-text-o: peppro.py_tutorial.log * ??? 
danger ":fa-folder-open-o: summary/" - * :fa-file-pdf-o: [tutorial_library_complexity.pdf](../files/examples/tutorial/summary/tutorial_library_complexity.pdf) - * :fa-file-image-o: [tutorial_library_complexity.png](../files/examples/tutorial/summary/tutorial_library_complexity.png) + * :fa-file-pdf-o: tutorial_libComplexity.pdf + * :fa-file-image-o: tutorial_liComplexity.png * :fa-file-text-o: [tutorial_stats_summary.tsv](../files/examples/tutorial/tutorial_stats_summary.tsv) * :fa-file-code-o: [tutorial_summary.html](../files/examples/tutorial/tutorial_summary.html) diff --git a/docs/changelog.md b/docs/changelog.md index 77d986c..472326b 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,27 @@ # Change log All notable changes to this project will be documented in this file. +## [0.8.6] -- 2020-01-28 + +### Changed +- Update FRiF calculation to optionally follow a priority ranked method +- Update how adapter insertion distributions are plotted to be the same for SE or PE data +- Make cutadapt the default for adapter removal +- Streamline the use of Refgenie assets + - Refgenie manages pause indices + - Refgenie manages feature annotations + - Refgenie manages assets for mRNA contamination + - Refgenie manages seqOutBias required suffixerator indices +- Change pause index and mRNA contamination plots to histograms + +### Added +- Add PRiF plot +- Require FLASH tool +- Produce sample level gene counts file as output +- Generate project level counts table including all samples X gene counts +- Report degradation metric for library quality +- Add BiocProject integration + ## [0.8.1] -- 2019-07-15 ### Changed diff --git a/docs/cluster.md b/docs/cluster.md new file mode 100644 index 0000000..49518b0 --- /dev/null +++ b/docs/cluster.md @@ -0,0 +1,40 @@ +# Running on a cluster + +## Default computing options + +When you run your PEPPRO project using `looper run`, by default it will simply run each sample locally.
You can change that using `looper run --compute PACKAGE`, where PACKAGE is an option described below. This enables you to adjust your computing preferences on-the-fly. You have several built-in packages, which you can view by typing `divvy list`. Default packages include: + +- `--compute slurm`. Submit the jobs to a SLURM cluster using `sbatch`. +- `--compute sge`. Submit the jobs to an SGE cluster using `qsub`. +- `--compute docker`. Submit the jobs locally using the `databio/peppro` docker image. +- `--compute singularity`. Submit the jobs locally using the singularity image. +- `--compute singularity_slurm`. Submit jobs using `sbatch`, but run them using the singularity image. + +To show how this works, let's run the example project using the `slurm` compute package. Use `-d` for a dry run, which creates the submit scripts but does not run them: + +```console +cd peppro +looper run examples/meta/peppro_test.yaml -d \ + --compute slurm +``` + +This will produce a job script: + +```console +cat peppro_test/submission/peppro_test.sub +``` + +If all looks well, run looper without `-d` to actually submit the jobs. To use the docker or singularity options, see [running PEPPRO in containers](container.md). + +## Customizing compute options + +These default computing options may not fit your needs exactly. PEPPRO allows you to very easily change templates or add your own, so you can run PEPPRO in any possible computing environment. PEPPRO uses a standardized computing configuration called [divvy](https://divvy.databio.org). The instructions for changing these computing configuration options are universal for any software that relies on `divvy`. + +To customize your compute packages, you first create a `divvy` computing configuration file and point an environment variable (`DIVCFG`) to that file: + +```console +export DIVCFG="divvy_config.yaml" +divvy init -c $DIVCFG +``` + +Next, you edit that config file to add in any compute packages you need.
PEPPRO will then give you access to any of your custom packages with `looper run --compute PACKAGE`. For complete instructions on how to create a custom compute package, read [how to configure divvy](https://divvy.databio.org/en/latest/configuration/). diff --git a/docs/container.md b/docs/container.md new file mode 100644 index 0000000..e422cbd --- /dev/null +++ b/docs/container.md @@ -0,0 +1,61 @@ +# Running in a container + +We have produced both docker and singularity containers that hold all the necessary software for `PEPPRO`. If your containers are set up correctly, then you won't need to install any additional software. It's easy to run your jobs in a container by configuring `looper` to use a container-compatible template. Follow the instructions below for either `docker` or `singularity` as you wish: + +## Run PEPPRO using docker + +You can pull the docker [databio/peppro image](https://hub.docker.com/r/databio/peppro/) from dockerhub like this: + +``` +docker pull databio/peppro +``` + +Or build the image using the included Dockerfile (you can use a recipe in the included Makefile): +``` +cd peppro/ +make docker +``` + +Next, just add `--compute docker` to your `looper run` command: + +``` +cd peppro +looper run examples/meta/peppro_test.yaml --compute docker +``` + +## Run PEPPRO using singularity + +You can [download the singularity image](http://big.databio.org/simages/peppro) or build it from the docker image using the Makefile: +``` +cd peppro/ +make singularity +``` + +Now you'll need to tell the pipeline where you saved the singularity image. By default PEPPRO expects you to put your singularity image in a folder referred to with an environment variable called `$SIMAGES`: + +``` +export SIMAGES=path/to/singularity/folder/ +``` + +You could also tweak the `pipeline_interface.yaml` file so that the `compute.singularity_image` attribute is pointing to the right location on disk.
Run it like this: + +``` +cd peppro +looper run examples/meta/peppro_test.yaml \ + --compute singularity +``` + +## Run PEPPRO using singularity with SLURM + +``` +cd peppro +looper run examples/meta/peppro_test.yaml \ + --compute singularity_slurm +``` + +## More details on containers + +You may need to adjust the built-in templates to fit with how you run docker or singularity in your environment. For example, you'll need to make sure to mount any filesystems you need. This is fairly easy: you just need to tweak the default templates to fit your environment. Here are some resources to get you started: + +- [Divvy documentation on container templates](http://divvy.databio.org/en/latest/containers/) +- Looper documentation: detailed instructions for [how to run pipelines in containers](http://looper.databio.org/en/latest/containers/). diff --git a/docs/howto/use_container.md b/docs/container_reference.md similarity index 55% rename from docs/howto/use_container.md rename to docs/container_reference.md index d752a08..e53b95b 100644 --- a/docs/howto/use_container.md +++ b/docs/container_reference.md @@ -1,34 +1,6 @@ -# Run PEPPRO in a container +# Running individual samples in a container -We have produced both docker and singularity containers that hold all the necessary software for `PEPPRO`. You can run `PEPPRO` as an individual pipeline on a single sample using these containers by directly calling `docker run` or `singularity exec`. Or, you can rely on `looper`, which is already set up to run any pipeline in existing containers using the `divvy` templating system. Instructions for both follow: - -First, make sure your environment is set up to run either docker or singularity containers.
Then, pull the container image: - -**Docker**: You can pull the docker [databio/peppro image](https://hub.docker.com/r/databio/peppro/) from dockerhub like this: - -``` -docker pull databio/peppro -``` - -Or build the image using the included Dockerfile (you can use a recipe in the included Makefile): -``` -cd peppro/ -make docker -``` - -**Singularity**: You can [download the singularity image](http://big.databio.org/simages/peppro) or build it from the docker image using the Makefile: -``` -cd peppro/ -make singularity -``` - -Now you'll need to tell the pipeline where you saved the singularity image. You can either create an environment variable called `$SIMAGES` that points to the folder where your image is stored, or you can tweak the `pipeline_interface.yaml` file so that the `compute.singularity_image` attribute is pointing to the right location on disk. - -If your containers are set up correctly, then won't need to install any additional software. - -## Running individual samples in a container - -Individual jobs can be run in a container by simply running the `peppro.py` command through `docker run` or `singularity exec`. You can run containers either on your local computer, or in an HPC environment, as long as you have `docker` or `singularity` installed. For example, run it locally in singularity like this: +Individual jobs can be run in a container by simply running the `peppro.py` command through `docker run` or `singularity exec`. For example, run it locally in singularity like this: ``` singularity exec --bind $REFGENIE $SIMAGES/peppro pipelines/peppro.py --help ``` @@ -42,7 +14,7 @@ Be sure to mount the volumes you need with `--volume`. If you're utilizing any e ### Container details #### Using `docker` -The pipeline has been successfully run in both a Linux and MacOS environment. 
With `docker` you need to bind mount your volume that contains the pipeline and your `$REFGENIE` location, as well as provide the container the same environment variables your host environment is using. +With `docker` you need to bind mount your volume that contains the pipeline and your `$REFGENIE` location, as well as provide the container the same environment variables your host environment is using. In the first example, we're mounting our home user directory (`/home/jps3ag/`) which contains the parent directories to our `$REFGENIE` folder and to the pipeline itself. We'll also provide the pipeline two environment variables, `$REFGENIE` and `$HOME`. @@ -106,6 +78,6 @@ Third, close your instance when finished. singularity instance.stop peppro_instance ``` -## Running multiple samples in a container with looper -To run multiple samples in a container, you simply need to configure `looper` to use a container-compatible template. The looper documentation has detailed instructions for [how to run pipelines in containers](http://code.databio.org/looper/containers/). \ No newline at end of file + + diff --git a/docs/howto/detailed_install.md b/docs/detailed_install.md similarity index 56% rename from docs/howto/detailed_install.md rename to docs/detailed_install.md index 6cad101..1f33757 100644 --- a/docs/howto/detailed_install.md +++ b/docs/detailed_install.md @@ -2,47 +2,12 @@ This guide walks you through the minutiae of how to install each prerequisite component. We'll presume you're installing this in a Linux environment. If not the case, you'll need to go to each tool's respective site to find alternative installation approaches and options. -## 1: Clone the `PEPPRO` pipeline +## Install required software -To begin, we need to get the `PEPPRO` pipeline itself. The pipeline is hosted on [github](https://github.com/databio/peppro). 
If you don't have git installed, follow the [git installation instructions](https://git-scm.com/download/linux), and here is a [brief introduction to git](https://guides.github.com/introduction/git-handbook/). To install `PEPPRO`, you can use one of the following methods: - -* using SSH: -``` -git clone git@github.com:databio/peppro.git -``` -* using HTTPS: -``` -git clone https://github.com/databio/peppro.git -``` - -We'll use HTTPS in this example. From an open terminal, let's first create a directory we'll use to run through this guide: -```console -mkdir peppro_tutorial -``` - -Let's move into our newly created directory and create a few more folders that we'll use later. -```console -cd peppro_tutorial/ -mkdir data -mkdir genomes -mkdir processed -mkdir templates -mkdir tools -cd tools/ -``` - -Time to get PEPPRO! -``` -git clone https://github.com/databio/peppro.git -``` -Success! If you had any issues, feel free to [reach out to us with questions](https://github.com/databio/peppro/issues). Otherwise, let's move on to installing additional software. - -## 2: Install required software - -You have two options for installing the software prerequisites: 1) use a container, in which case you need only either `docker` or `singularity`; or 2) install all prerequisites natively. We'll install everything natively in this guide. +You have two options for installing the software prerequisites: 1) use a container, in which case you need only either `docker` or `singularity`; or 2) install all prerequisites natively. We'll install everything natively in this guide. If you want to try the container approach, read [PEPPRO in containers](container.md). To use `PEPPRO`, we need the following software: -**Python packages**. 
The pipeline uses [`pypiper`](http://pypiper.readthedocs.io/en/latest/) to run a single sample, [`looper`](http://looper.readthedocs.io/en/latest/) to handle multi-sample projects (for either local or cluster computation), [`pararead`](https://github.com/databio/pararead) for parallel processing sequence reads, [`refgenie`](http://refgenie.databio.org/en/latest/) to organize and build reference assemblies, [`cutadapt`](https://cutadapt.readthedocs.io/) to remove adapters for single-end data or optionally for paired-end, and the common `python` libraries [`numpy`](https://www.numpy.org/) and [`pandas`](https://pandas.pydata.org/). You can do a user-specific install using the included requirements.txt file in the pipeline directory: +**Python packages**. The pipeline uses [`pypiper`](http://pypiper.readthedocs.io/en/latest/) to run a single sample, [`looper`](http://looper.readthedocs.io/en/latest/) to handle multi-sample projects (for either local or cluster computation), [`pararead`](https://github.com/databio/pararead) for parallel processing of sequence reads, [`refgenie`](http://refgenie.databio.org/en/latest/) to organize, build, and manage reference genome assets, [`cutadapt`](https://cutadapt.readthedocs.io/) to remove adapters, and the common `python` libraries [`numpy`](https://www.numpy.org/) and [`pandas`](https://pandas.pydata.org/). 
You can do a user-specific install using the included requirements.txt file in the pipeline directory: ```console pip install --user -r requirements.txt ``` @@ -56,16 +21,16 @@ The following tools are used by the pipeline: * [bedtools (v2.25.0+)](http://bedtools.readthedocs.io/en/latest/) * [bowtie2 (v2.2.9+)](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) +* [fastq-pair](https://github.com/linsalrob/fastq-pair.git) +* [flash](https://ccb.jhu.edu/software/FLASH/) +* [preseq](http://smithlabresearch.org/software/preseq/) +* [picard](https://broadinstitute.github.io/picard/) * [samtools (v1.7)](http://www.htslib.org/) * [seqkit](https://bioinf.shenwei.me/seqkit/) -* [fastp](https://github.com/OpenGene/fastp) * [seqtk](https://github.com/lh3/seqtk) -* [preseq](http://smithlabresearch.org/software/preseq/) -* [fastq-pair](https://github.com/linsalrob/fastq-pair.git) -* [picard](http://broadinstitute.github.io/picard/) -* UCSC tools (v3.5.1) - * [wigToBigWig (v4)](https://www.encodeproject.org/software/wigtobigwig/) +* Two specific UCSC tools (v3.5.1) * [bigWigCat (v4)](http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/) + * [wigToBigWig (v4)](https://www.encodeproject.org/software/wigtobigwig/) #### bedtools We'll install each of these pieces of software before moving forward. Let's start right at the beginning and install `bedtools`. We're going to install from source, but if you would prefer to install from a package manager, you can follow the instructions in the [bedtools' installation guide](http://bedtools.readthedocs.io/en/latest/content/installation.html). @@ -92,6 +57,7 @@ unzip bowtie2-2.3.4.1-source.zip rm bowtie2-2.3.4.1-source.zip cd bowtie2-2.3.4.1 make +cd ../ ``` Again, let's add `bowtie2` to our `PATH` environment variable: ``` @@ -99,54 +65,46 @@ export PATH="$PATH:/path/to/peppro_tutorial/tools/bowtie2-2.3.4.1/" ``` Great! On to the next one. -#### samtools -Next up, `samtools`. 
-```console -wget https://sourceforge.net/projects/samtools../files/samtools/1.9/samtools-1.9.tar.bz2 -tar xvfj samtools-1.9.tar.bz2 -rm samtools-1.9.tar.bz2 -cd samtools-1.9 -./configure -``` -Alternatively, if you do not have the ability to install `samtools` to the default location, you can specify using the `--prefix=/install/destination/dir/` option. [Learn more about the `--prefix` option here](http://samtools.github.io/bcftools/howtos/install.html). +#### fastq_pair +Because PRO-seq treats read1 differently than read2 in paired-end data, we need to resync paired-end files after processing. We [use `fastq_pair`](https://github.com/linsalrob/fastq-pair/blob/master/INSTALLATION.md) to do so efficiently. ```console +git clone https://github.com/linsalrob/fastq-pair.git +cd fastq-pair/ +mkdir build +cd build/ +cmake3 .. make make install -``` -As for our other tools, add `samtools` to our `PATH` environment variable: -``` -export PATH="$PATH:/path/to/peppro_tutorial/tools/samtools-1.9/" +cd ../../ ``` -#### seqkit -Let's grab `seqkit` now. Check out [the author's installation guide](https://github.com/shenwei356/seqkit#installation) for more instruction if necessary. +#### flash + +To obtain a plot to evaluate library quality when we have paired-end reads, we use FLASH to generate a distribution of reads. ```console -cd ../ -wget https://github.com/shenwei356/seqkit/releases/download/v0.10.1/seqkit_linux_amd64.tar.gz -tar -zxvf seqkit_linux_amd64.tar.gz +wget http://ccb.jhu.edu/software/FLASH/FLASH-1.2.11-Linux-x86_64.tar.gz +tar xvfz FLASH-1.2.11-Linux-x86_64.tar.gz ``` -And then make sure that executable is in our `PATH`. -```console -export PATH="$PATH:/path/to/peppro_tutorial/tools/" + +And let's add `FLASH` to our `PATH` environment variable: +``` +export PATH="$PATH:/path/to/peppro_tutorial/tools/FLASH-1.2.11-Linux-x86_64/" ``` -#### fastp -Next on our list is `fastp`. 
Check out their [install instructions](https://github.com/OpenGene/fastp#get-fastp) if necessary. +#### picard +`PEPPRO` is built using `PyPiper` and relies upon the `PyPiper NGSTK` tool kit which itself employs `Picard`. [Read the `picard` installation guide](http://broadinstitute.github.io/picard/) for more assistance. ```console -git clone https://github.com/OpenGene/fastp.git -cd fastp/ -make -make install +wget https://github.com/broadinstitute/picard/releases/download/2.20.3/picard.jar +chmod +x picard.jar ``` -Add to `PATH`! -```console -export PATH="$PATH:/path/to/peppro_tutorial/tools/fastp/" +Create an environmental variable pointing to the `picard.jar` file called `$PICARD`. Alternatively, [update the `peppro.yaml` file](https://github.com/databio/peppro/blob/master/pipelines/peppro.yaml) with the full PATH to the `picard.jar` file. +``` +export PICARD="/path/to/peppro_tutorial/tools/picard.jar" ``` #### preseq The pipeline uses `preseq` to calculate library complexity. Check out the author's [page for more instruction](https://github.com/smithlabcode/preseq). ```console -cd ../ wget http://smithlabresearch.org/downloads/preseq_linux_v2.0.tar.bz2 tar xvfj preseq_linux_v2.0.tar.bz2 ``` @@ -155,28 +113,35 @@ Add to `PATH`! export PATH="$PATH:/path/to/peppro_tutorial/tools/preseq_v2.0/" ``` -#### fastq_pair -Finally, because PRO-seq treats read1 differently than read2 in paired-end data, we need to resync paired-end files after processing. We [use `fastq_pair`](https://github.com/linsalrob/fastq-pair/blob/master/INSTALLATION.md) to do so efficiently. +#### samtools +Next up, `samtools`. +```console +wget https://github.com/samtools/samtools/releases/download/1.10/samtools-1.10.tar.bz2 +tar xvfj samtools-1.10.tar.bz2 +rm samtools-1.10.tar.bz2 +cd samtools-1.10/ +./configure +``` +Alternatively, if you do not have the ability to install `samtools` to the default location, you can specify using the `--prefix=/install/destination/dir/` option. 
[Learn more about the `--prefix` option here](http://samtools.github.io/bcftools/howtos/install.html). ```console -git clone https://github.com/linsalrob/fastq-pair.git -cd fastq-pair/ -mkdir build -cd build/ -cmake3 .. make make install ``` +As for our other tools, add `samtools` to our `PATH` environment variable: +``` +export PATH="$PATH:/path/to/peppro_tutorial/tools/samtools-1.10/" +``` -#### picard -`PEPPRO` is built using `PyPiper` and relies upon the `PyPiper NGSTK` tool kit which itself employs `Picard`. [Read the `picard` installation guide](http://broadinstitute.github.io/picard/) for more assistance. +#### seqkit +Let's grab `seqkit` now. Check out [the author's installation guide](https://github.com/shenwei356/seqkit#installation) for more instruction if necessary. ```console -cd ../../ -wget https://github.com/broadinstitute/picard/releases/download/2.20.3/picard.jar -chmod +x picard.jar -``` -Create an environmental variable pointing to the `picard.jar` file called `$PICARD`. Alternatively, [update the `peppro.yaml` file](https://github.com/databio/peppro/blob/master/pipelines/peppro.yaml) with the full PATH to the `picard.jar` file. +cd ../ +wget https://github.com/shenwei356/seqkit/releases/download/v0.10.1/seqkit_linux_amd64.tar.gz +tar -zxvf seqkit_linux_amd64.tar.gz ``` -export PICARD="/path/to/peppro_tutorial/tools/picard.jar" +And then make sure that executable is in our `PATH`. +```console +export PATH="$PATH:/path/to/peppro_tutorial/tools/" ``` #### UCSC utilities @@ -191,13 +156,14 @@ Add our `tools/` directory to our `PATH` environment variable. ``` export PATH="$PATH:/path/to/peppro_tutorial/tools/" ``` -That should do it! Now we'll [install some **optional** packages](../tutorial.md#install-optional-software). Of course, these are not required, but for the purposes of this tutorial we're going to be completionists. +That should do it! Now we'll install some **optional** packages. 
Of course, these are not required, but for the purposes of this tutorial we're going to be completionists. ### Optional software `PEPPRO` uses `R` to generate quality control plots. These are **optional** and the pipeline will run without them, but you would not get any QC plots. If you need to don't have [R installed, you can follow these instructions](https://cran.r-project.org/doc/manuals/r-release/R-admin.html). We'll use and install the necessary packages in this example. Here is the list of required packages: - [data.table (v1.11.2)](https://cran.r-project.org/package=data.table) + - [devtools](https://cran.r-project.org/web/packages/devtools/index.html) - [GenomicDistributions (v0.5)](http://code.databio.org/GenomicDistributions/index.html) - [ggplot2 (v2.2.1)](https://cran.r-project.org/package=ggplot2) - [pepr (v0.2.1)](http://code.databio.org/pepr/) @@ -207,6 +173,7 @@ To install the needed packages, enter the following command in the pipeline fold ``` Rscript -e 'install.packages("PEPPROr", repos=NULL, type="source")' ``` + To extract files quicker, `PEPPRO` can also utilize `pigz` in place of `gzip` if you have it installed. Let's go ahead and do that now. It's not required, but it can help speed everything up when you have many samples to process. ``` cd /path/to/peppro_tutorial/tools/ @@ -220,58 +187,27 @@ Don't forget to add this to your `PATH` too! ``` export PATH="$PATH:/path/to/peppro_tutorial/tools/pigz-2.4/" ``` -That's it! Everything we need to run `PEPPRO` to its full potential should be installed. If you are interested and have experience using containers, you can check out the [alternate installation methods](../install.md#121-use-containers). - -### Create environment variables - -We also need to create some environment variables to help point `looper` to where we keep our data files and our tools. 
You may either set the environment variables up, like we're going to do now, or you may simply hard code the necessary locations in our configuration files. -First, let's create a `PROCESSED` variable that represents the location where we want to save output. -``` -export PROCESSED="/path/to/peppro_tutorial/processed/" -``` -Second, we'll create a variable representing the root path to all our tools named `CODEBASE`. -``` -export CODEBASE="/path/to/peppro_tutorial/tools/" -``` -(Add these environment variables to your `.bashrc` or `.profile` so you don't have to always do this step). -Fantastic! Now that we have the pipeline and its requirements installed, we're ready to get our reference genome(s). - - -## 3: Download a reference genome -Before we analyze anything, we also need a reference genome. `PEPPRO` uses `refgenie` genomes. For the purposes of this tutorial, we'll just download pre-built genomes. Follow the `'refgenie` instructions if you'd like to [build your own reference genome](https://github.com/databio/refgenie). First, let's change into our `genomes/` folder. -``` -cd /path/to/peppro_tutorial/genomes/ -wget http://big.databio.org/refgenomes/hg38.tgz -wget http://cloud.databio.org.s3.amazonaws.com/refgenomes/human_repeats_170502.tgz -wget http://cloud.databio.org.s3.amazonaws.com/refgenomes/rCRSd_170502.tgz -tar xvfz hg38.tgz -tar xvfz human_repeats_170502.tgz -tar xvfz rCRSd_170502.tgz -rm hg38.tgz -rm human_repeats_170502.tgz -rm rCRSd_170502.tgz -``` +## Download `refgenie` assets -## 4: Point the pipeline to your Refgenie assemblies +PEPPRO uses [`refgenie`](http://refgenie.databio.org/) assets for alignment, quality control reports, and some outputs. You can initialize a refgenie config file like this: -Let's also create another environment variable that points to our genomes. 
-``` -export GENOMES="/path/to/peppro_tutorial/genomes/ +```console +export REFGENIE=your_genome_folder/genome_config.yaml +refgenie init -c $REFGENIE ``` -(Don't forget to add this to your `.bashrc` or `.profile` to ensure it persists). +Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. -## 5: Download or create annotation files +Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). Download these required assets with this command: -To calculate TSS enrichments, you will need a [TSS annotation file](http://big.databio.org/refgenomes/) in your reference genome directory. If a pre-built version for your genome of interest isn't present, you can quickly create that file yourself. In the reference genome directory, you can perform the following commands for in this example, `hg38`: -``` -wget -O hg38_TSS_full.txt.gz http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/refGene.txt.gz \ -zcat hg38_TSS_full.txt.gz | \ - awk '{if($4=="+"){print $3"\t"$5"\t"$5"\t"$4"\t"$13}else{print $3"\t"$6"\t"$6"\t"$4"\t"$13}}' | \ - LC_COLLATE=C sort -k1,1 -k2,2n -u > hg38_TSS.tsv +```console +refgenie pull -g hg38 -a bowtie2_index ensembl_gtf ensembl_rb refgene_anno feat_annotation ``` +PEPPRO also requires `bowtie2_index` for any pre-alignment genomes: -We also have [downloadable pre-built genome annotation files](http://big.databio.org/peppro/) for `hg38`, `hg19`, `mm10`, and `mm9` that you can use to annotate the reads and peaks. These files annotate 3' and 5' UTR, Exonic, Intronic, Intergenic, Promoter, and Promoter Flanking Regions of the corresponding genome as indicated in Ensembl or UCSC. Simply move the corresponding genome annotation file into the `peppro/anno` folder. 
Once present in the `peppro/anno` folder you don't need to do anything else as the pipeline will look there automatically. Alternatively, you can use the `--anno-name` pipeline option to directly point to this file when running. You can also [learn how to create a custom annotation file](annotation_files.md) to calculate coverage using your own features of interest. +```console +refgenie pull -g human_rDNA -a bowtie2_index +``` -Alright! Time to setup the pipeline configuration files and run our sample. +That's it! Everything we need to run `PEPPRO` to its full potential should be installed. \ No newline at end of file diff --git a/docs/faq.md b/docs/faq.md new file mode 100644 index 0000000..f2ed598 --- /dev/null +++ b/docs/faq.md @@ -0,0 +1,9 @@ +# FAQ + +## Do I have to use PEPPRO with looper? + +No. `PEPPRO` by itself does not specify any cluster resources, so you could just roll your own and submit individual jobs to a cluster however you choose. But because `PEPPRO` is already `looper`-compatible, the easier way is to use `looper's` built-in template system, which `looper` uses to build flexible shell scripts for job submission. These templates can be used to run jobs in a container, to submit to a cluster resource manager, or both. + +## Will PEPPRO run on a mac? + +The pipeline has been successfully run in both a Linux and MacOS environment. diff --git a/docs/features.md b/docs/features.md index 00048c2..4e3fdc8 100644 --- a/docs/features.md +++ b/docs/features.md @@ -1,16 +1,16 @@ -# PEPPRO features at-a-glance +# PEPPRO features at-a-glance Here are a few of the highlights that make `PEPPRO` valuable. -- **Scalability.** Run the pipeline easily on a project with a single sample or a thousand. This pipeline is compatible with [`looper`](https://github.com/pepkit/looper), so it can run locally, in a cloud container engine, or with any cluster resource manager (e.g. SLURM, SGE, or LFS). 
-- **Restartability.** The pipeline is built using [`pypiper`](https://github.com/databio/pypiper), so it automatically picks up where it left off in case of preemption or crash. +- **Scalability.** Run the pipeline easily on a project with a single sample or a thousand. This pipeline is compatible with [`looper`](https://looper.databio.org), so it can run locally, in a cloud container engine, or with any cluster resource manager (e.g. SLURM, SGE, or LSF). +- **Restartability.** The pipeline is built using [`pypiper`](https://pypiper.databio.org), so it automatically picks up where it left off in case of preemption or crash. - **Copious logging.** The pipeline produces a detailed log file recording all output from every command run, and also records the time and memory use of every process, the version of the pipeline and other software, and other useful run information. - **Flexibility.** The pipeline provides options for multiple peak callers, multiple adapter trimmers, and fully configurable parameterization for many underlying tools. - **Portability.** Run it using `docker` or `singularity` with no other prerequisites, or it can be run natively without containers. The choice is yours. - **Standardized user interface.** The pipeline reads sample metadata formatted in [standard PEP format](http://pepkit.github.io/), so you can use the same sample annotation sheets for your downstream R or python analysis using tools from [pepkit](http://pepkit.github.io/). -- **Standardized reference genome assembly.** The pipeline uses standard reference genome assemblies produced by [`refgenie`](http://github.com/databio/refgenie)], which provides a scripted way to produce a compatible reference assembly for any custom genome. For common genomes, you can either download pre-indexed assemblies or build your own. 
+- **Standardized reference genome assembly.** The pipeline uses standard reference genome assemblies produced by [refgenie](http://refgenie.databio.org), which provides a scripted way to produce a compatible reference assembly for any custom genome. For common genomes, you can either download pre-indexed assemblies or build your own. - **Prealignments.** The pipeline can (optionally) first align to any number of reference assemblies separately before the primary genome alignment. This increases both speed and accuracy and can be used, for example, to align sequentially to mtDNA, repeats, or spike-ins. - **Fraction of reads in pre-mature mRNA and features (FRiP/FRiF).** By default, the pipeline will calculate the FRiP using annotated pre-mature mRNA. The pipeline will also calculate the fraction of reads in known annotated features if using a common reference genome and may be customized to use any feature set. - **TSS enrichments, Fragment length distributions and more.** The pipeline produces various nice QC plots. -- **Beautiful `HTML` reports.** Your results include an easy-to-navigate `HTML` report with a sample table, job status, summary statistics, and QC plots at your fingertips when run using our [pipeline submission engine, `Looper`](https://looper.readthedocs.io/en/latest/index.html). +- **Beautiful `HTML` reports.** Your results include an easy-to-navigate `HTML` report with a sample table, job status, summary statistics, and QC plots at your fingertips when run using our [pipeline submission engine, looper](https://looper.databio.org). diff --git a/docs/files/examples/tutorial/reports/fastp_report.html b/docs/files/examples/tutorial/reports/fastp_report.html new file mode 100644 index 0000000..517e703 --- /dev/null +++ b/docs/files/examples/tutorial/reports/fastp_report.html @@ -0,0 +1,257 @@ + + + + + + + + + + + + + + + + + + + Looper: FastP_report objects + +
+ + + +
+
+
+ +

FastP_report

+ + + +
Links
+
+ + tutorial + +
+ + +
+
+ +
+ + \ No newline at end of file diff --git a/docs/files/examples/tutorial/reports/fastqc_report_r1.html b/docs/files/examples/tutorial/reports/fastqc_report_r1.html new file mode 100644 index 0000000..4a24217 --- /dev/null +++ b/docs/files/examples/tutorial/reports/fastqc_report_r1.html @@ -0,0 +1,254 @@ + + + + + + + + + + + + + + + + + + + Looper: FastQC report r1 objects + +
+ + + +
+
+
+ +

FastQC report r1

+ + + +
Links
+
+ + tutorial + +
+ + +
+
+ +
+ + \ No newline at end of file diff --git a/docs/files/examples/tutorial/reports/fragment_distribution.html b/docs/files/examples/tutorial/reports/fragment_distribution.html new file mode 100644 index 0000000..eecaada --- /dev/null +++ b/docs/files/examples/tutorial/reports/fragment_distribution.html @@ -0,0 +1,254 @@ + + + + + + + + + + + + + + + + + + + Looper: Fragment distribution objects + +
+ + + +
+
+
+ +

Fragment distribution

+ + + +
Links
+
+ + tutorial + +
+ + +
+
+ +
+ + \ No newline at end of file diff --git a/docs/files/examples/tutorial/reports/library_complexity.html b/docs/files/examples/tutorial/reports/library_complexity.html new file mode 100644 index 0000000..f538584 --- /dev/null +++ b/docs/files/examples/tutorial/reports/library_complexity.html @@ -0,0 +1,257 @@ + + + + + + + + + + + + + + + + + + + Looper: Library complexity objects + +
+ + + +
+
+
+ +

Library complexity

+ + + +
Links
+
+ + tutorial + +
+ + +
+
+ +
+ + \ No newline at end of file diff --git a/docs/files/examples/tutorial/reports/minus_frif.html b/docs/files/examples/tutorial/reports/minus_frif.html new file mode 100644 index 0000000..0cfe8e7 --- /dev/null +++ b/docs/files/examples/tutorial/reports/minus_frif.html @@ -0,0 +1,254 @@ + + + + + + + + + + + + + + + + + + + Looper: Minus FRiF objects + +
+ + + +
+
+
+ +

Minus FRiF

+ + + +
Links
+
+ + tutorial + +
+ + +
+
+ +
+ + \ No newline at end of file diff --git a/docs/files/examples/tutorial/reports/mrna_contamination.html b/docs/files/examples/tutorial/reports/mrna_contamination.html new file mode 100644 index 0000000..755e9ac --- /dev/null +++ b/docs/files/examples/tutorial/reports/mrna_contamination.html @@ -0,0 +1,254 @@ + + + + + + + + + + + + + + + + + + + Looper: mRNA contamination objects + +
+ + + +
+
+
+ +

mRNA contamination

+ + + +
Links
+
+ + tutorial + +
+ + +
+
+ +
+ + \ No newline at end of file diff --git a/docs/files/examples/tutorial/reports/objects.html b/docs/files/examples/tutorial/reports/objects.html new file mode 100644 index 0000000..15a22c0 --- /dev/null +++ b/docs/files/examples/tutorial/reports/objects.html @@ -0,0 +1,264 @@ + + + + + + + + + + + + + + + + + + + Looper: Objects + +
+ + + +
+
+

Objects

+ +
+ +
+ + \ No newline at end of file diff --git a/docs/files/examples/tutorial/reports/pause_index.html b/docs/files/examples/tutorial/reports/pause_index.html new file mode 100644 index 0000000..2896a13 --- /dev/null +++ b/docs/files/examples/tutorial/reports/pause_index.html @@ -0,0 +1,254 @@ + + + + + + + + + + + + + + + + + + + Looper: Pause index objects + +
+ + + +
+
+
+ +

Pause index

+ + + +
Links
+
+ + tutorial + +
+ + +
+
+ +
+ + \ No newline at end of file diff --git a/docs/files/examples/tutorial/reports/plus_frif.html b/docs/files/examples/tutorial/reports/plus_frif.html new file mode 100644 index 0000000..95fa452 --- /dev/null +++ b/docs/files/examples/tutorial/reports/plus_frif.html @@ -0,0 +1,254 @@ + + + + + + + + + + + + + + + + + + + Looper: Plus FRiF objects + +
+ + + +
+
+
+ +

Plus FRiF

+ + + +
Links
+
+ + tutorial + +
+ + +
+
+ +
+ + \ No newline at end of file diff --git a/docs/files/examples/tutorial/reports/samples.html b/docs/files/examples/tutorial/reports/samples.html new file mode 100644 index 0000000..d00541a --- /dev/null +++ b/docs/files/examples/tutorial/reports/samples.html @@ -0,0 +1,246 @@ + + + + + + + + + + + + + + + + + + + Looper: Samples + +
+ + + +
+
+

Samples

+
+ + + tutorial + +
+
+ +
+ + \ No newline at end of file diff --git a/docs/files/examples/tutorial/reports/status.html b/docs/files/examples/tutorial/reports/status.html new file mode 100644 index 0000000..734f7dd --- /dev/null +++ b/docs/files/examples/tutorial/reports/status.html @@ -0,0 +1,299 @@ + + + + + + + + + + + + + + + + + + + Looper: status + +
+ + + +
+
+

Status by sample

+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
Sample nameStatusLog fileRuntimePeak memory use
+ tutorial + + Completed + + PEPPRO_log.md + + 0:21:53 + + 3.4646 GB +
+
+
+
+ + + + + + +
+ +
+ + \ No newline at end of file diff --git a/docs/files/examples/tutorial/reports/tss_enrichment.html b/docs/files/examples/tutorial/reports/tss_enrichment.html new file mode 100644 index 0000000..d72c786 --- /dev/null +++ b/docs/files/examples/tutorial/reports/tss_enrichment.html @@ -0,0 +1,254 @@ + + + + + + + + + + + + + + + + + + + Looper: TSS enrichment objects + +
+ + + +
+
+
+ +

TSS enrichment

+ + + +
Links
+
+ + tutorial + +
+ + +
+
+ +
+ + \ No newline at end of file diff --git a/docs/files/examples/tutorial/reports/tutorial.html b/docs/files/examples/tutorial/reports/tutorial.html new file mode 100644 index 0000000..afa1aca --- /dev/null +++ b/docs/files/examples/tutorial/reports/tutorial.html @@ -0,0 +1,523 @@ + + + + + + + + + + + + + + + + + + + Looper: tutorial + +
+ + + +
+
+

Sample name: tutorial

+
+
+

+ + + Log file + + + Pipeline profile + + + Pipeline commands + + + Stats summary file + +

+
+
+
+

Looper stats summary

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
sample_nametutorial
organismhuman
protocolPROSEQ
read_typepaired
read1tutorial_r1.fq.gz
read2tutorial_r2.fq.gz
File_mb50.42
Read_typepaired
Genomehg38
Raw_reads2000000
Fastq_reads2000000
Trimmed_reads497796
Trim_loss_rate75.11
Aligned_reads_human_rDNA5860.0
Alignment_rate_human_rDNA1.18
Mapped_reads431804
QC_filtered_reads274898
Aligned_reads156906.0
Alignment_rate31.52
Total_efficiency7.85
Read_depth1.17
Mitochondrial_reads5357
Maximum_read_length30
NRF1.0
PBC115975.0
PBC215975.0
Unmapped_reads468224
TSS_Plus_Score33.3
TSS_Minus_Score4.3
Pause_index228.97
Plus FRiP0.07
Minus FRiP0.07
mRNA_contamination4.26
Time0:09:52
Success11-27-14:04:51
+
+
+ +
+ +
+ + \ No newline at end of file diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/PEPPRO_cleanup.sh b/docs/files/examples/tutorial/results_pipeline/tutorial/PEPPRO_cleanup.sh new file mode 100755 index 0000000..5b6184e --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/PEPPRO_cleanup.sh @@ -0,0 +1,3 @@ +DIR="$(cd -P -- "$(dirname -- "$0")" && pwd -P)" +cd ${DIR} +rm /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/PEPPRO_cleanup.sh diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/PEPPRO_commands.sh b/docs/files/examples/tutorial/results_pipeline/tutorial/PEPPRO_commands.sh new file mode 100644 index 0000000..7b5e76c --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/PEPPRO_commands.sh @@ -0,0 +1,105 @@ +# Pipeline started at 11-27 13:54:59 + +ln -sf /scratch/jps3dp/DATA/proseq/data/fastq/tutorial_r1.fq.gz /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/tutorial_R1.fastq.gz +ln -sf /scratch/jps3dp/DATA/proseq/data/fastq/tutorial_r2.fq.gz /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/tutorial_R2.fastq.gz +gzip -f -d -c /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/tutorial_R1.fastq.gz > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1.fastq +gzip -f -d -c /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/tutorial_R2.fastq.gz > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2.fastq +( (fastp --overrepresentation_analysis --thread 1 --in1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1.fastq --adapter_sequence TGGAATTCTCGGGTGCCAAGG --length_required 18 --html 
/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.html --json /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.json --report_title 'tutorial' --stdout ) 2> /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.txt | seqtk trimfq -L 30 - | seqtk seq -r - > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq ) 2> /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.txt +fastqc --noextract --outdir /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastqc /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq +( (fastp --overrepresentation_analysis --thread 1 --in1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2.fastq --adapter_sequence GATCGTCGGACTGTAGAACTCTGAAC --length_required 18 --html /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.html --json /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.json --report_title 'tutorial' --stdout ) 2> /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.txt | seqtk trimfq -L 30 - | seqtk seq -r - > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq ) 2> /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.txt +cp 
/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed_dups.fastq +fastq_pair -t 1800000 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq +mv /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq.paired.fq /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq +mv /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq.paired.fq /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq +touch repaired.flag +mkfifo /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/human_rDNA_bt2 +perl /scratch/jps3dp/tools/databio/peppro/tools/filter_paired_fq.pl /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/human_rDNA_bt2 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R1.fq /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R2.fq +(bowtie2 -p 1 -k 1 -D 20 -R 3 -N 1 -L 20 -i S,1,0.50 -x 
/scratch/jps3dp/DATA/genomes/human_rDNA/bowtie2_index/default/human_rDNA --rg-id tutorial -U /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq --un /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/human_rDNA_bt2 > /dev/null) 2>/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_bt_aln_summary.log +grep 'aligned exactly 1 time' /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_bt_aln_summary.log | awk '{print $1}' +bowtie2 -p 1 --very-sensitive -X 2000 --rg-id tutorial -x /scratch/jps3dp/DATA/genomes/hg38/bowtie2_index/default/hg38 --rf -1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R1.fq -2 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R2.fq | samtools view -bS - -@ 1 | samtools sort - -@ 1 -T /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tmp7b3a497a -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_temp.bam +samtools view -q 10 -b -@ 1 -U /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_fail_qc.bam /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_temp.bam > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam +samtools depth -b /scratch/jps3dp/DATA/genomes/hg38/refgene_anno/default/hg38_pre-mRNA.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam | awk 
'{counter++;sum+=$3}END{print sum/counter}' +gzip -f /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R1.fq +gzip -f /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R2.fq +samtools index /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_temp.bam +samtools idxstats /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_temp.bam | grep -we 'chrM' -we 'chrMT' -we 'M' -we 'MT' -we 'rCRSd' -we 'rCRSd_3k'| cut -f 3 +samtools index /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam +samtools idxstats /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam | cut -f 1 | grep -vwe 'chrM' -vwe 'chrMT' -vwe 'M' -vwe 'MT' -vwe 'rCRSd' -vwe 'rCRSd_3k'| xargs samtools view -b -@ 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_noMT.bam +mv /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_noMT.bam /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam +samtools index /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam +samtools view -b -f 64 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam | samtools sort - -@ 1 > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam 
+samtools view -b -f 128 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam | samtools sort - -@ 1 > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE2.bam +samtools index /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam +/scratch/jps3dp/tools/databio/peppro/tools/bamQC.py --silent -i /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam -c 1 -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_bamQC.tsv +awk '{ for (i=1; i<=NF; ++i) { if ($i ~ "NRF") c=i } getline; print $c }' /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_bamQC.tsv +awk '{ for (i=1; i<=NF; ++i) { if ($i ~ "PBC1") c=i } getline; print $c }' /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_bamQC.tsv +awk '{ for (i=1; i<=NF; ++i) { if ($i ~ "PBC2") c=i } getline; print $c }' /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_bamQC.tsv +samtools view -b -@ 1 -f 12 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_temp.bam > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_unmap.bam +samtools view -c -f 4 -@ 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_temp.bam +samtools view -bh -F 20 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam 
+samtools view -bh -f 16 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam +sed -n -e '/[[:space:]]+/w /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/plus_TSS.tsv' -e '/[[:space:]]-/w /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/minus_TSS.tsv' /scratch/jps3dp/DATA/genomes/hg38/refgene_anno/default/hg38_TSS.bed +/scratch/jps3dp/tools/databio/peppro/tools/pyTssEnrichment.py -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/plus_TSS.tsv -p ends -c 1 -z -v -s 6 -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_plus_TssEnrichment.txt +/scratch/jps3dp/tools/databio/peppro/tools/pyTssEnrichment.py -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/minus_TSS.tsv -p ends -c 1 -z -v -s 6 -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_minus_TssEnrichment.txt +Rscript /scratch/jps3dp/tools/databio/peppro/tools/PEPPRO.R tss -i /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_plus_TssEnrichment.txt /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_minus_TssEnrichment.txt +samtools view -H /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam | grep 'SN:' | awk 
-F':' '{print $2,$3}' | awk -F' ' -v OFS=' ' '{print $1,$3}' > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt +cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_keep.txt +grep -wf /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_keep.txt /scratch/jps3dp/DATA/genomes/hg38/ensembl_gtf/default/hg38_ensembl_TSS.bed | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_ensembl_tss.bed +grep -wf /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_keep.txt /scratch/jps3dp/DATA/genomes/hg38/ensembl_gtf/default/hg38_ensembl_gene_body.bed | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_ensembl_gene_body.bed +bedtools coverage -sorted -counts -s -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_ensembl_tss.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | awk '$7>0' | sort -k4,4 -k7,7nr | sort -k4,4 -u > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_TSS_density.bed +bedtools coverage -sorted -counts -s -a 
/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_ensembl_gene_body.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | awk '$7>0' | sort -k4 > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_gene_body_density.bed +join --nocheck-order -j4 -o 1.1 1.2 1.3 1.4 1.6 1.7 2.2 2.3 2.7 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_TSS_density.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_gene_body_density.bed | awk -v OFS=' ' '{print $1, $2, $3, $4, ($6/($3-$2))/($9/($8-$7)), $5}' | env LC_COLLATE=C sort -k1,1 -k2,2n > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.bed +sort -k5,5n /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.bed | awk ' { a[i++]=$5; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }' +Rscript /scratch/jps3dp/tools/databio/peppro/tools/PEPPRO.R pi -i /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.bed +gzip -f -f /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.bed +samtools view -@ 4 -c -L /scratch/jps3dp/DATA/genomes/hg38/refgene_anno/default/hg38_pre-mRNA.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam +samtools view -@ 4 -c -L /scratch/jps3dp/DATA/genomes/hg38/refgene_anno/default/hg38_pre-mRNA.bed 
/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam +perl /scratch/jps3dp/tools/databio/peppro/tools/fragment_length_dist.pl /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_fragLen.txt +sort -n /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_fragLen.txt | uniq -c > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_fragCount.txt +Rscript /scratch/jps3dp/tools/databio/peppro/tools/PEPPRO.R frag -l /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_fragLen.txt -c /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_fragCount.txt -p /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_fragLenDistribution.pdf -t /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_fragLenDistribution.txt +ln -sf /scratch/jps3dp/DATA/genomes/hg38/feat_annotation/default/hg38_annotations.bed.gz /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/hg38_annotations.bed.gz +gzip -f -d -c /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/hg38_annotations.bed.gz > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/hg38_annotations.bed +cut -f 4 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/hg38_annotations.bed | sort -u +awk -F' ' '{print>"/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/"$4}' 
/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/hg38_annotations.bed +mv "/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/3' UTR" "/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/3_UTR" +cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | grep -wf - /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/3_UTR | cut -f 1-3 | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/3_UTR_sort.bed +bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/3_UTR_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_3_UTR_plus_coverage.bed +bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/3_UTR_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_3_UTR_minus_coverage.bed +mv "/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/5' UTR" 
"/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/5_UTR" +cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | grep -wf - /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/5_UTR | cut -f 1-3 | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/5_UTR_sort.bed +bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/5_UTR_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_5_UTR_plus_coverage.bed +bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/5_UTR_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_5_UTR_minus_coverage.bed +cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | grep -wf - /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Enhancer | cut -f 1-3 | bedtools sort -i stdin -faidx 
/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Enhancer_sort.bed +bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Enhancer_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Enhancer_plus_coverage.bed +bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Enhancer_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Enhancer_minus_coverage.bed +cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | grep -wf - /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Exon | cut -f 1-3 | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Exon_sort.bed +bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Exon_sort.bed -b 
/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Exon_plus_coverage.bed +bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Exon_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Exon_minus_coverage.bed +cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | grep -wf - /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Intron | cut -f 1-3 | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Intron_sort.bed +bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Intron_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Intron_plus_coverage.bed +bedtools coverage -sorted -counts -a 
/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Intron_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Intron_minus_coverage.bed +cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | grep -wf - /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter | cut -f 1-3 | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_sort.bed +bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_plus_coverage.bed +bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > 
/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_minus_coverage.bed +mv "/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter Flanking Region" "/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_Flanking_Region" +cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | grep -wf - /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_Flanking_Region | cut -f 1-3 | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_Flanking_Region_sort.bed +bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_Flanking_Region_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_Flanking_Region_plus_coverage.bed +bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_Flanking_Region_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > 
/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_Flanking_Region_minus_coverage.bed +samtools view -@ 1 -q 10 -c -F4 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam +Rscript /scratch/jps3dp/tools/databio/peppro/tools/PEPPRO.R frif -n tutorial -r 16265 -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_plus_frif.pdf --bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_3_UTR_plus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_5_UTR_plus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Enhancer_plus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Exon_plus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Intron_plus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_plus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_Flanking_Region_plus_coverage.bed +samtools view -@ 1 -q 10 -c -F4 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam +Rscript /scratch/jps3dp/tools/databio/peppro/tools/PEPPRO.R frif -n tutorial -r 15685 -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_minus_frif.pdf --bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_3_UTR_minus_coverage.bed 
/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_5_UTR_minus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Enhancer_minus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Exon_minus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Intron_minus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_minus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_Flanking_Region_minus_coverage.bed +grep -wf /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_keep.txt /scratch/jps3dp/DATA/genomes/hg38/refgene_anno/default/hg38_exons.bed | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_exons_sort.bed +grep -wf /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_keep.txt /scratch/jps3dp/DATA/genomes/hg38/refgene_anno/default/hg38_introns.bed | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_introns_sort.bed +bedtools coverage -sorted -counts -s -a 
/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_exons_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exons_coverage.bed +bedtools coverage -sorted -counts -s -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_introns_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_introns_coverage.bed +awk -v OFS=' ' '{chrom[$4] = $1; if($4!=prev4) {chromStart[$4] = $2} strand[$4] = $6; readCount[$4] += $7; exonCount[$4] += 1; geneSizeKB[$4] += (sqrt(($3-$2+0.00000001)^2)/1000); gene[$4] = $4; chromEnd[$4]=$3; prev4=$4} END { for (a in readCount) { print chrom[a], chromStart[a], chromEnd[a], gene[a], (readCount[a]/0.156906)/geneSizeKB[a], strand[a]}}' /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exons_coverage.bed | awk '$5>0' | sort -k4 > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exons_rpkm.bed +awk -v OFS=' ' '{chrom[$4] = $1; if($4!=prev4) {chromStart[$4] = $2} strand[$4] = $6; readCount[$4] += $7; exonCount[$4] += 1; geneSizeKB[$4] += (sqrt(($3-$2+0.00000001)^2)/1000); gene[$4] = $4; chromEnd[$4]=$3; prev4=$4} END { for (a in readCount) { print chrom[a], chromStart[a], chromEnd[a], gene[a], (readCount[a]/0.156906)/geneSizeKB[a], strand[a]}}' 
/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_introns_coverage.bed | awk '$5>0' | sort -k4 > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_introns_rpkm.bed +join --nocheck-order -a1 -a2 -j4 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_introns_rpkm.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exons_rpkm.bed | awk -v OFS=' ' 'NF==11 {print $7, $8, $9, $1, ($10/$5), $11}' | sort -k1,1 -k2,2n > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exon_intron_ratios.bed +awk '{print $5}' /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exon_intron_ratios.bed | sort -n | awk ' { a[i++]=$1; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }' +Rscript /scratch/jps3dp/tools/databio/peppro/tools/PEPPRO.R mrna -i /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exon_intron_ratios.bed --raw +gzip -f -f /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exon_intron_ratios.bed +samtools index /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam +/scratch/jps3dp/tools/databio/peppro/tools/bamSitesToWig.py -i /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -c /scratch/jps3dp/DATA/genomes/hg38/fasta/default/hg38.chrom.sizes -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/signal_hg38/tutorial_plus_body_0-mer.bw -p 1 --variable-step --tail-edge +samtools index 
/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam +/scratch/jps3dp/tools/databio/peppro/tools/bamSitesToWig.py -i /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -c /scratch/jps3dp/DATA/genomes/hg38/fasta/default/hg38.chrom.sizes -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/signal_hg38/tutorial_minus_body_0-mer.bw -p 1 --variable-step --tail-edge diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/PEPPRO_completed.flag b/docs/files/examples/tutorial/results_pipeline/tutorial/PEPPRO_completed.flag new file mode 100644 index 0000000..e69de29 diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/PEPPRO_log.md b/docs/files/examples/tutorial/results_pipeline/tutorial/PEPPRO_log.md new file mode 100644 index 0000000..776d9b8 --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/PEPPRO_log.md @@ -0,0 +1,1068 @@ +### Pipeline run code and environment: + +* Command: `/scratch/jps3dp/tools/databio/peppro/pipelines/peppro.py --sample-name tutorial --genome hg38 --input /scratch/jps3dp/DATA/proseq/data/fastq/tutorial_r1.fq.gz --single-or-paired paired --input2 /scratch/jps3dp/DATA/proseq/data/fastq/tutorial_r2.fq.gz --protocol PROSEQ --prealignments human_rDNA -O /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline -M 8000` +* Compute host: udc-ba26-29c0 +* Working dir: /sfs/lustre/bahamut/scratch/jps3dp/DATA/proseq +* Outfolder: /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/ +* Pipeline started at: (11-27 13:55:04) elapsed: 5.0 _TIME_ + +### Version log: + +* Python version: 3.6.5 +* Pypiper dir: `/sfs/qumulo/qhome/jps3dp/.local/lib/python3.6/site-packages/pypiper` +* Pypiper version: 0.12.1 +* Pipeline dir: 
`/sfs/lustre/bahamut/scratch/jps3dp/tools/databio/peppro/pipelines` +* Pipeline version: 0.8.5 +* Pipeline hash: 1bce5897a1e47486f13b18f1f56420b279847613 +* Pipeline branch: * dev +* Pipeline date: 2019-11-27 11:21:24 -0500 +* Pipeline diff: 1 file changed, 1 insertion(+), 7 deletions(-) + +### Arguments passed to pipeline: + +* `TSS_name`: `None` +* `adapter`: `fastp` +* `anno_name`: `None` +* `complexity`: `False` +* `config_file`: `peppro.yaml` +* `cores`: `1` +* `coverage`: `False` +* `dedup`: `seqkit` +* `dirty`: `False` +* `ensembl_gene_body`: `None` +* `ensembl_tss`: `None` +* `exon_name`: `None` +* `force_follow`: `False` +* `genome_assembly`: `hg38` +* `input`: `['/scratch/jps3dp/DATA/proseq/data/fastq/tutorial_r1.fq.gz']` +* `input2`: `['/scratch/jps3dp/DATA/proseq/data/fastq/tutorial_r2.fq.gz']` +* `intron_name`: `None` +* `keep`: `False` +* `logdev`: `False` +* `max_len`: `30` +* `mem`: `8000` +* `new_start`: `False` +* `no_fifo`: `False` +* `output_parent`: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline` +* `paired_end`: `True` +* `pre_name`: `None` +* `prealignments`: `['human_rDNA']` +* `protocol`: `PROSEQ` +* `recover`: `False` +* `sample_name`: `tutorial` +* `scale`: `False` +* `search_file`: `None` +* `silent`: `False` +* `single_or_paired`: `paired` +* `sob`: `False` +* `testmode`: `False` +* `trimmer`: `seqtk` +* `umi_len`: `0` +* `verbosity`: `None` + +---------------------------------------- + +Local input file: /scratch/jps3dp/DATA/proseq/data/fastq/tutorial_r1.fq.gz +Local input file: /scratch/jps3dp/DATA/proseq/data/fastq/tutorial_r2.fq.gz + +> `File_mb` 50.42 PEPPRO _RES_ + +> `Read_type` paired PEPPRO _RES_ + +> `Genome` hg38 PEPPRO _RES_ +Detected PRO input + +### Merge/link and fastq conversion: (11-27 13:55:05) elapsed: 1.0 _TIME_ + +Number of input file sets: 2 +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/tutorial_R1.fastq.gz` + +> 
`ln -sf /scratch/jps3dp/DATA/proseq/data/fastq/tutorial_r1.fq.gz /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/tutorial_R1.fastq.gz` (318626) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 0.0GB. + PID: 318626; Command: ln; Return code: 0; Memory used: 0.0GB + +Local input file: '/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/tutorial_R1.fastq.gz' +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/tutorial_R2.fastq.gz` + +> `ln -sf /scratch/jps3dp/DATA/proseq/data/fastq/tutorial_r2.fq.gz /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/tutorial_R2.fastq.gz` (318628) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 0.0GB. + PID: 318628; Command: ln; Return code: 0; Memory used: 0.0GB + +Local input file: '/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/tutorial_R2.fastq.gz' +Found .fastq.gz file +Found .fastq.gz file +Missing stat 'Raw_reads' +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1.fastq`,`/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2.fastq` + +> `gzip -f -d -c /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/tutorial_R1.fastq.gz > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1.fastq` (318629) +
+
+Command completed. Elapsed time: 0:00:02. Running peak memory: 0.002GB. + PID: 318629; Command: gzip; Return code: 0; Memory used: 0.002GB + + +> `gzip -f -d -c /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/tutorial_R2.fastq.gz > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2.fastq` (318634) +
+
+Command completed. Elapsed time: 0:00:02. Running peak memory: 0.002GB. + PID: 318634; Command: gzip; Return code: 0; Memory used: 0.002GB + + +> `Raw_reads` 2000000 PEPPRO _RES_ + +> `Fastq_reads` 2000000 PEPPRO _RES_ +['/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/tutorial_R1.fastq.gz', '/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/tutorial_R2.fastq.gz'] + +### FASTQ processing: (11-27 13:55:11) elapsed: 5.0 _TIME_ + +Missing stat 'Aligned_reads' +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq` + +> `( (fastp --overrepresentation_analysis --thread 1 --in1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1.fastq --adapter_sequence TGGAATTCTCGGGTGCCAAGG --length_required 18 --html /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.html --json /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.json --report_title 'tutorial' --stdout ) 2> /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.txt | seqtk trimfq -L 30 - | seqtk seq -r - > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq ) 2> /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.txt` (318658) +
+
+Command completed. Elapsed time: 0:00:11. Running peak memory: 0.123GB. + PID: 318658; Command: ; Return code: 0; Memory used: 0.123GB + +Evaluating read trimming + +> `Trimmed_reads` 497796 PEPPRO _RES_ + +> `Trim_loss_rate` 75.11 PEPPRO _RES_ +Targetless command, running... + +> `fastqc --noextract --outdir /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastqc /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq` (318685) +
+Picked up JAVA_TOOL_OPTIONS: -Xss1280k
+Picked up _JAVA_OPTIONS: -Djava.io.tmpdir=/scratch/jps3dp/tmp
+Started analysis of tutorial_R1_processed.fastq
+Approx 5% complete for tutorial_R1_processed.fastq
+Approx 10% complete for tutorial_R1_processed.fastq
+Approx 15% complete for tutorial_R1_processed.fastq
+Approx 20% complete for tutorial_R1_processed.fastq
+Approx 25% complete for tutorial_R1_processed.fastq
+Approx 30% complete for tutorial_R1_processed.fastq
+Approx 35% complete for tutorial_R1_processed.fastq
+Approx 40% complete for tutorial_R1_processed.fastq
+Approx 45% complete for tutorial_R1_processed.fastq
+Approx 50% complete for tutorial_R1_processed.fastq
+Approx 55% complete for tutorial_R1_processed.fastq
+Approx 60% complete for tutorial_R1_processed.fastq
+Approx 65% complete for tutorial_R1_processed.fastq
+Approx 70% complete for tutorial_R1_processed.fastq
+Approx 75% complete for tutorial_R1_processed.fastq
+Approx 80% complete for tutorial_R1_processed.fastq
+Approx 85% complete for tutorial_R1_processed.fastq
+Approx 90% complete for tutorial_R1_processed.fastq
+Approx 95% complete for tutorial_R1_processed.fastq
+Analysis complete for tutorial_R1_processed.fastq
+
+Command completed. Elapsed time: 0:00:13. Running peak memory: 0.164GB. + PID: 318685; Command: fastqc; Return code: 0; Memory used: 0.164GB + +> `FastQC report r1` fastqc/tutorial_R1_processed_fastqc.html FastQC report r1 None PEPPRO _OBJ_ +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq` + +> `( (fastp --overrepresentation_analysis --thread 1 --in1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2.fastq --adapter_sequence GATCGTCGGACTGTAGAACTCTGAAC --length_required 18 --html /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.html --json /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.json --report_title 'tutorial' --stdout ) 2> /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.txt | seqtk trimfq -L 30 - | seqtk seq -r - > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq ) 2> /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.txt` (318711) +
+
+Command completed. Elapsed time: 0:00:08. Running peak memory: 0.429GB. + PID: 318711; Command: ; Return code: 0; Memory used: 0.429GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed_dups.fastq` + +> `cp /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed_dups.fastq` (318724) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 0.429GB. + PID: 318724; Command: cp; Return code: 0; Memory used: 0.002GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq.paired.fq`,`/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq.paired.fq` + +> `fastq_pair -t 1800000 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq` (318725) +
+Left paired: 452944		Right paired: 452944
+Left single: 44852		Right single: 20558
+Writing the paired reads to /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq.paired.fq and /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq.paired.fq.
+Writing the single reads to /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq.single.fq and /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq.single.fq
+
+Command completed. Elapsed time: 0:00:03. Running peak memory: 0.429GB. + PID: 318725; Command: fastq_pair; Return code: 0; Memory used: 0.079GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/repaired.flag` + +> `mv /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq.paired.fq /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq` (318728) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 0.429GB. + PID: 318728; Command: mv; Return code: 0; Memory used: 0.0GB + + +> `mv /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq.paired.fq /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq` (318730) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 0.429GB. + PID: 318730; Command: mv; Return code: 0; Memory used: 0.0GB + + +> `touch repaired.flag` (318731) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 0.429GB. + PID: 318731; Command: touch; Return code: 0; Memory used: 0.002GB + + +### Plot adapter insertion distribution (11-27 13:55:46) elapsed: 35.0 _TIME_ + +Skipping sample degradation plotting... +This requires using 'cutadapt' for adapter removal. + +### Prealignments (11-27 13:55:46) elapsed: 0.0 _TIME_ + +Missing stat 'Aligned_reads' +Prealignment assemblies: ['human_rDNA'] + +### Map to human_rDNA (11-27 13:55:46) elapsed: 0.0 _TIME_ + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/human_rDNA_bt2` + +> `mkfifo /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/human_rDNA_bt2` (318732) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 0.429GB. + PID: 318732; Command: mkfifo; Return code: 0; Memory used: 0.002GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_bt_aln_summary.log`,`/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R2.fq.gz` + +> `perl /scratch/jps3dp/tools/databio/peppro/tools/filter_paired_fq.pl /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/human_rDNA_bt2 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2_trimmed.fastq /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R1.fq /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R2.fq` (318733) +
+
+Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_bt_aln_summary.log`,`/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R2.fq.gz` + +> `(bowtie2 -p 1 -k 1 -D 20 -R 3 -N 1 -L 20 -i S,1,0.50 -x /scratch/jps3dp/DATA/genomes/human_rDNA/bowtie2_index/default/human_rDNA --rg-id tutorial -U /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1_processed.fastq --un /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/human_rDNA_bt2 > /dev/null) 2>/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_bt_aln_summary.log` (318734) +
+not gzipping output
+
+Command completed. Elapsed time: 0:00:09. Running peak memory: 0.429GB. + PID: 318734; Command: bowtie2; Return code: 0; Memory used: 0.032GB + + +> `grep 'aligned exactly 1 time' /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_bt_aln_summary.log | awk '{print $1}'` + +> `Aligned_reads_human_rDNA` 5860.0 PEPPRO _RES_ + +> `Alignment_rate_human_rDNA` 1.18 PEPPRO _RES_ + +### Map to genome (11-27 13:55:54) elapsed: 9.0 _TIME_ + +Missing stat 'Aligned_reads' +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam` + +> `bowtie2 -p 1 --very-sensitive -X 2000 --rg-id tutorial -x /scratch/jps3dp/DATA/genomes/hg38/bowtie2_index/default/hg38 --rf -1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R1.fq -2 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R2.fq | samtools view -bS - -@ 1 | samtools sort - -@ 1 -T /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tmp7b3a497a -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_temp.bam` (318749,318750,318751) +
+2930 reads skipped
+0 reads lost
+450014 reads; of these:
+  450014 (100.00%) were paired; of these:
+    399197 (88.71%) aligned concordantly 0 times
+    36510 (8.11%) aligned concordantly exactly 1 time
+    14307 (3.18%) aligned concordantly >1 times
+    ----
+    399197 pairs aligned concordantly 0 times; of these:
+      1862 (0.47%) aligned discordantly 1 time
+    ----
+    397335 pairs aligned 0 times concordantly or discordantly; of these:
+      794670 mates make up the pairs; of these:
+        468224 (58.92%) aligned 0 times
+        239606 (30.15%) aligned exactly 1 time
+        86840 (10.93%) aligned >1 times
+47.98% overall alignment rate
+
+Command completed. Elapsed time: 0:05:51. Running peak memory: 3.465GB. + PID: 318749; Command: bowtie2; Return code: 0; Memory used: 3.465GB + PID: 318751; Command: samtools; Return code: 0; Memory used: 0.201GB + PID: 318750; Command: samtools; Return code: 0; Memory used: 0.004GB + + +> `samtools view -q 10 -b -@ 1 -U /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_fail_qc.bam /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_temp.bam > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam` (319341) +
+
+Command completed. Elapsed time: 0:00:08. Running peak memory: 3.465GB. + PID: 319341; Command: samtools; Return code: 0; Memory used: 0.008GB + + +> `samtools depth -b /scratch/jps3dp/DATA/genomes/hg38/refgene_anno/default/hg38_pre-mRNA.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam | awk '{counter++;sum+=$3}END{print sum/counter}'` + +> `Mapped_reads` 431804 PEPPRO _RES_ + +> `QC_filtered_reads` 274898 PEPPRO _RES_ + +> `Aligned_reads` 156906.0 PEPPRO _RES_ + +> `Alignment_rate` 31.52 PEPPRO _RES_ + +> `Total_efficiency` 7.85 PEPPRO _RES_ + +> `Read_depth` 1.17 PEPPRO _RES_ + +### Compress all unmapped read files (11-27 14:02:12) elapsed: 378.0 _TIME_ + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R1.fq.gz` + +> `gzip -f /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R1.fq` (319372) +
+
+Command completed. Elapsed time: 0:00:04. Running peak memory: 3.465GB. + PID: 319372; Command: gzip; Return code: 0; Memory used: 0.001GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R2.fq.gz` + +> `gzip -f /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/prealignments/tutorial_human_rDNA_unmap_R2.fq` (319378) +
+
+Command completed. Elapsed time: 0:00:05. Running peak memory: 3.465GB. + PID: 319378; Command: gzip; Return code: 0; Memory used: 0.001GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_temp.bam.bai` + +> `samtools index /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_temp.bam` (319386) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319386; Command: samtools; Return code: 0; Memory used: 0.001GB + + +> `samtools idxstats /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_temp.bam | grep -we 'chrM' -we 'chrMT' -we 'M' -we 'MT' -we 'rCRSd' -we 'rCRSd_3k'| cut -f 3` + +> `Mitochondrial_reads` 5357 PEPPRO _RES_ +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_noMT.bam` + +> `samtools index /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam` (319392) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319392; Command: samtools; Return code: 0; Memory used: 0.0GB + + +> `samtools idxstats /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam | cut -f 1 | grep -vwe 'chrM' -vwe 'chrMT' -vwe 'M' -vwe 'MT' -vwe 'rCRSd' -vwe 'rCRSd_3k'| xargs samtools view -b -@ 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_noMT.bam` (319394,319395,319396,319397) +
+
+Command completed. Elapsed time: 0:00:03. Running peak memory: 3.465GB. + PID: 319394; Command: samtools; Return code: 0; Memory used: 0.0GB + PID: 319396; Command: grep; Return code: 0; Memory used: 0.0GB + PID: 319395; Command: cut; Return code: 0; Memory used: 0.0GB + PID: 319397; Command: xargs; Return code: 0; Memory used: 0.011GB + + +> `mv /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_noMT.bam /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam` (319402) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319402; Command: mv; Return code: 0; Memory used: 0.001GB + + +> `samtools index /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam` (319403) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319403; Command: samtools; Return code: 0; Memory used: 0.0GB + + +### Split BAM file (11-27 14:02:25) elapsed: 13.0 _TIME_ + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam`,`/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE2.bam` + +> `samtools view -b -f 64 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam | samtools sort - -@ 1 > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam` (319406,319407) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319406; Command: samtools; Return code: 0; Memory used: 0.004GB + PID: 319407; Command: samtools; Return code: 0; Memory used: 0.013GB + + +> `samtools view -b -f 128 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_sort.bam | samtools sort - -@ 1 > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE2.bam` (319411,319412) +
+
+Command completed. Elapsed time: 0:00:04. Running peak memory: 3.465GB. + PID: 319411; Command: samtools; Return code: 0; Memory used: 0.004GB + PID: 319412; Command: samtools; Return code: 0; Memory used: 0.068GB + +Missing stat 'Maximum_read_length' + +> `Maximum_read_length` 30 PEPPRO _RES_ + +### Calculate NRF, PBC1, and PBC2 (11-27 14:02:30) elapsed: 5.0 _TIME_ + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam.bai` + +> `samtools index /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam` (319418) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319418; Command: samtools; Return code: 0; Memory used: 0.001GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_bamQC.tsv` + +> `/scratch/jps3dp/tools/databio/peppro/tools/bamQC.py --silent -i /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam -c 1 -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_bamQC.tsv` (319419) +
+Configured logger 'root' using pararead v0.6
+Registering input file: '/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam'
+Temporary files will be stored in: '/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tmp_tutorial_PE1_hfixb9vw'
+Processing with 1 cores...
+Discarding 158 chunk(s) of reads: ['chrM', 'chr1_KI270706v1_random', 'chr1_KI270707v1_random', 'chr1_KI270709v1_random', 'chr1_KI270710v1_random', 'chr1_KI270711v1_random', 'chr1_KI270712v1_random', 'chr1_KI270713v1_random', 'chr1_KI270714v1_random', 'chr2_KI270715v1_random', 'chr2_KI270716v1_random', 'chr3_GL000221v1_random', 'chr4_GL000008v2_random', 'chr5_GL000208v1_random', 'chr9_KI270717v1_random', 'chr9_KI270718v1_random', 'chr9_KI270719v1_random', 'chr9_KI270720v1_random', 'chr11_KI270721v1_random', 'chr14_GL000009v2_random', 'chr14_GL000194v1_random', 'chr14_KI270723v1_random', 'chr14_KI270724v1_random', 'chr14_KI270726v1_random', 'chr15_KI270727v1_random', 'chr16_KI270728v1_random', 'chr17_KI270729v1_random', 'chr17_KI270730v1_random', 'chr22_KI270731v1_random', 'chr22_KI270732v1_random', 'chr22_KI270734v1_random', 'chr22_KI270735v1_random', 'chr22_KI270736v1_random', 'chr22_KI270737v1_random', 'chr22_KI270739v1_random', 'chrY_KI270740v1_random', 'chrUn_KI270302v1', 'chrUn_KI270304v1', 'chrUn_KI270303v1', 'chrUn_KI270305v1', 'chrUn_KI270322v1', 'chrUn_KI270320v1', 'chrUn_KI270310v1', 'chrUn_KI270316v1', 'chrUn_KI270315v1', 'chrUn_KI270312v1', 'chrUn_KI270311v1', 'chrUn_KI270317v1', 'chrUn_KI270412v1', 'chrUn_KI270411v1', 'chrUn_KI270414v1', 'chrUn_KI270419v1', 'chrUn_KI270418v1', 'chrUn_KI270420v1', 'chrUn_KI270424v1', 'chrUn_KI270417v1', 'chrUn_KI270422v1', 'chrUn_KI270423v1', 'chrUn_KI270425v1', 'chrUn_KI270429v1', 'chrUn_KI270466v1', 'chrUn_KI270465v1', 'chrUn_KI270467v1', 'chrUn_KI270435v1', 'chrUn_KI270438v1', 'chrUn_KI270468v1', 'chrUn_KI270510v1', 'chrUn_KI270509v1', 'chrUn_KI270518v1', 'chrUn_KI270508v1', 'chrUn_KI270516v1', 'chrUn_KI270512v1', 'chrUn_KI270519v1', 'chrUn_KI270522v1', 'chrUn_KI270511v1', 'chrUn_KI270515v1', 'chrUn_KI270507v1', 'chrUn_KI270517v1', 'chrUn_KI270529v1', 'chrUn_KI270528v1', 'chrUn_KI270530v1', 'chrUn_KI270539v1', 'chrUn_KI270538v1', 'chrUn_KI270544v1', 'chrUn_KI270548v1', 'chrUn_KI270583v1', 'chrUn_KI270587v1', 
'chrUn_KI270580v1', 'chrUn_KI270581v1', 'chrUn_KI270579v1', 'chrUn_KI270589v1', 'chrUn_KI270590v1', 'chrUn_KI270584v1', 'chrUn_KI270582v1', 'chrUn_KI270588v1', 'chrUn_KI270593v1', 'chrUn_KI270591v1', 'chrUn_KI270330v1', 'chrUn_KI270329v1', 'chrUn_KI270334v1', 'chrUn_KI270333v1', 'chrUn_KI270335v1', 'chrUn_KI270338v1', 'chrUn_KI270340v1', 'chrUn_KI270336v1', 'chrUn_KI270337v1', 'chrUn_KI270363v1', 'chrUn_KI270364v1', 'chrUn_KI270362v1', 'chrUn_KI270366v1', 'chrUn_KI270378v1', 'chrUn_KI270379v1', 'chrUn_KI270389v1', 'chrUn_KI270390v1', 'chrUn_KI270387v1', 'chrUn_KI270395v1', 'chrUn_KI270396v1', 'chrUn_KI270388v1', 'chrUn_KI270394v1', 'chrUn_KI270386v1', 'chrUn_KI270391v1', 'chrUn_KI270383v1', 'chrUn_KI270393v1', 'chrUn_KI270384v1', 'chrUn_KI270392v1', 'chrUn_KI270381v1', 'chrUn_KI270385v1', 'chrUn_KI270382v1', 'chrUn_KI270376v1', 'chrUn_KI270374v1', 'chrUn_KI270372v1', 'chrUn_KI270373v1', 'chrUn_KI270375v1', 'chrUn_KI270371v1', 'chrUn_KI270448v1', 'chrUn_KI270521v1', 'chrUn_GL000220v1', 'chrUn_GL000224v1', 'chrUn_KI270741v1', 'chrUn_GL000226v1', 'chrUn_GL000213v1', 'chrUn_KI270743v1', 'chrUn_KI270744v1', 'chrUn_KI270745v1', 'chrUn_KI270746v1', 'chrUn_KI270747v1', 'chrUn_KI270748v1', 'chrUn_KI270749v1', 'chrUn_KI270751v1', 'chrUn_KI270752v1', 'chrUn_KI270753v1', 'chrUn_KI270754v1', 'chrUn_KI270755v1', 'chrUn_KI270756v1', 'chrUn_KI270757v1', 'chrUn_GL000214v1', 'chrUn_KI270742v1', 'chrUn_GL000216v2']
+Keeping 37 chunk(s) of reads: ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chr1_KI270708v1_random', 'chr14_GL000225v1_random', 'chr14_KI270722v1_random', 'chr14_KI270725v1_random', 'chr17_GL000205v2_random', 'chr22_KI270733v1_random', 'chr22_KI270738v1_random', 'chrUn_KI270442v1', 'chrUn_GL000195v1', 'chrUn_GL000219v1', 'chrUn_KI270750v1', 'chrUn_GL000218v1', 'chrEBV']
+
+Command completed. Elapsed time: 0:00:02. Running peak memory: 3.465GB. + PID: 319419; Command: /scratch/jps3dp/tools/databio/peppro/tools/bamQC.py; Return code: 0; Memory used: 0.034GB + + +> `awk '{ for (i=1; i<=NF; ++i) { if ($i ~ "NRF") c=i } getline; print $c }' /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_bamQC.tsv` + +> `awk '{ for (i=1; i<=NF; ++i) { if ($i ~ "PBC1") c=i } getline; print $c }' /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_bamQC.tsv` + +> `awk '{ for (i=1; i<=NF; ++i) { if ($i ~ "PBC2") c=i } getline; print $c }' /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_bamQC.tsv` + +> `NRF` 1.0 PEPPRO _RES_ + +> `PBC1` 15975.0 PEPPRO _RES_ + +> `PBC2` 15975.0 PEPPRO _RES_ +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_unmap.bam` + +> `samtools view -b -@ 1 -f 12 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_temp.bam > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_unmap.bam` (319425) +
+
+Command completed. Elapsed time: 0:00:02. Running peak memory: 3.465GB. + PID: 319425; Command: samtools; Return code: 0; Memory used: 0.006GB + + +> `samtools view -c -f 4 -@ 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_temp.bam` + +> `Unmapped_reads` 468224 PEPPRO _RES_ + +### Split BAM by strand (11-27 14:02:35) elapsed: 5.0 _TIME_ + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam`,`/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam` + +> `samtools view -bh -F 20 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam` (319430) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319430; Command: samtools; Return code: 0; Memory used: 0.002GB + + +> `samtools view -bh -f 16 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam` (319432) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319432; Command: samtools; Return code: 0; Memory used: 0.002GB + + +### Calculate TSS enrichment (11-27 14:02:36) elapsed: 1.0 _TIME_ + +Missing stat 'TSS_Minus_Score' +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/plus_TSS.tsv`,`/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/minus_TSS.tsv` + +> `sed -n -e '/[[:space:]]+/w /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/plus_TSS.tsv' -e '/[[:space:]]-/w /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/minus_TSS.tsv' /scratch/jps3dp/DATA/genomes/hg38/refgene_anno/default/hg38_TSS.bed` (319434) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319434; Command: sed; Return code: 0; Memory used: 0.001GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_plus_TssEnrichment.txt` + +> `/scratch/jps3dp/tools/databio/peppro/tools/pyTssEnrichment.py -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/plus_TSS.tsv -p ends -c 1 -z -v -s 6 -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_plus_TssEnrichment.txt` (319436) +
+
+Command completed. Elapsed time: 0:00:04. Running peak memory: 3.465GB. + PID: 319436; Command: /scratch/jps3dp/tools/databio/peppro/tools/pyTssEnrichment.py; Return code: 0; Memory used: 0.078GB + + +> `TSS_Plus_Score` 33.3 PEPPRO _RES_ +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_minus_TssEnrichment.txt` + +> `/scratch/jps3dp/tools/databio/peppro/tools/pyTssEnrichment.py -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/minus_TSS.tsv -p ends -c 1 -z -v -s 6 -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_minus_TssEnrichment.txt` (319444) +
+
+Command completed. Elapsed time: 0:00:04. Running peak memory: 3.465GB. + PID: 319444; Command: /scratch/jps3dp/tools/databio/peppro/tools/pyTssEnrichment.py; Return code: 0; Memory used: 0.079GB + + +> `TSS_Minus_Score` 4.3 PEPPRO _RES_ +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_TSSenrichment.pdf` + +> `Rscript /scratch/jps3dp/tools/databio/peppro/tools/PEPPRO.R tss -i /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_plus_TssEnrichment.txt /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_minus_TssEnrichment.txt` (319451) +
+
+Generating TSS plot with /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_plus_TssEnrichment.txt and /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_minus_TssEnrichment.txt
+TSS enrichment plot completed!
+
+
+Command completed. Elapsed time: 0:00:07. Running peak memory: 3.465GB. + PID: 319451; Command: Rscript; Return code: 0; Memory used: 0.136GB + +> `TSS enrichment` QC_hg38/tutorial_TSSenrichment.pdf TSS enrichment QC_hg38/tutorial_TSSenrichment.png PEPPRO _OBJ_ +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt` + +> `samtools view -H /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam | grep 'SN:' | awk -F':' '{print $2,$3}' | awk -F' ' -v OFS=' ' '{print $1,$3}' > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt` (319486,319488,319489,319490) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319486; Command: samtools; Return code: 0; Memory used: 0.0GB + PID: 319489; Command: awk; Return code: 0; Memory used: 0.0GB + PID: 319488; Command: grep; Return code: 0; Memory used: 0.0GB + PID: 319490; Command: awk; Return code: 0; Memory used: 0.0GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_keep.txt` + +> `cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_keep.txt` (319492) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319492; Command: cut; Return code: 0; Memory used: 0.002GB + + +### Calculate Pause Index (PI) (11-27 14:02:51) elapsed: 16.0 _TIME_ + +Missing stat 'Pause_index' +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_ensembl_tss.bed`,`/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_ensembl_gene_body.bed` + +> `grep -wf /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_keep.txt /scratch/jps3dp/DATA/genomes/hg38/ensembl_gtf/default/hg38_ensembl_TSS.bed | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_ensembl_tss.bed` (319494,319495) +
+
+Command completed. Elapsed time: 0:00:03. Running peak memory: 3.465GB. + PID: 319494; Command: grep; Return code: 0; Memory used: 0.003GB + PID: 319495; Command: bedtools; Return code: 0; Memory used: 0.093GB + + +> `grep -wf /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_keep.txt /scratch/jps3dp/DATA/genomes/hg38/ensembl_gtf/default/hg38_ensembl_gene_body.bed | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_ensembl_gene_body.bed` (319499,319500) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319499; Command: grep; Return code: 0; Memory used: 0.003GB + PID: 319500; Command: bedtools; Return code: 0; Memory used: 0.022GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_TSS_density.bed` + +> `bedtools coverage -sorted -counts -s -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_ensembl_tss.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | awk '$7>0' | sort -k4,4 -k7,7nr | sort -k4,4 -u > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_TSS_density.bed` (319503,319504,319505,319506) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319503; Command: bedtools; Return code: 0; Memory used: 0.005GB + PID: 319505; Command: sort; Return code: 0; Memory used: 0.002GB + PID: 319504; Command: awk; Return code: 0; Memory used: 0.001GB + PID: 319506; Command: sort; Return code: 0; Memory used: 0.003GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_gene_body_density.bed` + +> `bedtools coverage -sorted -counts -s -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_ensembl_gene_body.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | awk '$7>0' | sort -k4 > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_gene_body_density.bed` (319508,319509,319510) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319509; Command: awk; Return code: 0; Memory used: 0.001GB + PID: 319508; Command: bedtools; Return code: 0; Memory used: 0.017GB + PID: 319510; Command: sort; Return code: 0; Memory used: 0.003GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.bed` + +> `join --nocheck-order -j4 -o 1.1 1.2 1.3 1.4 1.6 1.7 2.2 2.3 2.7 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_TSS_density.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_gene_body_density.bed | awk -v OFS=' ' '{print $1, $2, $3, $4, ($6/($3-$2))/($9/($8-$7)), $5}' | env LC_COLLATE=C sort -k1,1 -k2,2n > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.bed` (319513,319514,319515) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319513; Command: join; Return code: 0; Memory used: 0.0GB + PID: 319514; Command: awk; Return code: 0; Memory used: 0.0GB + PID: 319515; Command: env; Return code: 0; Memory used: 0.003GB + + +> `sort -k5,5n /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.bed | awk ' { a[i++]=$5; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }'` + +> `Pause_index` 228.97 PEPPRO _RES_ +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.pdf` + +> `Rscript /scratch/jps3dp/tools/databio/peppro/tools/PEPPRO.R pi -i /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.bed` (319521) +
+Pause index plot completed!
+
+
+Command completed. Elapsed time: 0:00:07. Running peak memory: 3.465GB. + PID: 319521; Command: Rscript; Return code: 0; Memory used: 0.201GB + +> `Pause index` QC_hg38/tutorial_pause_index.pdf Pause index QC_hg38/tutorial_pause_index.png PEPPRO _OBJ_ +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.bed.gz` + +> `gzip -f -f /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.bed` (319553) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319553; Command: gzip; Return code: 0; Memory used: 0.0GB + + +### Calculate FRiP (11-27 14:03:04) elapsed: 12.0 _TIME_ + + +> `samtools view -@ 4 -c -L /scratch/jps3dp/DATA/genomes/hg38/refgene_anno/default/hg38_pre-mRNA.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam` +156906.0 11064 + +> `Plus FRiP` 0.07 PEPPRO _RES_ + +> `samtools view -@ 4 -c -L /scratch/jps3dp/DATA/genomes/hg38/refgene_anno/default/hg38_pre-mRNA.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam` +156906.0 10420 + +> `Minus FRiP` 0.07 PEPPRO _RES_ + +### Plot fragment distribution (11-27 14:03:04) elapsed: 0.0 _TIME_ + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_fragLenDistribution.pdf` + +> `perl /scratch/jps3dp/tools/databio/peppro/tools/fragment_length_dist.pl /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_fragLen.txt` (319566) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319566; Command: perl; Return code: 0; Memory used: 0.0GB + + +> `sort -n /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_fragLen.txt | uniq -c > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_fragCount.txt` (319568,319569) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319568; Command: sort; Return code: 0; Memory used: 0.0GB + PID: 319569; Command: uniq; Return code: 0; Memory used: 0.002GB + + +> `Rscript /scratch/jps3dp/tools/databio/peppro/tools/PEPPRO.R frag -l /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_fragLen.txt -c /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_fragCount.txt -p /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_fragLenDistribution.pdf -t /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_fragLenDistribution.txt` (319571) +
+Fragment distribution plot completed!
+
+
+Command completed. Elapsed time: 0:00:05. Running peak memory: 3.465GB. + PID: 319571; Command: Rscript; Return code: 0; Memory used: 0.234GB + +> `Fragment distribution` QC_hg38/tutorial_fragLenDistribution.pdf Fragment distribution QC_hg38/tutorial_fragLenDistribution.png PEPPRO _OBJ_ +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/hg38_annotations.bed` + +> `ln -sf /scratch/jps3dp/DATA/genomes/hg38/feat_annotation/default/hg38_annotations.bed.gz /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/hg38_annotations.bed.gz` (319602) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319602; Command: ln; Return code: 0; Memory used: 0.0GB + + +> `gzip -f -d -c /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/hg38_annotations.bed.gz > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/hg38_annotations.bed` (319603) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319603; Command: gzip; Return code: 0; Memory used: 0.002GB + + +### Calculate fraction of reads in features (FRiF) (11-27 14:03:10) elapsed: 6.0 _TIME_ + + +> `cut -f 4 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/hg38_annotations.bed | sort -u` +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/3' UTR` + +> `awk -F' ' '{print>"/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/"$4}' /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/raw/hg38_annotations.bed` (319609) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319609; Command: awk; Return code: 0; Memory used: 0.002GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/3_UTR` + +> `mv "/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/3' UTR" "/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/3_UTR"` (319612) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319612; Command: mv; Return code: 0; Memory used: 0.0GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/3_UTR_sort.bed` + +> `cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | grep -wf - /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/3_UTR | cut -f 1-3 | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/3_UTR_sort.bed` (319613,319614,319615,319616) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319613; Command: cut; Return code: 0; Memory used: 0.0GB + PID: 319614; Command: grep; Return code: 0; Memory used: 0.002GB + PID: 319616; Command: bedtools; Return code: 0; Memory used: 0.005GB + PID: 319615; Command: cut; Return code: 0; Memory used: 0.001GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_3_UTR_plus_coverage.bed` + +> `bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/3_UTR_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_3_UTR_plus_coverage.bed` (319618) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319618; Command: bedtools; Return code: 0; Memory used: 0.002GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_3_UTR_minus_coverage.bed` + +> `bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/3_UTR_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_3_UTR_minus_coverage.bed` (319621) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319621; Command: bedtools; Return code: 0; Memory used: 0.002GB + +Target exists: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/5' UTR` +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/5_UTR` + +> `mv "/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/5' UTR" "/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/5_UTR"` (319623) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319623; Command: mv; Return code: 0; Memory used: 0.0GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/5_UTR_sort.bed` + +> `cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | grep -wf - /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/5_UTR | cut -f 1-3 | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/5_UTR_sort.bed` (319624,319625,319626,319627) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319624; Command: cut; Return code: 0; Memory used: 0.0GB + PID: 319625; Command: grep; Return code: 0; Memory used: 0.002GB + PID: 319627; Command: bedtools; Return code: 0; Memory used: 0.03GB + PID: 319626; Command: cut; Return code: 0; Memory used: 0.001GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_5_UTR_plus_coverage.bed` + +> `bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/5_UTR_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_5_UTR_plus_coverage.bed` (319630) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319630; Command: bedtools; Return code: 0; Memory used: 0.002GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_5_UTR_minus_coverage.bed` + +> `bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/5_UTR_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_5_UTR_minus_coverage.bed` (319632) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319632; Command: bedtools; Return code: 0; Memory used: 0.002GB + +Target exists: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Enhancer` +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Enhancer_sort.bed` + +> `cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | grep -wf - /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Enhancer | cut -f 1-3 | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Enhancer_sort.bed` (319634,319635,319636,319637) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319634; Command: cut; Return code: 0; Memory used: 0.0GB + PID: 319636; Command: cut; Return code: 0; Memory used: 0.001GB + PID: 319635; Command: grep; Return code: 0; Memory used: 0.002GB + PID: 319637; Command: bedtools; Return code: 0; Memory used: 0.005GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Enhancer_plus_coverage.bed` + +> `bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Enhancer_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Enhancer_plus_coverage.bed` (319640) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319640; Command: bedtools; Return code: 0; Memory used: 0.002GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Enhancer_minus_coverage.bed` + +> `bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Enhancer_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Enhancer_minus_coverage.bed` (319642) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319642; Command: bedtools; Return code: 0; Memory used: 0.002GB + +Target exists: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Exon` +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Exon_sort.bed` + +> `cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | grep -wf - /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Exon | cut -f 1-3 | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Exon_sort.bed` (319644,319645,319646,319647) +
+
+Command completed. Elapsed time: 0:00:04. Running peak memory: 3.465GB. + PID: 319644; Command: cut; Return code: 0; Memory used: 0.0GB + PID: 319645; Command: grep; Return code: 0; Memory used: 0.003GB + PID: 319647; Command: bedtools; Return code: 0; Memory used: 0.158GB + PID: 319646; Command: cut; Return code: 0; Memory used: 0.001GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Exon_plus_coverage.bed` + +> `bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Exon_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Exon_plus_coverage.bed` (319661) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319661; Command: bedtools; Return code: 0; Memory used: 0.002GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Exon_minus_coverage.bed` + +> `bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Exon_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Exon_minus_coverage.bed` (319664) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319664; Command: bedtools; Return code: 0; Memory used: 0.002GB + +Target exists: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Intron` +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Intron_sort.bed` + +> `cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | grep -wf - /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Intron | cut -f 1-3 | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Intron_sort.bed` (319667,319668,319669,319670) +
+
+Command completed. Elapsed time: 0:00:02. Running peak memory: 3.465GB. + PID: 319667; Command: cut; Return code: 0; Memory used: 0.0GB + PID: 319670; Command: bedtools; Return code: 0; Memory used: 0.073GB + PID: 319668; Command: grep; Return code: 0; Memory used: 0.002GB + PID: 319669; Command: cut; Return code: 0; Memory used: 0.001GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Intron_plus_coverage.bed` + +> `bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Intron_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Intron_plus_coverage.bed` (319704) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319704; Command: bedtools; Return code: 0; Memory used: 0.002GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Intron_minus_coverage.bed` + +> `bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Intron_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Intron_minus_coverage.bed` (319706) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319706; Command: bedtools; Return code: 0; Memory used: 0.002GB + +Target exists: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter` +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_sort.bed` + +> `cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | grep -wf - /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter | cut -f 1-3 | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_sort.bed` (319709,319710,319711,319712) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319709; Command: cut; Return code: 0; Memory used: 0.0GB + PID: 319710; Command: grep; Return code: 0; Memory used: 0.003GB + PID: 319712; Command: bedtools; Return code: 0; Memory used: 0.005GB + PID: 319711; Command: cut; Return code: 0; Memory used: 0.001GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_plus_coverage.bed` + +> `bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_plus_coverage.bed` (319714) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319714; Command: bedtools; Return code: 0; Memory used: 0.002GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_minus_coverage.bed` + +> `bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_minus_coverage.bed` (319716) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319716; Command: bedtools; Return code: 0; Memory used: 0.002GB + +Target exists: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter Flanking Region` +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_Flanking_Region` + +> `mv "/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter Flanking Region" "/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_Flanking_Region"` (319718) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319718; Command: mv; Return code: 0; Memory used: 0.001GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_Flanking_Region_sort.bed` + +> `cut -f 1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | grep -wf - /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_Flanking_Region | cut -f 1-3 | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_Flanking_Region_sort.bed` (319719,319721,319722,319723) +
+
+Command completed. Elapsed time: 0:00:02. Running peak memory: 3.465GB. + PID: 319719; Command: cut; Return code: 0; Memory used: 0.0GB + PID: 319722; Command: cut; Return code: 0; Memory used: 0.001GB + PID: 319721; Command: grep; Return code: 0; Memory used: 0.002GB + PID: 319723; Command: bedtools; Return code: 0; Memory used: 0.004GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_Flanking_Region_plus_coverage.bed` + +> `bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_Flanking_Region_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_Flanking_Region_plus_coverage.bed` (319726) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319726; Command: bedtools; Return code: 0; Memory used: 0.002GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_Flanking_Region_minus_coverage.bed` + +> `bedtools coverage -sorted -counts -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/Promoter_Flanking_Region_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_Flanking_Region_minus_coverage.bed` (319728) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319728; Command: bedtools; Return code: 0; Memory used: 0.002GB + + +### Plot FRiF (11-27 14:03:29) elapsed: 19.0 _TIME_ + + +> `samtools view -@ 1 -q 10 -c -F4 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam` +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_plus_frif.pdf` + +> `Rscript /scratch/jps3dp/tools/databio/peppro/tools/PEPPRO.R frif -n tutorial -r 16265 -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_plus_frif.pdf --bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_3_UTR_plus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_5_UTR_plus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Enhancer_plus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Exon_plus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Intron_plus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_plus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_Flanking_Region_plus_coverage.bed` (319731) +
+Cumulative FRiF plot completed!
+
+
+Command completed. Elapsed time: 0:00:27. Running peak memory: 3.465GB. + PID: 319731; Command: Rscript; Return code: 0; Memory used: 0.44GB + +> `Plus FRiF` QC_hg38/tutorial_plus_frif.pdf Plus FRiF QC_hg38/tutorial_plus_frif.png PEPPRO _OBJ_ + +> `samtools view -@ 1 -q 10 -c -F4 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam` +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_minus_frif.pdf` + +> `Rscript /scratch/jps3dp/tools/databio/peppro/tools/PEPPRO.R frif -n tutorial -r 15685 -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_minus_frif.pdf --bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_3_UTR_minus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_5_UTR_minus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Enhancer_minus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Exon_minus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Intron_minus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_minus_coverage.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_Promoter_Flanking_Region_minus_coverage.bed` (319781) +
+Cumulative FRiF plot completed!
+
+
+Command completed. Elapsed time: 0:00:26. Running peak memory: 3.465GB. + PID: 319781; Command: Rscript; Return code: 0; Memory used: 0.446GB + +> `Minus FRiF` QC_hg38/tutorial_minus_frif.pdf Minus FRiF QC_hg38/tutorial_minus_frif.png PEPPRO _OBJ_ + +### Calculate mRNA contamination (11-27 14:04:23) elapsed: 54.0 _TIME_ + +Missing stat 'mRNA_contamination' +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_exons_sort.bed`,`/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_introns_sort.bed` + +> `grep -wf /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_keep.txt /scratch/jps3dp/DATA/genomes/hg38/refgene_anno/default/hg38_exons.bed | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_exons_sort.bed` (319833,319834) +
+
+Command completed. Elapsed time: 0:00:06. Running peak memory: 3.465GB. + PID: 319834; Command: bedtools; Return code: 0; Memory used: 0.08GB + PID: 319833; Command: grep; Return code: 0; Memory used: 0.005GB + + +> `grep -wf /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_keep.txt /scratch/jps3dp/DATA/genomes/hg38/refgene_anno/default/hg38_introns.bed | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt | bedtools sort -i stdin -faidx /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_introns_sort.bed` (319841,319842,319843) +
+
+Command completed. Elapsed time: 0:00:07. Running peak memory: 3.465GB. + PID: 319842; Command: bedtools; Return code: 0; Memory used: 0.084GB + PID: 319841; Command: grep; Return code: 0; Memory used: 0.005GB + PID: 319843; Command: bedtools; Return code: 0; Memory used: 0.092GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exons_coverage.bed`,`/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_introns_coverage.bed` + +> `bedtools coverage -sorted -counts -s -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_exons_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exons_coverage.bed` (319851) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319851; Command: bedtools; Return code: 0; Memory used: 0.002GB + + +> `bedtools coverage -sorted -counts -s -a /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/hg38_introns_sort.bed -b /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_PE1.bam -g /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/chr_order.txt > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_introns_coverage.bed` (319853) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319853; Command: bedtools; Return code: 0; Memory used: 0.002GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exons_rpkm.bed` + +> `awk -v OFS=' ' '{chrom[$4] = $1; if($4!=prev4) {chromStart[$4] = $2} strand[$4] = $6; readCount[$4] += $7; exonCount[$4] += 1; geneSizeKB[$4] += (sqrt(($3-$2+0.00000001)^2)/1000); gene[$4] = $4; chromEnd[$4]=$3; prev4=$4} END { for (a in readCount) { print chrom[a], chromStart[a], chromEnd[a], gene[a], (readCount[a]/0.156906)/geneSizeKB[a], strand[a]}}' /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exons_coverage.bed | awk '$5>0' | sort -k4 > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exons_rpkm.bed` (319862,319863,319864) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319862; Command: awk; Return code: 0; Memory used: 0.009GB + PID: 319864; Command: sort; Return code: 0; Memory used: 0.003GB + PID: 319863; Command: awk; Return code: 0; Memory used: 0.001GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_introns_rpkm.bed` + +> `awk -v OFS=' ' '{chrom[$4] = $1; if($4!=prev4) {chromStart[$4] = $2} strand[$4] = $6; readCount[$4] += $7; exonCount[$4] += 1; geneSizeKB[$4] += (sqrt(($3-$2+0.00000001)^2)/1000); gene[$4] = $4; chromEnd[$4]=$3; prev4=$4} END { for (a in readCount) { print chrom[a], chromStart[a], chromEnd[a], gene[a], (readCount[a]/0.156906)/geneSizeKB[a], strand[a]}}' /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_introns_coverage.bed | awk '$5>0' | sort -k4 > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_introns_rpkm.bed` (319866,319867,319868) +
+
+Command completed. Elapsed time: 0:00:01. Running peak memory: 3.465GB. + PID: 319866; Command: awk; Return code: 0; Memory used: 0.007GB + PID: 319868; Command: sort; Return code: 0; Memory used: 0.003GB + PID: 319867; Command: awk; Return code: 0; Memory used: 0.001GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exon_intron_ratios.bed` + +> `join --nocheck-order -a1 -a2 -j4 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_introns_rpkm.bed /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exons_rpkm.bed | awk -v OFS=' ' 'NF==11 {print $7, $8, $9, $1, ($10/$5), $11}' | sort -k1,1 -k2,2n > /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exon_intron_ratios.bed` (319871,319872,319873) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319871; Command: join; Return code: 0; Memory used: 0.0GB + PID: 319872; Command: awk; Return code: 0; Memory used: 0.0GB + PID: 319873; Command: sort; Return code: 0; Memory used: 0.002GB + + +> `awk '{print $5}' /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exon_intron_ratios.bed | sort -n | awk ' { a[i++]=$1; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }'` + +> `mRNA_contamination` 4.26 PEPPRO _RES_ +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_mRNA_contamination.pdf` + +> `Rscript /scratch/jps3dp/tools/databio/peppro/tools/PEPPRO.R mrna -i /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exon_intron_ratios.bed --raw` (319879) +
+mRNA contamination plot completed!
+
+
+Command completed. Elapsed time: 0:00:05. Running peak memory: 3.465GB. + PID: 319879; Command: Rscript; Return code: 0; Memory used: 0.215GB + +> `mRNA contamination` QC_hg38/tutorial_mRNA_contamination.pdf mRNA contamination QC_hg38/tutorial_mRNA_contamination.png PEPPRO _OBJ_ +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exon_intron_ratios.bed.gz` + +> `gzip -f -f /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/QC_hg38/tutorial_exon_intron_ratios.bed` (319922) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319922; Command: gzip; Return code: 0; Memory used: 0.001GB + + +### Produce bigWig files (11-27 14:04:44) elapsed: 21.0 _TIME_ + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/signal_hg38/tutorial_plus_body_0-mer.bw` + +> `samtools index /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam` (319924) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 319924; Command: samtools; Return code: 0; Memory used: 0.0GB + + +> `/scratch/jps3dp/tools/databio/peppro/tools/bamSitesToWig.py -i /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam -c /scratch/jps3dp/DATA/genomes/hg38/fasta/default/hg38.chrom.sizes -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/signal_hg38/tutorial_plus_body_0-mer.bw -p 1 --variable-step --tail-edge` (319925) +
+Registering input file: '/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_plus.bam'
+Temporary files will be stored in: 'tmp_tutorial_plus_cuttrace_oxinje4r'
+Processing with 1 cores...
+Discarding 165 chunk(s) of reads: ['chrM', 'chr1_KI270706v1_random', 'chr1_KI270707v1_random', 'chr1_KI270708v1_random', 'chr1_KI270709v1_random', 'chr1_KI270710v1_random', 'chr1_KI270711v1_random', 'chr1_KI270712v1_random', 'chr1_KI270713v1_random', 'chr1_KI270714v1_random', 'chr2_KI270715v1_random', 'chr2_KI270716v1_random', 'chr3_GL000221v1_random', 'chr4_GL000008v2_random', 'chr5_GL000208v1_random', 'chr9_KI270717v1_random', 'chr9_KI270718v1_random', 'chr9_KI270719v1_random', 'chr9_KI270720v1_random', 'chr11_KI270721v1_random', 'chr14_GL000009v2_random', 'chr14_GL000225v1_random', 'chr14_GL000194v1_random', 'chr14_KI270723v1_random', 'chr14_KI270724v1_random', 'chr14_KI270726v1_random', 'chr15_KI270727v1_random', 'chr16_KI270728v1_random', 'chr17_GL000205v2_random', 'chr17_KI270729v1_random', 'chr17_KI270730v1_random', 'chr22_KI270731v1_random', 'chr22_KI270732v1_random', 'chr22_KI270734v1_random', 'chr22_KI270735v1_random', 'chr22_KI270736v1_random', 'chr22_KI270737v1_random', 'chr22_KI270739v1_random', 'chrY_KI270740v1_random', 'chrUn_KI270302v1', 'chrUn_KI270304v1', 'chrUn_KI270303v1', 'chrUn_KI270305v1', 'chrUn_KI270322v1', 'chrUn_KI270320v1', 'chrUn_KI270310v1', 'chrUn_KI270316v1', 'chrUn_KI270315v1', 'chrUn_KI270312v1', 'chrUn_KI270311v1', 'chrUn_KI270317v1', 'chrUn_KI270412v1', 'chrUn_KI270411v1', 'chrUn_KI270414v1', 'chrUn_KI270419v1', 'chrUn_KI270418v1', 'chrUn_KI270420v1', 'chrUn_KI270424v1', 'chrUn_KI270417v1', 'chrUn_KI270422v1', 'chrUn_KI270423v1', 'chrUn_KI270425v1', 'chrUn_KI270429v1', 'chrUn_KI270442v1', 'chrUn_KI270466v1', 'chrUn_KI270465v1', 'chrUn_KI270467v1', 'chrUn_KI270435v1', 'chrUn_KI270438v1', 'chrUn_KI270468v1', 'chrUn_KI270510v1', 'chrUn_KI270509v1', 'chrUn_KI270518v1', 'chrUn_KI270508v1', 'chrUn_KI270516v1', 'chrUn_KI270512v1', 'chrUn_KI270519v1', 'chrUn_KI270522v1', 'chrUn_KI270511v1', 'chrUn_KI270515v1', 'chrUn_KI270507v1', 'chrUn_KI270517v1', 'chrUn_KI270529v1', 'chrUn_KI270528v1', 'chrUn_KI270530v1', 'chrUn_KI270539v1', 
'chrUn_KI270538v1', 'chrUn_KI270544v1', 'chrUn_KI270548v1', 'chrUn_KI270583v1', 'chrUn_KI270587v1', 'chrUn_KI270580v1', 'chrUn_KI270581v1', 'chrUn_KI270579v1', 'chrUn_KI270589v1', 'chrUn_KI270590v1', 'chrUn_KI270584v1', 'chrUn_KI270582v1', 'chrUn_KI270588v1', 'chrUn_KI270593v1', 'chrUn_KI270591v1', 'chrUn_KI270330v1', 'chrUn_KI270329v1', 'chrUn_KI270334v1', 'chrUn_KI270333v1', 'chrUn_KI270335v1', 'chrUn_KI270338v1', 'chrUn_KI270340v1', 'chrUn_KI270336v1', 'chrUn_KI270337v1', 'chrUn_KI270363v1', 'chrUn_KI270364v1', 'chrUn_KI270362v1', 'chrUn_KI270366v1', 'chrUn_KI270378v1', 'chrUn_KI270379v1', 'chrUn_KI270389v1', 'chrUn_KI270390v1', 'chrUn_KI270387v1', 'chrUn_KI270395v1', 'chrUn_KI270396v1', 'chrUn_KI270388v1', 'chrUn_KI270394v1', 'chrUn_KI270386v1', 'chrUn_KI270391v1', 'chrUn_KI270383v1', 'chrUn_KI270393v1', 'chrUn_KI270384v1', 'chrUn_KI270392v1', 'chrUn_KI270381v1', 'chrUn_KI270385v1', 'chrUn_KI270382v1', 'chrUn_KI270376v1', 'chrUn_KI270374v1', 'chrUn_KI270372v1', 'chrUn_KI270373v1', 'chrUn_KI270375v1', 'chrUn_KI270371v1', 'chrUn_KI270448v1', 'chrUn_KI270521v1', 'chrUn_GL000220v1', 'chrUn_GL000224v1', 'chrUn_KI270741v1', 'chrUn_GL000226v1', 'chrUn_GL000213v1', 'chrUn_KI270743v1', 'chrUn_KI270744v1', 'chrUn_KI270745v1', 'chrUn_KI270746v1', 'chrUn_KI270747v1', 'chrUn_KI270748v1', 'chrUn_KI270749v1', 'chrUn_KI270750v1', 'chrUn_KI270751v1', 'chrUn_KI270752v1', 'chrUn_KI270753v1', 'chrUn_KI270754v1', 'chrUn_KI270755v1', 'chrUn_KI270756v1', 'chrUn_KI270757v1', 'chrUn_GL000214v1', 'chrUn_KI270742v1', 'chrUn_GL000216v2', 'chrUn_GL000218v1', 'chrEBV']
+Keeping 30 chunk(s) of reads: ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chr14_KI270722v1_random', 'chr14_KI270725v1_random', 'chr22_KI270733v1_random', 'chr22_KI270738v1_random', 'chrUn_GL000195v1', 'chrUn_GL000219v1']
+Reduce step (merge files)...
+Merging 30 files into output file: '/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/signal_hg38/tutorial_plus_body_0-mer.bw'
+
+Command completed. Elapsed time: 0:00:03. Running peak memory: 3.465GB. + PID: 319925; Command: /scratch/jps3dp/tools/databio/peppro/tools/bamSitesToWig.py; Return code: 0; Memory used: 0.032GB + +Target to produce: `/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/signal_hg38/tutorial_minus_body_0-mer.bw` + +> `samtools index /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam` (320065) +
+
+Command completed. Elapsed time: 0:00:00. Running peak memory: 3.465GB. + PID: 320065; Command: samtools; Return code: 0; Memory used: 0.001GB + + +> `/scratch/jps3dp/tools/databio/peppro/tools/bamSitesToWig.py -i /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam -c /scratch/jps3dp/DATA/genomes/hg38/fasta/default/hg38.chrom.sizes -o /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/signal_hg38/tutorial_minus_body_0-mer.bw -p 1 --variable-step --tail-edge` (320066) +
+Registering input file: '/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/aligned_hg38/tutorial_minus.bam'
+Temporary files will be stored in: 'tmp_tutorial_minus_cuttrace_785gqf3g'
+Processing with 1 cores...
+Discarding 162 chunk(s) of reads: ['chrM', 'chr1_KI270706v1_random', 'chr1_KI270707v1_random', 'chr1_KI270709v1_random', 'chr1_KI270710v1_random', 'chr1_KI270711v1_random', 'chr1_KI270712v1_random', 'chr1_KI270713v1_random', 'chr1_KI270714v1_random', 'chr2_KI270715v1_random', 'chr2_KI270716v1_random', 'chr3_GL000221v1_random', 'chr4_GL000008v2_random', 'chr5_GL000208v1_random', 'chr9_KI270717v1_random', 'chr9_KI270718v1_random', 'chr9_KI270719v1_random', 'chr9_KI270720v1_random', 'chr11_KI270721v1_random', 'chr14_GL000009v2_random', 'chr14_KI270722v1_random', 'chr14_GL000194v1_random', 'chr14_KI270723v1_random', 'chr14_KI270724v1_random', 'chr14_KI270725v1_random', 'chr14_KI270726v1_random', 'chr15_KI270727v1_random', 'chr16_KI270728v1_random', 'chr17_KI270729v1_random', 'chr17_KI270730v1_random', 'chr22_KI270731v1_random', 'chr22_KI270732v1_random', 'chr22_KI270733v1_random', 'chr22_KI270734v1_random', 'chr22_KI270735v1_random', 'chr22_KI270736v1_random', 'chr22_KI270737v1_random', 'chr22_KI270738v1_random', 'chr22_KI270739v1_random', 'chrY_KI270740v1_random', 'chrUn_KI270302v1', 'chrUn_KI270304v1', 'chrUn_KI270303v1', 'chrUn_KI270305v1', 'chrUn_KI270322v1', 'chrUn_KI270320v1', 'chrUn_KI270310v1', 'chrUn_KI270316v1', 'chrUn_KI270315v1', 'chrUn_KI270312v1', 'chrUn_KI270311v1', 'chrUn_KI270317v1', 'chrUn_KI270412v1', 'chrUn_KI270411v1', 'chrUn_KI270414v1', 'chrUn_KI270419v1', 'chrUn_KI270418v1', 'chrUn_KI270420v1', 'chrUn_KI270424v1', 'chrUn_KI270417v1', 'chrUn_KI270422v1', 'chrUn_KI270423v1', 'chrUn_KI270425v1', 'chrUn_KI270429v1', 'chrUn_KI270466v1', 'chrUn_KI270465v1', 'chrUn_KI270467v1', 'chrUn_KI270435v1', 'chrUn_KI270438v1', 'chrUn_KI270468v1', 'chrUn_KI270510v1', 'chrUn_KI270509v1', 'chrUn_KI270518v1', 'chrUn_KI270508v1', 'chrUn_KI270516v1', 'chrUn_KI270512v1', 'chrUn_KI270519v1', 'chrUn_KI270522v1', 'chrUn_KI270511v1', 'chrUn_KI270515v1', 'chrUn_KI270507v1', 'chrUn_KI270517v1', 'chrUn_KI270529v1', 'chrUn_KI270528v1', 'chrUn_KI270530v1', 'chrUn_KI270539v1', 
'chrUn_KI270538v1', 'chrUn_KI270544v1', 'chrUn_KI270548v1', 'chrUn_KI270583v1', 'chrUn_KI270587v1', 'chrUn_KI270580v1', 'chrUn_KI270581v1', 'chrUn_KI270579v1', 'chrUn_KI270589v1', 'chrUn_KI270590v1', 'chrUn_KI270584v1', 'chrUn_KI270582v1', 'chrUn_KI270588v1', 'chrUn_KI270593v1', 'chrUn_KI270591v1', 'chrUn_KI270330v1', 'chrUn_KI270329v1', 'chrUn_KI270334v1', 'chrUn_KI270333v1', 'chrUn_KI270335v1', 'chrUn_KI270338v1', 'chrUn_KI270340v1', 'chrUn_KI270336v1', 'chrUn_KI270337v1', 'chrUn_KI270363v1', 'chrUn_KI270364v1', 'chrUn_KI270362v1', 'chrUn_KI270366v1', 'chrUn_KI270378v1', 'chrUn_KI270379v1', 'chrUn_KI270389v1', 'chrUn_KI270390v1', 'chrUn_KI270387v1', 'chrUn_KI270395v1', 'chrUn_KI270396v1', 'chrUn_KI270388v1', 'chrUn_KI270394v1', 'chrUn_KI270386v1', 'chrUn_KI270391v1', 'chrUn_KI270383v1', 'chrUn_KI270393v1', 'chrUn_KI270384v1', 'chrUn_KI270392v1', 'chrUn_KI270381v1', 'chrUn_KI270385v1', 'chrUn_KI270382v1', 'chrUn_KI270376v1', 'chrUn_KI270374v1', 'chrUn_KI270372v1', 'chrUn_KI270373v1', 'chrUn_KI270375v1', 'chrUn_KI270371v1', 'chrUn_KI270448v1', 'chrUn_KI270521v1', 'chrUn_GL000220v1', 'chrUn_GL000224v1', 'chrUn_KI270741v1', 'chrUn_GL000226v1', 'chrUn_GL000213v1', 'chrUn_KI270743v1', 'chrUn_KI270744v1', 'chrUn_KI270745v1', 'chrUn_KI270746v1', 'chrUn_KI270747v1', 'chrUn_KI270748v1', 'chrUn_KI270749v1', 'chrUn_KI270751v1', 'chrUn_KI270752v1', 'chrUn_KI270753v1', 'chrUn_KI270754v1', 'chrUn_KI270755v1', 'chrUn_KI270756v1', 'chrUn_KI270757v1', 'chrUn_GL000214v1', 'chrUn_KI270742v1', 'chrUn_GL000216v2']
+Keeping 33 chunk(s) of reads: ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chr1_KI270708v1_random', 'chr14_GL000225v1_random', 'chr17_GL000205v2_random', 'chrUn_KI270442v1', 'chrUn_GL000195v1', 'chrUn_GL000219v1', 'chrUn_KI270750v1', 'chrUn_GL000218v1', 'chrEBV']
+Reduce step (merge files)...
+Merging 33 files into output file: '/project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/signal_hg38/tutorial_minus_body_0-mer.bw'
+
+Command completed. Elapsed time: 0:00:03. Running peak memory: 3.465GB. + PID: 320066; Command: /scratch/jps3dp/tools/databio/peppro/tools/bamSitesToWig.py; Return code: 0; Memory used: 0.032GB + +Starting cleanup: 55 files; 3 conditional files for cleanup + +Cleaning up flagged intermediate files. . . + +Cleaning up conditional list. . . + +### Pipeline completed. Epilogue +* Elapsed time (this run): 0:09:52 +* Total elapsed time (all runs): 0:21:54 +* Peak memory (this run): 3.4646 GB +* Pipeline completed time: 2019-11-27 14:04:51 diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/PEPPRO_profile.tsv b/docs/files/examples/tutorial/results_pipeline/tutorial/PEPPRO_profile.tsv new file mode 100644 index 0000000..961fbda --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/PEPPRO_profile.tsv @@ -0,0 +1,141 @@ +# Pipeline started at 11-27 13:54:59 + +# pid hash cid runtime mem cmd lock +318626 bc58a717e9 1 0:00:00.050000 0.0001 ln lock.raw__tutorial_R1.fastq.gz +318628 54db041654 2 0:00:00.050000 0.0001 ln lock.raw__tutorial_R2.fastq.gz +318629 16fb38e0ee 3 0:00:01.660000 0.0024 gzip lock.fastq__tutorial_R2.fastq +318634 37badac3e3 4 0:00:01.620000 0.0024 gzip lock.fastq__tutorial_R2.fastq +318658 e86b68512a 5 0:00:10.530000 0.1226 lock.fastq__tutorial_R1_processed.fastq +318685 be5a6f7f18 5f 0:00:13.480000 0.164 fastqc lock.trimmed_fastqc +318711 7c535e5abb 6 0:00:07.560000 0.429 lock.fastq__tutorial_R2_trimmed.fastq +318724 e458d35d5a 7 0:00:00.350000 0.0015 cp lock.fastq__tutorial_R2_trimmed_dups.fastq +318725 d8e98f9576 8 0:00:02.510000 0.0786 fastq_pair lock.fastq__tutorial_R2_trimmed.fastq.paired.fq +318728 9a17915cbe 9 0:00:00.080000 0.0001 mv lock.fastq__repaired.flag +318730 dff4c2ab62 10 0:00:00.060000 0.0001 mv lock.fastq__repaired.flag +318731 947f1f1a82 11 0:00:00.090000 0.0015 touch lock.fastq__repaired.flag +318732 296b5185e1 12 0:00:00.070000 0.0015 mkfifo lock.prealignments__human_rDNA_bt2 +318734 3a5dd57f4b 14 
0:00:08.510000 0.0323 bowtie2 lock.prealignments__tutorial_human_rDNA_unmap_R2.fq.gz +318749 a907588c6e 15 0:05:45.670000 3.4646 bowtie2 lock.aligned_hg38__tutorial_sort.bam +318751 a907588c6e 17 0:05:50.780000 0.201 samtools lock.aligned_hg38__tutorial_sort.bam +318750 a907588c6e 16 0:05:50.800000 0.0035 samtools lock.aligned_hg38__tutorial_sort.bam +319341 8aff7ce549 18 0:00:08.070000 0.0076 samtools lock.aligned_hg38__tutorial_sort.bam +319372 6389e6be8a 19 0:00:04.330000 0.0009 gzip lock.prealignments__tutorial_human_rDNA_unmap_R1.fq.gz +319378 af7bed1671 20 0:00:04.550000 0.0009 gzip lock.prealignments__tutorial_human_rDNA_unmap_R2.fq.gz +319386 075c415608 21 0:00:00.830000 0.0006 samtools lock.aligned_hg38__tutorial_temp.bam.bai +319392 30e98b1907 22 0:00:00.420000 0.0004 samtools lock.aligned_hg38__tutorial_noMT.bam +319394 c06ec34b6c 23 0:00:00.180000 0.0 samtools lock.aligned_hg38__tutorial_noMT.bam +319396 c06ec34b6c 25 0:00:00.280000 0.0 grep lock.aligned_hg38__tutorial_noMT.bam +319395 c06ec34b6c 24 0:00:00.310000 0.0 cut lock.aligned_hg38__tutorial_noMT.bam +319397 c06ec34b6c 26 0:00:02.530000 0.0107 xargs lock.aligned_hg38__tutorial_noMT.bam +319402 66b5f83986 27 0:00:00.060000 0.0009 mv lock.aligned_hg38__tutorial_noMT.bam +319403 30e98b1907 28 0:00:00.420000 0.0001 samtools lock.aligned_hg38__tutorial_noMT.bam +319406 9b6b7f26f3 29 0:00:00.640000 0.0036 samtools lock.aligned_hg38__tutorial_PE2.bam +319407 9b6b7f26f3 30 0:00:00.880000 0.0134 samtools lock.aligned_hg38__tutorial_PE2.bam +319411 179e15ae28 31 0:00:02.280000 0.0036 samtools lock.aligned_hg38__tutorial_PE2.bam +319412 179e15ae28 32 0:00:04 0.0684 samtools lock.aligned_hg38__tutorial_PE2.bam +319418 d77e2f2d9e 33 0:00:00.150000 0.0006 samtools lock.aligned_hg38__tutorial_PE1.bam.bai +319419 6687104fcf 34 0:00:02.450000 0.0338 /scratch/jps3dp/tools/databio/peppro/tools/bamQC.py lock.QC_hg38__tutorial_bamQC.tsv +319425 57f22d174b 35 0:00:01.500000 0.0064 samtools 
lock.aligned_hg38__tutorial_unmap.bam +319430 3ddef1ab52 36 0:00:00.260000 0.0017 samtools lock.aligned_hg38__tutorial_minus.bam +319432 2a8fbc2447 37 0:00:00.230000 0.0025 samtools lock.aligned_hg38__tutorial_minus.bam +319434 84c64c13a5 38 0:00:00.310000 0.0011 sed lock.QC_hg38__minus_TSS.tsv +319436 2b230a4960 39 0:00:03.660000 0.0778 /scratch/jps3dp/tools/databio/peppro/tools/pyTssEnrichment.py lock.QC_hg38__tutorial_plus_TssEnrichment.txt +319444 912a997442 40 0:00:03.600000 0.0795 /scratch/jps3dp/tools/databio/peppro/tools/pyTssEnrichment.py lock.QC_hg38__tutorial_minus_TssEnrichment.txt +319451 bd4fb93a7a 41 0:00:07.310000 0.1359 Rscript lock.QC_hg38__tutorial_TSSenrichment.pdf +319486 7fa9f4b7a9 42 0:00:00.090000 0.0 samtools lock.QC_hg38__chr_order.txt +319489 7fa9f4b7a9 44 0:00:00.130000 0.0 awk lock.QC_hg38__chr_order.txt +319488 7fa9f4b7a9 43 0:00:00.170000 0.0 grep lock.QC_hg38__chr_order.txt +319490 7fa9f4b7a9 45 0:00:00.210000 0.0 awk lock.QC_hg38__chr_order.txt +319492 4b9e68a218 46 0:00:00.060000 0.0025 cut lock.QC_hg38__chr_keep.txt +319494 31a05f4fa4 47 0:00:02.560000 0.0027 grep lock.QC_hg38__hg38_ensembl_gene_body.bed +319495 31a05f4fa4 48 0:00:02.780000 0.093 bedtools lock.QC_hg38__hg38_ensembl_gene_body.bed +319499 30a1072307 49 0:00:00.520000 0.0027 grep lock.QC_hg38__hg38_ensembl_gene_body.bed +319500 30a1072307 50 0:00:00.600000 0.0223 bedtools lock.QC_hg38__hg38_ensembl_gene_body.bed +319503 0e544d6fa3 51 0:00:00.570000 0.0055 bedtools lock.QC_hg38__tutorial_TSS_density.bed +319505 0e544d6fa3 53 0:00:00.690000 0.0021 sort lock.QC_hg38__tutorial_TSS_density.bed +319504 0e544d6fa3 52 0:00:00.700000 0.0011 awk lock.QC_hg38__tutorial_TSS_density.bed +319506 0e544d6fa3 54 0:00:00.720000 0.0028 sort lock.QC_hg38__tutorial_TSS_density.bed +319509 279f147f5e 56 0:00:01.290000 0.0011 awk lock.QC_hg38__tutorial_gene_body_density.bed +319508 279f147f5e 55 0:00:01.310000 0.0174 bedtools lock.QC_hg38__tutorial_gene_body_density.bed +319510 279f147f5e 
57 0:00:01.330000 0.0028 sort lock.QC_hg38__tutorial_gene_body_density.bed +319513 bee2eaa093 58 0:00:00.080000 0.0 join lock.QC_hg38__tutorial_pause_index.bed +319514 bee2eaa093 59 0:00:00.140000 0.0 awk lock.QC_hg38__tutorial_pause_index.bed +319515 bee2eaa093 60 0:00:00.170000 0.0028 env lock.QC_hg38__tutorial_pause_index.bed +319521 01f414dda9 61 0:00:06.580000 0.2014 Rscript lock.QC_hg38__tutorial_pause_index.pdf +319553 ef73e45263 62 0:00:00.080000 0.0001 gzip lock.QC_hg38__tutorial_pause_index.bed.gz +319566 5a3b4bf31d 63 0:00:00.300000 0.0003 perl lock.QC_hg38__tutorial_fragLenDistribution.pdf +319568 24bfa2082e 64 0:00:00.060000 0.0 sort lock.QC_hg38__tutorial_fragLenDistribution.pdf +319569 24bfa2082e 65 0:00:00.210000 0.0015 uniq lock.QC_hg38__tutorial_fragLenDistribution.pdf +319571 24fff46f7c 66 0:00:04.870000 0.2339 Rscript lock.QC_hg38__tutorial_fragLenDistribution.pdf +319602 f7d296320b 67 0:00:00.060000 0.0004 ln lock.raw__hg38_annotations.bed +319603 7a5bc973b4 68 0:00:00.660000 0.0017 gzip lock.raw__hg38_annotations.bed +319609 d33ef2ebcd 69 0:00:01.480000 0.0023 awk lock.QC_hg38__3' UTR +319612 0447423dfc 70 0:00:00.060000 0.0001 mv lock.QC_hg38__3_UTR +319613 fead95d6c5 71 0:00:00.070000 0.0 cut lock.QC_hg38__3_UTR_sort.bed +319614 fead95d6c5 72 0:00:00.910000 0.0024 grep lock.QC_hg38__3_UTR_sort.bed +319616 fead95d6c5 74 0:00:00.970000 0.0052 bedtools lock.QC_hg38__3_UTR_sort.bed +319615 fead95d6c5 73 0:00:00.980000 0.0007 cut lock.QC_hg38__3_UTR_sort.bed +319618 392a89d5c4 75 0:00:00.220000 0.0017 bedtools lock.QC_hg38__tutorial_3_UTR_plus_coverage.bed +319621 eeb6820d7f 76 0:00:00.220000 0.0017 bedtools lock.QC_hg38__tutorial_3_UTR_minus_coverage.bed +319623 b18650b338 78 0:00:00.060000 0.0001 mv lock.QC_hg38__5_UTR +319624 eb23799279 79 0:00:00.070000 0.0 cut lock.QC_hg38__5_UTR_sort.bed +319625 eb23799279 80 0:00:00.790000 0.0024 grep lock.QC_hg38__5_UTR_sort.bed +319627 eb23799279 82 0:00:00.840000 0.0298 bedtools 
lock.QC_hg38__5_UTR_sort.bed +319626 eb23799279 81 0:00:00.860000 0.0007 cut lock.QC_hg38__5_UTR_sort.bed +319630 0b12fca9d9 83 0:00:00.220000 0.0018 bedtools lock.QC_hg38__tutorial_5_UTR_plus_coverage.bed +319632 7100119195 84 0:00:00.220000 0.0018 bedtools lock.QC_hg38__tutorial_5_UTR_minus_coverage.bed +319634 830e1624fc 86 0:00:00.070000 0.0 cut lock.QC_hg38__Enhancer_sort.bed +319636 830e1624fc 88 0:00:01.090000 0.0007 cut lock.QC_hg38__Enhancer_sort.bed +319635 830e1624fc 87 0:00:01.120000 0.0024 grep lock.QC_hg38__Enhancer_sort.bed +319637 830e1624fc 89 0:00:01.160000 0.0055 bedtools lock.QC_hg38__Enhancer_sort.bed +319640 b3db68078c 90 0:00:00.340000 0.0017 bedtools lock.QC_hg38__tutorial_Enhancer_plus_coverage.bed +319642 78af0e48a9 91 0:00:00.260000 0.0017 bedtools lock.QC_hg38__tutorial_Enhancer_minus_coverage.bed +319644 0f2ad92adf 93 0:00:00.070000 0.0 cut lock.QC_hg38__Exon_sort.bed +319645 0f2ad92adf 94 0:00:03.880000 0.0027 grep lock.QC_hg38__Exon_sort.bed +319647 0f2ad92adf 96 0:00:04.200000 0.1583 bedtools lock.QC_hg38__Exon_sort.bed +319646 0f2ad92adf 95 0:00:04.220000 0.0007 cut lock.QC_hg38__Exon_sort.bed +319661 303daf4354 97 0:00:00.620000 0.0018 bedtools lock.QC_hg38__tutorial_Exon_plus_coverage.bed +319664 d2401c1ff1 98 0:00:00.660000 0.0017 bedtools lock.QC_hg38__tutorial_Exon_minus_coverage.bed +319667 cad847accf 100 0:00:00.070000 0.0 cut lock.QC_hg38__Intron_sort.bed +319670 cad847accf 103 0:00:02.140000 0.073 bedtools lock.QC_hg38__Intron_sort.bed +319668 cad847accf 101 0:00:02.170000 0.0024 grep lock.QC_hg38__Intron_sort.bed +319669 cad847accf 102 0:00:02.180000 0.0007 cut lock.QC_hg38__Intron_sort.bed +319704 67069d4162 104 0:00:00.740000 0.0017 bedtools lock.QC_hg38__tutorial_Intron_plus_coverage.bed +319706 225222b305 105 0:00:00.740000 0.0017 bedtools lock.QC_hg38__tutorial_Intron_minus_coverage.bed +319709 25a9a2bd43 107 0:00:00.080000 0.0 cut lock.QC_hg38__Promoter_sort.bed +319710 25a9a2bd43 108 0:00:00.460000 0.0027 grep 
lock.QC_hg38__Promoter_sort.bed +319712 25a9a2bd43 110 0:00:00.480000 0.0051 bedtools lock.QC_hg38__Promoter_sort.bed +319711 25a9a2bd43 109 0:00:00.500000 0.0007 cut lock.QC_hg38__Promoter_sort.bed +319714 e056efedcf 111 0:00:00.180000 0.0017 bedtools lock.QC_hg38__tutorial_Promoter_plus_coverage.bed +319716 b469a5be96 112 0:00:00.180000 0.0017 bedtools lock.QC_hg38__tutorial_Promoter_minus_coverage.bed +319718 93366b9194 114 0:00:00.060000 0.0009 mv lock.QC_hg38__Promoter_Flanking_Region +319719 4a4529dbb7 115 0:00:00.080000 0.0 cut lock.QC_hg38__Promoter_Flanking_Region_sort.bed +319722 4a4529dbb7 117 0:00:01.470000 0.0007 cut lock.QC_hg38__Promoter_Flanking_Region_sort.bed +319721 4a4529dbb7 116 0:00:01.500000 0.0024 grep lock.QC_hg38__Promoter_Flanking_Region_sort.bed +319723 4a4529dbb7 118 0:00:01.550000 0.0043 bedtools lock.QC_hg38__Promoter_Flanking_Region_sort.bed +319726 71fc964b4e 119 0:00:00.300000 0.0017 bedtools lock.QC_hg38__tutorial_Promoter_Flanking_Region_plus_coverage.bed +319728 709432f626 120 0:00:00.300000 0.0017 bedtools lock.QC_hg38__tutorial_Promoter_Flanking_Region_minus_coverage.bed +319731 f414b6c24b 121 0:00:27 0.4401 Rscript lock.QC_hg38__tutorial_plus_frif.pdf +319781 1f97078095 122 0:00:26.380000 0.4457 Rscript lock.QC_hg38__tutorial_minus_frif.pdf +319834 75bccc9ff7 124 0:00:06.150000 0.08 bedtools lock.QC_hg38__hg38_introns_sort.bed +319833 75bccc9ff7 123 0:00:06.170000 0.0046 grep lock.QC_hg38__hg38_introns_sort.bed +319842 4ca5d798a5 126 0:00:06.430000 0.0844 bedtools lock.QC_hg38__hg38_introns_sort.bed +319841 4ca5d798a5 125 0:00:06.460000 0.0047 grep lock.QC_hg38__hg38_introns_sort.bed +319843 4ca5d798a5 127 0:00:06.670000 0.0921 bedtools lock.QC_hg38__hg38_introns_sort.bed +319851 adcdfb3954 128 0:00:00.420000 0.0015 bedtools lock.QC_hg38__tutorial_introns_coverage.bed +319853 fc29720e6d 129 0:00:00.830000 0.0017 bedtools lock.QC_hg38__tutorial_introns_coverage.bed +319862 470a321be3 130 0:00:00.710000 0.0085 awk 
lock.QC_hg38__tutorial_exons_rpkm.bed +319864 470a321be3 132 0:00:00.720000 0.0028 sort lock.QC_hg38__tutorial_exons_rpkm.bed +319863 470a321be3 131 0:00:00.740000 0.0011 awk lock.QC_hg38__tutorial_exons_rpkm.bed +319866 0d463ff070 133 0:00:00.680000 0.0075 awk lock.QC_hg38__tutorial_introns_rpkm.bed +319868 0d463ff070 135 0:00:00.700000 0.0028 sort lock.QC_hg38__tutorial_introns_rpkm.bed +319867 0d463ff070 134 0:00:00.720000 0.0011 awk lock.QC_hg38__tutorial_introns_rpkm.bed +319871 d7f2ff45c5 136 0:00:00.070000 0.0 join lock.QC_hg38__tutorial_exon_intron_ratios.bed +319872 d7f2ff45c5 137 0:00:00.120000 0.0 awk lock.QC_hg38__tutorial_exon_intron_ratios.bed +319873 d7f2ff45c5 138 0:00:00.130000 0.0023 sort lock.QC_hg38__tutorial_exon_intron_ratios.bed +319879 8d0b25d0e3 139 0:00:05.270000 0.2154 Rscript lock.QC_hg38__tutorial_mRNA_contamination.pdf +319922 a4cff6ee58 140 0:00:00.070000 0.0007 gzip lock.QC_hg38__tutorial_exon_intron_ratios.bed.gz +319924 7555202fb3 141 0:00:00.150000 0.0004 samtools lock.signal_hg38__tutorial_plus_body_0-mer.bw +319925 96c844a6a8 142 0:00:03.420000 0.0318 /scratch/jps3dp/tools/databio/peppro/tools/bamSitesToWig.py lock.signal_hg38__tutorial_plus_body_0-mer.bw +320065 c2dfeb1835 143 0:00:00.150000 0.0005 samtools lock.signal_hg38__tutorial_minus_body_0-mer.bw +320066 bc2f567218 144 0:00:02.630000 0.0318 /scratch/jps3dp/tools/databio/peppro/tools/bamSitesToWig.py lock.signal_hg38__tutorial_minus_body_0-mer.bw diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_TSSenrichment.pdf b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_TSSenrichment.pdf new file mode 100644 index 0000000..a34d11e Binary files /dev/null and b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_TSSenrichment.pdf differ diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_TSSenrichment.png 
b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_TSSenrichment.png new file mode 100644 index 0000000..6f0ea1b Binary files /dev/null and b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_TSSenrichment.png differ diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_fragLenDistribution.pdf b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_fragLenDistribution.pdf new file mode 100644 index 0000000..138500b Binary files /dev/null and b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_fragLenDistribution.pdf differ diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_fragLenDistribution.png b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_fragLenDistribution.png new file mode 100644 index 0000000..36b5a6b Binary files /dev/null and b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_fragLenDistribution.png differ diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_mRNA_contamination.pdf b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_mRNA_contamination.pdf new file mode 100644 index 0000000..29caa89 Binary files /dev/null and b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_mRNA_contamination.pdf differ diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_mRNA_contamination.png b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_mRNA_contamination.png new file mode 100644 index 0000000..c35b71d Binary files /dev/null and b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_mRNA_contamination.png differ diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_minus_frif.pdf b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_minus_frif.pdf new file mode 100644 
index 0000000..75bb9d9 Binary files /dev/null and b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_minus_frif.pdf differ diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_minus_frif.png b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_minus_frif.png new file mode 100644 index 0000000..3f34955 Binary files /dev/null and b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_minus_frif.png differ diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.pdf b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.pdf new file mode 100644 index 0000000..8acdfb7 Binary files /dev/null and b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.pdf differ diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.png b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.png new file mode 100644 index 0000000..38e943f Binary files /dev/null and b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_pause_index.png differ diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_plus_frif.pdf b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_plus_frif.pdf new file mode 100644 index 0000000..9c962f4 Binary files /dev/null and b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_plus_frif.pdf differ diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_plus_frif.png b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_plus_frif.png new file mode 100644 index 0000000..40c2264 Binary files /dev/null and b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_plus_frif.png differ diff --git 
a/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_preseq_plot.pdf b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_preseq_plot.pdf new file mode 100644 index 0000000..6668df4 Binary files /dev/null and b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_preseq_plot.pdf differ diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_preseq_plot.png b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_preseq_plot.png new file mode 100644 index 0000000..b6c8287 Binary files /dev/null and b/docs/files/examples/tutorial/results_pipeline/tutorial/QC_hg38/tutorial_preseq_plot.png differ diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.html b/docs/files/examples/tutorial/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.html new file mode 100644 index 0000000..39aea42 --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.html @@ -0,0 +1,3513 @@ +fastp report at 2019-11-27 13:55:21 + + + +
+

tutorial +
+ +
+
General
+
+ + + + + + +
fastp version:0.19.4 (https://github.com/OpenGene/fastp)
sequencing:single end (46 cycles)
mean length before filtering:45bp
mean length after filtering:41bp
duplication rate:25.115256% (may be overestimated since this is SE data)
+
+
Before filtering
+
+ + + + + + +
total reads:1000.000000 K
total bases:45.253865 M
Q20 bases:43.967646 M (97.157770%)
Q30 bases:43.636909 M (96.426922%)
GC content:54.417765%
+
+
After filtering
+
+ + + + + + +
total reads:497.796000 K
total bases:20.903897 M
Q20 bases:20.456786 M (97.861112%)
Q30 bases:20.301335 M (97.117466%)
GC content:53.349895%
+
+
Filtering result
+
+ + + + + +
reads passed filters:497.796000 K (49.779600%)
reads with low quality:13.053000 K (1.305300%)
reads with too many N:8 (0.000800%)
reads too short:489.143000 K (48.914300%)
+
+
+
+
+ +
+
Adapter or bad ligation of read1
+
+ + + + + + + + + + +
SequenceOccurrences
TGGAATTCTCGGGTGCC13226
TGGAATTCTCGGGTGCCA9608
TGGAATTCTCGGGTGCCAAGG10676
TGGAATTCTCGGGTGCCAAGGAACTCC7569
TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC8351
TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCA383032
TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAAT18321
other adapter sequences148213
+
+
+
+
+ +
+
+
+
+ + +
+
+
+ +
+ +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Darker background means larger counts. The count will be shown on mouse over.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AAATACAGTATTTCTGCACTCCCGGAGTGCGG
AAAAAAAAAAAATAAAACAAAAGAAATAAAATTAAATCAAATGAAACAAAACTAAACCAAACGAAAGAAAAGTAAAGCAAAGG
AATAATAAAATATAATACAATAGAATTAAATTTAATTCAATTGAATCAAATCTAATCCAATCGAATGAAATGTAATGCAATGG
AACAACAAAACATAACACAACAGAACTAAACTTAACTCAACTGAACCAAACCTAACCCAACCGAACGAAACGTAACGCAACGG
AAGAAGAAAAGATAAGACAAGAGAAGTAAAGTTAAGTCAAGTGAAGCAAAGCTAAGCCAAGCGAAGGAAAGGTAAGGCAAGGG
ATAATAAAATAATATAACATAAGATATAATATTATATCATATGATACAATACTATACCATACGATAGAATAGTATAGCATAGG
ATTATTAAATTATATTACATTAGATTTAATTTTATTTCATTTGATTCAATTCTATTCCATTCGATTGAATTGTATTGCATTGG
ATCATCAAATCATATCACATCAGATCTAATCTTATCTCATCTGATCCAATCCTATCCCATCCGATCGAATCGTATCGCATCGG
ATGATGAAATGATATGACATGAGATGTAATGTTATGTCATGTGATGCAATGCTATGCCATGCGATGGAATGGTATGGCATGGG
ACAACAAAACAATACAACACAAGACATAACATTACATCACATGACACAACACTACACCACACGACAGAACAGTACAGCACAGG
ACTACTAAACTATACTACACTAGACTTAACTTTACTTCACTTGACTCAACTCTACTCCACTCGACTGAACTGTACTGCACTGG
ACCACCAAACCATACCACACCAGACCTAACCTTACCTCACCTGACCCAACCCTACCCCACCCGACCGAACCGTACCGCACCGG
ACGACGAAACGATACGACACGAGACGTAACGTTACGTCACGTGACGCAACGCTACGCCACGCGACGGAACGGTACGGCACGGG
AGAAGAAAAGAATAGAACAGAAGAGATAAGATTAGATCAGATGAGACAAGACTAGACCAGACGAGAGAAGAGTAGAGCAGAGG
AGTAGTAAAGTATAGTACAGTAGAGTTAAGTTTAGTTCAGTTGAGTCAAGTCTAGTCCAGTCGAGTGAAGTGTAGTGCAGTGG
AGCAGCAAAGCATAGCACAGCAGAGCTAAGCTTAGCTCAGCTGAGCCAAGCCTAGCCCAGCCGAGCGAAGCGTAGCGCAGCGG
AGGAGGAAAGGATAGGACAGGAGAGGTAAGGTTAGGTCAGGTGAGGCAAGGCTAGGCCAGGCGAGGGAAGGGTAGGGCAGGGG
TAATAAAATAAATTAAACTAAAGTAATATAATTTAATCTAATGTAACATAACTTAACCTAACGTAAGATAAGTTAAGCTAAGG
TATTATAATATATTATACTATAGTATTATATTTTATTCTATTGTATCATATCTTATCCTATCGTATGATATGTTATGCTATGG
TACTACAATACATTACACTACAGTACTATACTTTACTCTACTGTACCATACCTTACCCTACCGTACGATACGTTACGCTACGG
TAGTAGAATAGATTAGACTAGAGTAGTATAGTTTAGTCTAGTGTAGCATAGCTTAGCCTAGCGTAGGATAGGTTAGGCTAGGG
TTATTAAATTAATTTAACTTAAGTTATATTATTTTATCTTATGTTACATTACTTTACCTTACGTTAGATTAGTTTAGCTTAGG
TTTTTTAATTTATTTTACTTTAGTTTTATTTTTTTTTCTTTTGTTTCATTTCTTTTCCTTTCGTTTGATTTGTTTTGCTTTGG
TTCTTCAATTCATTTCACTTCAGTTCTATTCTTTTCTCTTCTGTTCCATTCCTTTCCCTTCCGTTCGATTCGTTTCGCTTCGG
TTGTTGAATTGATTTGACTTGAGTTGTATTGTTTTGTCTTGTGTTGCATTGCTTTGCCTTGCGTTGGATTGGTTTGGCTTGGG
TCATCAAATCAATTCAACTCAAGTCATATCATTTCATCTCATGTCACATCACTTCACCTCACGTCAGATCAGTTCAGCTCAGG
TCTTCTAATCTATTCTACTCTAGTCTTATCTTTTCTTCTCTTGTCTCATCTCTTCTCCTCTCGTCTGATCTGTTCTGCTCTGG
TCCTCCAATCCATTCCACTCCAGTCCTATCCTTTCCTCTCCTGTCCCATCCCTTCCCCTCCCGTCCGATCCGTTCCGCTCCGG
TCGTCGAATCGATTCGACTCGAGTCGTATCGTTTCGTCTCGTGTCGCATCGCTTCGCCTCGCGTCGGATCGGTTCGGCTCGGG
TGATGAAATGAATTGAACTGAAGTGATATGATTTGATCTGATGTGACATGACTTGACCTGACGTGAGATGAGTTGAGCTGAGG
TGTTGTAATGTATTGTACTGTAGTGTTATGTTTTGTTCTGTTGTGTCATGTCTTGTCCTGTCGTGTGATGTGTTGTGCTGTGG
TGCTGCAATGCATTGCACTGCAGTGCTATGCTTTGCTCTGCTGTGCCATGCCTTGCCCTGCCGTGCGATGCGTTGCGCTGCGG
TGGTGGAATGGATTGGACTGGAGTGGTATGGTTTGGTCTGGTGTGGCATGGCTTGGCCTGGCGTGGGATGGGTTGGGCTGGGG
CAACAAAACAAATCAAACCAAAGCAATACAATTCAATCCAATGCAACACAACTCAACCCAACGCAAGACAAGTCAAGCCAAGG
CATCATAACATATCATACCATAGCATTACATTTCATTCCATTGCATCACATCTCATCCCATCGCATGACATGTCATGCCATGG
CACCACAACACATCACACCACAGCACTACACTTCACTCCACTGCACCACACCTCACCCCACCGCACGACACGTCACGCCACGG
CAGCAGAACAGATCAGACCAGAGCAGTACAGTTCAGTCCAGTGCAGCACAGCTCAGCCCAGCGCAGGACAGGTCAGGCCAGGG
CTACTAAACTAATCTAACCTAAGCTATACTATTCTATCCTATGCTACACTACTCTACCCTACGCTAGACTAGTCTAGCCTAGG
CTTCTTAACTTATCTTACCTTAGCTTTACTTTTCTTTCCTTTGCTTCACTTCTCTTCCCTTCGCTTGACTTGTCTTGCCTTGG
CTCCTCAACTCATCTCACCTCAGCTCTACTCTTCTCTCCTCTGCTCCACTCCTCTCCCCTCCGCTCGACTCGTCTCGCCTCGG
CTGCTGAACTGATCTGACCTGAGCTGTACTGTTCTGTCCTGTGCTGCACTGCTCTGCCCTGCGCTGGACTGGTCTGGCCTGGG
CCACCAAACCAATCCAACCCAAGCCATACCATTCCATCCCATGCCACACCACTCCACCCCACGCCAGACCAGTCCAGCCCAGG
CCTCCTAACCTATCCTACCCTAGCCTTACCTTTCCTTCCCTTGCCTCACCTCTCCTCCCCTCGCCTGACCTGTCCTGCCCTGG
CCCCCCAACCCATCCCACCCCAGCCCTACCCTTCCCTCCCCTGCCCCACCCCTCCCCCCCCCGCCCGACCCGTCCCGCCCCGG
CCGCCGAACCGATCCGACCCGAGCCGTACCGTTCCGTCCCGTGCCGCACCGCTCCGCCCCGCGCCGGACCGGTCCGGCCCGGG
CGACGAAACGAATCGAACCGAAGCGATACGATTCGATCCGATGCGACACGACTCGACCCGACGCGAGACGAGTCGAGCCGAGG
CGTCGTAACGTATCGTACCGTAGCGTTACGTTTCGTTCCGTTGCGTCACGTCTCGTCCCGTCGCGTGACGTGTCGTGCCGTGG
CGCCGCAACGCATCGCACCGCAGCGCTACGCTTCGCTCCGCTGCGCCACGCCTCGCCCCGCCGCGCGACGCGTCGCGCCGCGG
CGGCGGAACGGATCGGACCGGAGCGGTACGGTTCGGTCCGGTGCGGCACGGCTCGGCCCGGCGCGGGACGGGTCGGGCCGGGG
GAAGAAAAGAAATGAAACGAAAGGAATAGAATTGAATCGAATGGAACAGAACTGAACCGAACGGAAGAGAAGTGAAGCGAAGG
GATGATAAGATATGATACGATAGGATTAGATTTGATTCGATTGGATCAGATCTGATCCGATCGGATGAGATGTGATGCGATGG
GACGACAAGACATGACACGACAGGACTAGACTTGACTCGACTGGACCAGACCTGACCCGACCGGACGAGACGTGACGCGACGG
GAGGAGAAGAGATGAGACGAGAGGAGTAGAGTTGAGTCGAGTGGAGCAGAGCTGAGCCGAGCGGAGGAGAGGTGAGGCGAGGG
GTAGTAAAGTAATGTAACGTAAGGTATAGTATTGTATCGTATGGTACAGTACTGTACCGTACGGTAGAGTAGTGTAGCGTAGG
GTTGTTAAGTTATGTTACGTTAGGTTTAGTTTTGTTTCGTTTGGTTCAGTTCTGTTCCGTTCGGTTGAGTTGTGTTGCGTTGG
GTCGTCAAGTCATGTCACGTCAGGTCTAGTCTTGTCTCGTCTGGTCCAGTCCTGTCCCGTCCGGTCGAGTCGTGTCGCGTCGG
GTGGTGAAGTGATGTGACGTGAGGTGTAGTGTTGTGTCGTGTGGTGCAGTGCTGTGCCGTGCGGTGGAGTGGTGTGGCGTGGG
GCAGCAAAGCAATGCAACGCAAGGCATAGCATTGCATCGCATGGCACAGCACTGCACCGCACGGCAGAGCAGTGCAGCGCAGG
GCTGCTAAGCTATGCTACGCTAGGCTTAGCTTTGCTTCGCTTGGCTCAGCTCTGCTCCGCTCGGCTGAGCTGTGCTGCGCTGG
GCCGCCAAGCCATGCCACGCCAGGCCTAGCCTTGCCTCGCCTGGCCCAGCCCTGCCCCGCCCGGCCGAGCCGTGCCGCGCCGG
GCGGCGAAGCGATGCGACGCGAGGCGTAGCGTTGCGTCGCGTGGCGCAGCGCTGCGCCGCGCGGCGGAGCGGTGCGGCGCGGG
GGAGGAAAGGAATGGAACGGAAGGGATAGGATTGGATCGGATGGGACAGGACTGGACCGGACGGGAGAGGAGTGGAGCGGAGG
GGTGGTAAGGTATGGTACGGTAGGGTTAGGTTTGGTTCGGTTGGGTCAGGTCTGGTCCGGTCGGGTGAGGTGTGGTGCGGTGG
GGCGGCAAGGCATGGCACGGCAGGGCTAGGCTTGGCTCGGCTGGGCCAGGCCTGGCCCGGCCGGGCGAGGCGTGGCGCGGCGG
GGGGGGAAGGGATGGGACGGGAGGGGTAGGGTTGGGTCGGGTGGGGCAGGGCTGGGCCGGGCGGGGGAGGGGTGGGGCGGGGG
+
+ +
+
Sampling rate: 1 / 20
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
overrepresented sequencecount (% of bases)distribution: cycle 1 ~ cycle 46
AAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC62 (0.109604%)
AAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC44 (0.077783%)
AACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC34 (0.060105%)
AACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
AACGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC44 (0.077783%)
AACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC36 (0.063641%)
AAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC53 (0.093694%)
AAGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC32 (0.056570%)
AAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC55 (0.097229%)
AAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC61 (0.107836%)
AAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC40 (0.070712%)
AAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC35 (0.061873%)
AAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC43 (0.076016%)
AAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC59 (0.104300%)
AAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC64 (0.113140%)
AATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
AATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA25 (0.044195%)
AATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC51 (0.090158%)
AATTCTCGGGTGCCAAGGAA46 (0.040660%)
AATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC64 (0.113140%)
ACAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
ACACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC29 (0.051266%)
ACAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
ACATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC65 (0.114907%)
ACCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC36 (0.063641%)
ACCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC19 (0.033588%)
ACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC24 (0.042427%)
ACCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC44 (0.077783%)
ACCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
ACGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
ACGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC45 (0.079551%)
ACGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
ACGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC41 (0.072480%)
ACTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC29 (0.051266%)
ACTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC20 (0.035356%)
ACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA24 (0.042427%)
ACTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC20 (0.035356%)
ACTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC44 (0.077783%)
AGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC33 (0.058338%)
AGACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC34 (0.060105%)
AGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC30 (0.053034%)
AGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC55 (0.097229%)
AGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC47 (0.083087%)
AGCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC30 (0.053034%)
AGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC24 (0.042427%)
AGCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
AGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
AGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC57 (0.100765%)
AGCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC26 (0.045963%)
AGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC52 (0.091926%)
AGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
AGCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
AGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
AGCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC29 (0.051266%)
AGCTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC15 (0.026517%)
AGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC37 (0.065409%)
AGGAATTCTCGGGTGCCAAG118 (0.104300%)
AGGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC29 (0.051266%)
AGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC86 (0.152031%)
AGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC38 (0.067177%)
AGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
AGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC28 (0.049499%)
AGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC22 (0.038892%)
AGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC38 (0.067177%)
AGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC68 (0.120211%)
AGGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG31 (0.054802%)
AGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC37 (0.065409%)
AGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC26 (0.045963%)
AGGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC40 (0.070712%)
AGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC29 (0.051266%)
AGGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC38 (0.067177%)
AGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC59 (0.104300%)
AGTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC48 (0.084855%)
AGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
AGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA32 (0.056570%)
AGTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC43 (0.076016%)
AGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC45 (0.079551%)
AGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC57 (0.100765%)
ATAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC45 (0.079551%)
ATACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
ATAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC43 (0.076016%)
ATATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC57 (0.100765%)
ATCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
ATCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC56 (0.098997%)
ATCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
ATGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC56 (0.098997%)
ATGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC69 (0.121979%)
ATGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC135 (0.238654%)
ATGGAATTCTCGGGTGCCAA5179 (4.577731%)
ATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT58 (0.112786%)
ATGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
ATGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC37 (0.065409%)
ATGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC79 (0.139657%)
ATGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC59 (0.104300%)
ATTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
ATTCTCGGGTGCCAAGGAAC43 (0.038008%)
ATTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
ATTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC32 (0.056570%)
ATTGGAATTCTCGGGTGCCA1375 (1.215366%)
ATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA30 (0.053034%)
ATTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC61 (0.107836%)
ATTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC85 (0.150263%)
CAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC75 (0.132585%)
CAAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC10 (0.017678%)
CAACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC62 (0.109604%)
CAAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC31 (0.054802%)
CAAGGAACTCCAGTCACGCC86 (0.076016%)
CAAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC78 (0.137889%)
CAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC18 (0.031820%)
CACACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC23 (0.040660%)
CACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
CACCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC48 (0.084855%)
CACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC31 (0.054802%)
CACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC48 (0.084855%)
CACCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC38 (0.067177%)
CACGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
CACGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC85 (0.150263%)
CACTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC22 (0.038892%)
CACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC40 (0.070712%)
CAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC46 (0.081319%)
CAGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC49 (0.086622%)
CAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC40 (0.070712%)
CAGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC49 (0.086622%)
CAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC39 (0.068944%)
CAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC60 (0.106068%)
CAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC50 (0.088390%)
CAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC49 (0.086622%)
CAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC30 (0.053034%)
CAGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
CAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC20 (0.035356%)
CATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC74 (0.130818%)
CATCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC28 (0.049499%)
CATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA18 (0.031820%)
CATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC40 (0.070712%)
CATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC20 (0.035356%)
CCAAGGAACTCCAGTCACGC38 (0.033588%)
CCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC31 (0.054802%)
CCAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC28 (0.049499%)
CCACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
CCACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG23 (0.040660%)
CCACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC53 (0.093694%)
CCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC19 (0.033588%)
CCACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC53 (0.093694%)
CCAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG39 (0.068944%)
CCAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC35 (0.061873%)
CCAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC35 (0.061873%)
CCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC31 (0.054802%)
CCAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
CCATCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC31 (0.054802%)
CCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC37 (0.065409%)
CCATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC64 (0.113140%)
CCCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC49 (0.086622%)
CCCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC69 (0.121979%)
CCCAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG32 (0.056570%)
CCCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC46 (0.081319%)
CCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
CCCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC35 (0.061873%)
CCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC54 (0.095461%)
CCCCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG27 (0.047731%)
CCCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG38 (0.067177%)
CCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC44 (0.077783%)
CCCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC64 (0.113140%)
CCCCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG14 (0.024749%)
CCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC19 (0.033588%)
CCCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC57 (0.100765%)
CCCGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG34 (0.060105%)
CCCGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG27 (0.047731%)
CCCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC39 (0.068944%)
CCCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC47 (0.083087%)
CCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC17 (0.030053%)
CCCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC44 (0.077783%)
CCCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC23 (0.040660%)
CCCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC39 (0.068944%)
CCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC47 (0.083087%)
CCCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC58 (0.102533%)
CCGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC50 (0.088390%)
CCGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC41 (0.072480%)
CCGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC36 (0.063641%)
CCGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC23 (0.040660%)
CCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
CCGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC53 (0.093694%)
CCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC25 (0.044195%)
CCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC43 (0.076016%)
CCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC33 (0.058338%)
CCTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC27 (0.047731%)
CCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC68 (0.120211%)
CCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA7 (0.012375%)
CCTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC70 (0.123746%)
CCTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
CCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC31 (0.054802%)
CCTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC45 (0.079551%)
CGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC47 (0.083087%)
CGACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC31 (0.054802%)
CGAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC19 (0.033588%)
CGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC18 (0.031820%)
CGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC53 (0.093694%)
CGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
CGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
CGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC49 (0.086622%)
CGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC38 (0.067177%)
CGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC48 (0.084855%)
CGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
CGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC18 (0.031820%)
CGCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC36 (0.063641%)
CGCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
CGCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC26 (0.045963%)
CGCTGCGATCTATTGAAAGTCAGCCCTCGACACAAGGGTT24 (0.042427%)
CGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC39 (0.068944%)
CGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC39 (0.068944%)
CGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC29 (0.051266%)
CGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC61 (0.107836%)
CGGGTGCCAAGGAACTCCAG57 (0.050382%)
CGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC29 (0.051266%)
CGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC31 (0.054802%)
CGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
CGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC47 (0.083087%)
CGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA16 (0.028285%)
CGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC36 (0.063641%)
CGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC41 (0.072480%)
CTAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC60 (0.106068%)
CTACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC57 (0.100765%)
CTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC51 (0.090158%)
CTAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC62 (0.109604%)
CTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC19 (0.033588%)
CTCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC43 (0.076016%)
CTCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
CTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC47 (0.083087%)
CTCGGGTGCCAAGGAACTCC39 (0.034472%)
CTCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC30 (0.053034%)
CTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC26 (0.045963%)
CTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC35 (0.061873%)
CTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC54 (0.095461%)
CTGGAATTCTCGGGTGCCAA6923 (6.119256%)
CTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT35 (0.068060%)
CTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC36 (0.063641%)
CTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC32 (0.056570%)
CTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
CTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC47 (0.083087%)
CTTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC22 (0.038892%)
CTTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
CTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA15 (0.026517%)
CTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC16 (0.028285%)
CTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC17 (0.030053%)
GAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
GAATTCTCGGGTGCCAAGGA39 (0.034472%)
GAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC44 (0.077783%)
GACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC25 (0.044195%)
GAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC66 (0.116675%)
GAGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC30 (0.053034%)
GAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC50 (0.088390%)
GAGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC27 (0.047731%)
GAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC33 (0.058338%)
GAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC27 (0.047731%)
GAGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG30 (0.053034%)
GAGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG42 (0.074248%)
GAGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG42 (0.074248%)
GAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC52 (0.091926%)
GAGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG34 (0.060105%)
GAGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG35 (0.061873%)
GAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC32 (0.056570%)
GAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC16 (0.028285%)
GAGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC29 (0.051266%)
GAGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC23 (0.040660%)
GAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC99 (0.175013%)
GATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC39 (0.068944%)
GATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA13 (0.022981%)
GATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC53 (0.093694%)
GATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC62 (0.109604%)
GCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC117 (0.206833%)
GCAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC32 (0.056570%)
GCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC75 (0.132585%)
GCAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC73 (0.129050%)
GCAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC44 (0.077783%)
GCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC61 (0.107836%)
GCAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC31 (0.054802%)
GCATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC23 (0.040660%)
GCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC24 (0.042427%)
GCATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC32 (0.056570%)
GCCAAGGAACTCCAGTCACG39 (0.034472%)
GCCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC44 (0.077783%)
GCCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC88 (0.155567%)
GCCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC41 (0.072480%)
GCCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC50 (0.088390%)
GCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC86 (0.152031%)
GCCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG29 (0.051266%)
GCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC57 (0.100765%)
GCCCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG29 (0.051266%)
GCCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC64 (0.113140%)
GCCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
GCCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC38 (0.067177%)
GCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC16 (0.028285%)
GCCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
GCCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC62 (0.109604%)
GCGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC25 (0.044195%)
GCGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
GCGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC69 (0.121979%)
GCGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC63 (0.111372%)
GCGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG37 (0.065409%)
GCGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC60 (0.106068%)
GCGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC56 (0.098997%)
GCGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC31 (0.054802%)
GCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC44 (0.077783%)
GCGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC29 (0.051266%)
GCGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
GCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC65 (0.114907%)
GCGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
GCTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC45 (0.079551%)
GCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC18 (0.031820%)
GCTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC27 (0.047731%)
GCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC64 (0.113140%)
GCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATC20 (0.038892%)
GCTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC29 (0.051266%)
GCTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC62 (0.109604%)
GCTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC31 (0.054802%)
GCTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
GCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCA6 (0.010607%)
GCTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
GCTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
GGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
GGAATTCTCGGGTGCCAAGG162 (0.143192%)
GGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCG5 (0.009723%)
GGAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC37 (0.065409%)
GGAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG29 (0.051266%)
GGAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG39 (0.068944%)
GGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC28 (0.049499%)
GGAGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG24 (0.042427%)
GGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC21 (0.037124%)
GGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC58 (0.102533%)
GGCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC30 (0.053034%)
GGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC17 (0.030053%)
GGCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC43 (0.076016%)
GGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC48 (0.084855%)
GGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC55 (0.097229%)
GGCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC24 (0.042427%)
GGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC27 (0.047731%)
GGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC45 (0.079551%)
GGCGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC50 (0.088390%)
GGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC39 (0.068944%)
GGCGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG19 (0.033588%)
GGCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC55 (0.097229%)
GGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC18 (0.031820%)
GGCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC47 (0.083087%)
GGCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC37 (0.065409%)
GGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC24 (0.042427%)
GGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC60 (0.106068%)
GGGAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC27 (0.047731%)
GGGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG39 (0.068944%)
GGGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC43 (0.076016%)
GGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC65 (0.114907%)
GGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
GGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC36 (0.063641%)
GGGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG28 (0.049499%)
GGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC52 (0.091926%)
GGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC23 (0.040660%)
GGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC28 (0.049499%)
GGGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC47 (0.083087%)
GGGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG21 (0.037124%)
GGGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG35 (0.061873%)
GGGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG40 (0.070712%)
GGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC36 (0.063641%)
GGGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG24 (0.042427%)
GGGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG42 (0.074248%)
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG20 (0.035356%)
GGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC85 (0.150263%)
GGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC17 (0.030053%)
GGGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC61 (0.107836%)
GGGTGCCAAGGAACTCCAGT120 (0.106068%)
GGGTGCCTGGAATTCTCGGG14 (0.012375%)
GGGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC46 (0.081319%)
GGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC13 (0.022981%)
GGGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
GGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC69 (0.121979%)
GGTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC43 (0.076016%)
GGTGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC29 (0.051266%)
GGTGCCAAGGAACTCCAGTC76 (0.067177%)
GGTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG34 (0.060105%)
GGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC63 (0.111372%)
GGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA6 (0.010607%)
GGTGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG28 (0.049499%)
GGTGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG38 (0.067177%)
GGTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC52 (0.091926%)
GGTGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG30 (0.053034%)
GGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC76 (0.134353%)
GGTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC38 (0.067177%)
GGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC16 (0.028285%)
GTAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
GTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC48 (0.084855%)
GTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC63 (0.111372%)
GTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC31 (0.054802%)
GTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC44 (0.077783%)
GTGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
GTGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC39 (0.068944%)
GTGCCAAGGAACTCCAGTCA22 (0.019446%)
GTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC67 (0.118443%)
GTGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC46 (0.081319%)
GTGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC70 (0.123746%)
GTGGAATTCTCGGGTGCCAA4479 (3.958999%)
GTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT47 (0.091396%)
GTGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG28 (0.049499%)
GTGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
GTGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG24 (0.042427%)
GTGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC29 (0.051266%)
GTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
GTGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC28 (0.049499%)
GTGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
GTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC57 (0.100765%)
GTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC67 (0.118443%)
GTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC45 (0.079551%)
GTTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC43 (0.076016%)
GTTGGAATTCTCGGGTGCCA1640 (1.449600%)
GTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA15 (0.026517%)
GTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC64 (0.113140%)
GTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC102 (0.180316%)
NNNNNNNNNNNNNNNNNNNN317 (0.280197%)
TAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC45 (0.079551%)
TAACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC38 (0.067177%)
TAAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC40 (0.070712%)
TAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC63 (0.111372%)
TACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC53 (0.093694%)
TACCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC28 (0.049499%)
TACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC43 (0.076016%)
TACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC53 (0.093694%)
TACGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC40 (0.070712%)
TACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC53 (0.093694%)
TAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC29 (0.051266%)
TAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC54 (0.095461%)
TAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC94 (0.166174%)
TAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC39 (0.068944%)
TAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC55 (0.097229%)
TAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC56 (0.098997%)
TATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC53 (0.093694%)
TATCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC39 (0.068944%)
TATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA20 (0.035356%)
TATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC65 (0.114907%)
TATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC78 (0.137889%)
TCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
TCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC25 (0.044195%)
TCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
TCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC39 (0.068944%)
TCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC40 (0.070712%)
TCCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC23 (0.040660%)
TCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
TCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC55 (0.097229%)
TCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC33 (0.058338%)
TCGGGTGCCAAGGAACTCCA111 (0.098113%)
TCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC39 (0.068944%)
TCTCGGGTGCCAAGGAACTC75 (0.066293%)
TCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC16 (0.028285%)
TCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA22 (0.038892%)
TGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC39 (0.068944%)
TGAATTCTCGGGTGCCAAGG200 (0.176780%)
TGACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC36 (0.063641%)
TGAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC30 (0.053034%)
TGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC35 (0.061873%)
TGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC71 (0.125514%)
TGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
TGCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC23 (0.040660%)
TGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC76 (0.134353%)
TGCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC16 (0.028285%)
TGCCAAGGAACTCCAGTCAC31 (0.027401%)
TGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
TGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC67 (0.118443%)
TGCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC30 (0.053034%)
TGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC60 (0.106068%)
TGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC67 (0.118443%)
TGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC63 (0.111372%)
TGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
TGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
TGGAATTCTCGGGTGCCAAG145 (0.128166%)
TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTC131 (0.254741%)
TGGAATTCTCGGGTGCCTGG135 (0.119327%)
TGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC72 (0.127282%)
TGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
TGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC77 (0.136121%)
TGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC38 (0.067177%)
TGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC29 (0.051266%)
TGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
TGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC47 (0.083087%)
TGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
TGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC55 (0.097229%)
TGGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC48 (0.084855%)
TGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC70 (0.123746%)
TGGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC32 (0.056570%)
TGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC45 (0.079551%)
TGTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC11 (0.019446%)
TGTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
TGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC62 (0.109604%)
TGTGGAATTCTCGGGTGCCA1254 (1.108414%)
TGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA13 (0.022981%)
TGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
TGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC89 (0.157335%)
TTAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
TTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC48 (0.084855%)
TTAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC37 (0.065409%)
TTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC63 (0.111372%)
TTCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
TTCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
TTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC61 (0.107836%)
TTCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC43 (0.076016%)
TTCTCGGGTGCCAAGGAACT27 (0.023865%)
TTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC36 (0.063641%)
TTGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC45 (0.079551%)
TTGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG32 (0.056570%)
TTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC58 (0.102533%)
TTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC36 (0.063641%)
TTGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC49 (0.086622%)
TTGGAATTCTCGGGTGCCAA1870 (1.652897%)
TTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT79 (0.153622%)
TTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC79 (0.139657%)
TTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC77 (0.136121%)
TTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC58 (0.102533%)
TTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC61 (0.107836%)
TTTGGAATTCTCGGGTGCCA1831 (1.618425%)
TTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA15 (0.026517%)
TTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC72 (0.127282%)
TTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC58 (0.102533%)
TTTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC21 (0.037124%)
TTTTTTTTTTTTTTTTTTTT247 (0.218324%)
+
+ +
+
+
+ +
+ +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Darker background means larger counts. The count will be shown on mouse over.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AAATACAGTATTTCTGCACTCCCGGAGTGCGG
AAAAAAAAAAAATAAAACAAAAGAAATAAAATTAAATCAAATGAAACAAAACTAAACCAAACGAAAGAAAAGTAAAGCAAAGG
AATAATAAAATATAATACAATAGAATTAAATTTAATTCAATTGAATCAAATCTAATCCAATCGAATGAAATGTAATGCAATGG
AACAACAAAACATAACACAACAGAACTAAACTTAACTCAACTGAACCAAACCTAACCCAACCGAACGAAACGTAACGCAACGG
AAGAAGAAAAGATAAGACAAGAGAAGTAAAGTTAAGTCAAGTGAAGCAAAGCTAAGCCAAGCGAAGGAAAGGTAAGGCAAGGG
ATAATAAAATAATATAACATAAGATATAATATTATATCATATGATACAATACTATACCATACGATAGAATAGTATAGCATAGG
ATTATTAAATTATATTACATTAGATTTAATTTTATTTCATTTGATTCAATTCTATTCCATTCGATTGAATTGTATTGCATTGG
ATCATCAAATCATATCACATCAGATCTAATCTTATCTCATCTGATCCAATCCTATCCCATCCGATCGAATCGTATCGCATCGG
ATGATGAAATGATATGACATGAGATGTAATGTTATGTCATGTGATGCAATGCTATGCCATGCGATGGAATGGTATGGCATGGG
ACAACAAAACAATACAACACAAGACATAACATTACATCACATGACACAACACTACACCACACGACAGAACAGTACAGCACAGG
ACTACTAAACTATACTACACTAGACTTAACTTTACTTCACTTGACTCAACTCTACTCCACTCGACTGAACTGTACTGCACTGG
ACCACCAAACCATACCACACCAGACCTAACCTTACCTCACCTGACCCAACCCTACCCCACCCGACCGAACCGTACCGCACCGG
ACGACGAAACGATACGACACGAGACGTAACGTTACGTCACGTGACGCAACGCTACGCCACGCGACGGAACGGTACGGCACGGG
AGAAGAAAAGAATAGAACAGAAGAGATAAGATTAGATCAGATGAGACAAGACTAGACCAGACGAGAGAAGAGTAGAGCAGAGG
AGTAGTAAAGTATAGTACAGTAGAGTTAAGTTTAGTTCAGTTGAGTCAAGTCTAGTCCAGTCGAGTGAAGTGTAGTGCAGTGG
AGCAGCAAAGCATAGCACAGCAGAGCTAAGCTTAGCTCAGCTGAGCCAAGCCTAGCCCAGCCGAGCGAAGCGTAGCGCAGCGG
AGGAGGAAAGGATAGGACAGGAGAGGTAAGGTTAGGTCAGGTGAGGCAAGGCTAGGCCAGGCGAGGGAAGGGTAGGGCAGGGG
TAATAAAATAAATTAAACTAAAGTAATATAATTTAATCTAATGTAACATAACTTAACCTAACGTAAGATAAGTTAAGCTAAGG
TATTATAATATATTATACTATAGTATTATATTTTATTCTATTGTATCATATCTTATCCTATCGTATGATATGTTATGCTATGG
TACTACAATACATTACACTACAGTACTATACTTTACTCTACTGTACCATACCTTACCCTACCGTACGATACGTTACGCTACGG
TAGTAGAATAGATTAGACTAGAGTAGTATAGTTTAGTCTAGTGTAGCATAGCTTAGCCTAGCGTAGGATAGGTTAGGCTAGGG
TTATTAAATTAATTTAACTTAAGTTATATTATTTTATCTTATGTTACATTACTTTACCTTACGTTAGATTAGTTTAGCTTAGG
TTTTTTAATTTATTTTACTTTAGTTTTATTTTTTTTTCTTTTGTTTCATTTCTTTTCCTTTCGTTTGATTTGTTTTGCTTTGG
TTCTTCAATTCATTTCACTTCAGTTCTATTCTTTTCTCTTCTGTTCCATTCCTTTCCCTTCCGTTCGATTCGTTTCGCTTCGG
TTGTTGAATTGATTTGACTTGAGTTGTATTGTTTTGTCTTGTGTTGCATTGCTTTGCCTTGCGTTGGATTGGTTTGGCTTGGG
TCATCAAATCAATTCAACTCAAGTCATATCATTTCATCTCATGTCACATCACTTCACCTCACGTCAGATCAGTTCAGCTCAGG
TCTTCTAATCTATTCTACTCTAGTCTTATCTTTTCTTCTCTTGTCTCATCTCTTCTCCTCTCGTCTGATCTGTTCTGCTCTGG
TCCTCCAATCCATTCCACTCCAGTCCTATCCTTTCCTCTCCTGTCCCATCCCTTCCCCTCCCGTCCGATCCGTTCCGCTCCGG
TCGTCGAATCGATTCGACTCGAGTCGTATCGTTTCGTCTCGTGTCGCATCGCTTCGCCTCGCGTCGGATCGGTTCGGCTCGGG
TGATGAAATGAATTGAACTGAAGTGATATGATTTGATCTGATGTGACATGACTTGACCTGACGTGAGATGAGTTGAGCTGAGG
TGTTGTAATGTATTGTACTGTAGTGTTATGTTTTGTTCTGTTGTGTCATGTCTTGTCCTGTCGTGTGATGTGTTGTGCTGTGG
TGCTGCAATGCATTGCACTGCAGTGCTATGCTTTGCTCTGCTGTGCCATGCCTTGCCCTGCCGTGCGATGCGTTGCGCTGCGG
TGGTGGAATGGATTGGACTGGAGTGGTATGGTTTGGTCTGGTGTGGCATGGCTTGGCCTGGCGTGGGATGGGTTGGGCTGGGG
CAACAAAACAAATCAAACCAAAGCAATACAATTCAATCCAATGCAACACAACTCAACCCAACGCAAGACAAGTCAAGCCAAGG
CATCATAACATATCATACCATAGCATTACATTTCATTCCATTGCATCACATCTCATCCCATCGCATGACATGTCATGCCATGG
CACCACAACACATCACACCACAGCACTACACTTCACTCCACTGCACCACACCTCACCCCACCGCACGACACGTCACGCCACGG
CAGCAGAACAGATCAGACCAGAGCAGTACAGTTCAGTCCAGTGCAGCACAGCTCAGCCCAGCGCAGGACAGGTCAGGCCAGGG
CTACTAAACTAATCTAACCTAAGCTATACTATTCTATCCTATGCTACACTACTCTACCCTACGCTAGACTAGTCTAGCCTAGG
CTTCTTAACTTATCTTACCTTAGCTTTACTTTTCTTTCCTTTGCTTCACTTCTCTTCCCTTCGCTTGACTTGTCTTGCCTTGG
CTCCTCAACTCATCTCACCTCAGCTCTACTCTTCTCTCCTCTGCTCCACTCCTCTCCCCTCCGCTCGACTCGTCTCGCCTCGG
CTGCTGAACTGATCTGACCTGAGCTGTACTGTTCTGTCCTGTGCTGCACTGCTCTGCCCTGCGCTGGACTGGTCTGGCCTGGG
CCACCAAACCAATCCAACCCAAGCCATACCATTCCATCCCATGCCACACCACTCCACCCCACGCCAGACCAGTCCAGCCCAGG
CCTCCTAACCTATCCTACCCTAGCCTTACCTTTCCTTCCCTTGCCTCACCTCTCCTCCCCTCGCCTGACCTGTCCTGCCCTGG
CCCCCCAACCCATCCCACCCCAGCCCTACCCTTCCCTCCCCTGCCCCACCCCTCCCCCCCCCGCCCGACCCGTCCCGCCCCGG
CCGCCGAACCGATCCGACCCGAGCCGTACCGTTCCGTCCCGTGCCGCACCGCTCCGCCCCGCGCCGGACCGGTCCGGCCCGGG
CGACGAAACGAATCGAACCGAAGCGATACGATTCGATCCGATGCGACACGACTCGACCCGACGCGAGACGAGTCGAGCCGAGG
CGTCGTAACGTATCGTACCGTAGCGTTACGTTTCGTTCCGTTGCGTCACGTCTCGTCCCGTCGCGTGACGTGTCGTGCCGTGG
CGCCGCAACGCATCGCACCGCAGCGCTACGCTTCGCTCCGCTGCGCCACGCCTCGCCCCGCCGCGCGACGCGTCGCGCCGCGG
CGGCGGAACGGATCGGACCGGAGCGGTACGGTTCGGTCCGGTGCGGCACGGCTCGGCCCGGCGCGGGACGGGTCGGGCCGGGG
GAAGAAAAGAAATGAAACGAAAGGAATAGAATTGAATCGAATGGAACAGAACTGAACCGAACGGAAGAGAAGTGAAGCGAAGG
GATGATAAGATATGATACGATAGGATTAGATTTGATTCGATTGGATCAGATCTGATCCGATCGGATGAGATGTGATGCGATGG
GACGACAAGACATGACACGACAGGACTAGACTTGACTCGACTGGACCAGACCTGACCCGACCGGACGAGACGTGACGCGACGG
GAGGAGAAGAGATGAGACGAGAGGAGTAGAGTTGAGTCGAGTGGAGCAGAGCTGAGCCGAGCGGAGGAGAGGTGAGGCGAGGG
GTAGTAAAGTAATGTAACGTAAGGTATAGTATTGTATCGTATGGTACAGTACTGTACCGTACGGTAGAGTAGTGTAGCGTAGG
GTTGTTAAGTTATGTTACGTTAGGTTTAGTTTTGTTTCGTTTGGTTCAGTTCTGTTCCGTTCGGTTGAGTTGTGTTGCGTTGG
GTCGTCAAGTCATGTCACGTCAGGTCTAGTCTTGTCTCGTCTGGTCCAGTCCTGTCCCGTCCGGTCGAGTCGTGTCGCGTCGG
GTGGTGAAGTGATGTGACGTGAGGTGTAGTGTTGTGTCGTGTGGTGCAGTGCTGTGCCGTGCGGTGGAGTGGTGTGGCGTGGG
GCAGCAAAGCAATGCAACGCAAGGCATAGCATTGCATCGCATGGCACAGCACTGCACCGCACGGCAGAGCAGTGCAGCGCAGG
GCTGCTAAGCTATGCTACGCTAGGCTTAGCTTTGCTTCGCTTGGCTCAGCTCTGCTCCGCTCGGCTGAGCTGTGCTGCGCTGG
GCCGCCAAGCCATGCCACGCCAGGCCTAGCCTTGCCTCGCCTGGCCCAGCCCTGCCCCGCCCGGCCGAGCCGTGCCGCGCCGG
GCGGCGAAGCGATGCGACGCGAGGCGTAGCGTTGCGTCGCGTGGCGCAGCGCTGCGCCGCGCGGCGGAGCGGTGCGGCGCGGG
GGAGGAAAGGAATGGAACGGAAGGGATAGGATTGGATCGGATGGGACAGGACTGGACCGGACGGGAGAGGAGTGGAGCGGAGG
GGTGGTAAGGTATGGTACGGTAGGGTTAGGTTTGGTTCGGTTGGGTCAGGTCTGGTCCGGTCGGGTGAGGTGTGGTGCGGTGG
GGCGGCAAGGCATGGCACGGCAGGGCTAGGCTTGGCTCGGCTGGGCCAGGCCTGGCCCGGCCGGGCGAGGCGTGGCGCGGCGG
GGGGGGAAGGGATGGGACGGGAGGGGTAGGGTTGGGTCGGGTGGGGCAGGGCTGGGCCGGGCGGGGGAGGGGTGGGGCGGGGG
+
+ +
+
Sampling rate: 1 / 20
+ + + + + + + + + + + + + + + +
overrepresented sequencecount (% of bases)distribution: cycle 1 ~ cycle 46
AATTCTCGGGTGCCAAGGAA16 (0.030616%)
CAAGGAACTCCAGTCACGCC46 (0.088022%)
CGCTGCGATCTATTGAAAGTCAGCCCTCGACACAAGGGTT41 (0.156909%)
CGGGTGCCAAGGAACTCCAG19 (0.036357%)
CTCGGGTGCCAAGGAACTCC21 (0.040184%)
GCCAAGGAACTCCAGTCACG12 (0.022962%)
GGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCG2 (0.008419%)
GGGTGCCAAGGAACTCCAGT31 (0.059319%)
GGTGCCAAGGAACTCCAGTC42 (0.080368%)
TCGGGTGCCAAGGAACTCCA21 (0.040184%)
TCTCGGGTGCCAAGGAACTC49 (0.093762%)
TTCTCGGGTGCCAAGGAACT19 (0.036357%)
TTTTTTTTTTTTTTTTTTTT300 (0.574056%)
+
+ +
+
+ +

+ \ No newline at end of file diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.json b/docs/files/examples/tutorial/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.json new file mode 100644 index 0000000..6eff6d1 --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.json @@ -0,0 +1,752 @@ +{ + "summary": { + "before_filtering": { + "total_reads":1000000, + "total_bases":45253865, + "q20_bases":43967646, + "q30_bases":43636909, + "q20_rate":0.971578, + "q30_rate":0.964269, + "read1_mean_length":45, + "gc_content":0.544178 + }, + "after_filtering": { + "total_reads":497796, + "total_bases":20903897, + "q20_bases":20456786, + "q30_bases":20301335, + "q20_rate":0.978611, + "q30_rate":0.971175, + "read1_mean_length":41, + "gc_content":0.533499 + } + }, + "filtering_result": { + "passed_filter_reads": 497796, + "low_quality_reads": 13053, + "too_many_N_reads": 8, + "too_short_reads": 489143, + "too_long_reads": 0 + }, + "duplication": { + "rate": 0.251153, + "histogram": [503744,11613,7640,5342,3985,2834,2119,1608,1219,874,732,569,470,397,319,258,236,179,165,125,108,103,83,65,57,48,44,43,34,28,239], + "mean_gc": [0.480399,0.47184,0.467902,0.467371,0.464419,0.464187,0.462171,0.467381,0.465097,0.465648,0.461261,0.46912,0.45627,0.457875,0.47601,0.462183,0.452193,0.47061,0.474819,0.466855,0.445534,0.455968,0.464068,0.484163,0.466323,0.479167,0.455704,0.431555,0.458131,0.465266,0.457216] + }, + "adapter_cutting": { + "adapter_trimmed_reads": 598996, + "adapter_trimmed_bases": 20040284, + "read1_adapter_sequence": "TGGAATTCTCGGGTGCCAAGG", + "read1_adapter_counts": {"TGGAATTCTCGGGTGCC":13226, "TGGAATTCTCGGGTGCCA":9608, "TGGAATTCTCGGGTGCCAAGG":10676, "TGGAATTCTCGGGTGCCAAGGAACTCC":7569, "TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":8351, "TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCA":383032, "TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAAT":18321, "others":148213} + }, + 
"read1_before_filtering": { + "total_reads": 1000000, + "total_bases": 45253865, + "q20_bases": 43967646, + "q30_bases": 43636909, + "total_cycles": 46, + "quality_curves": { + "A":[30.825,31.2657,31.4639,31.4486,31.4701,35.1037,35.0473,35.0462,29.2943,34.3377,35.4184,35.0449,35.0191,34.7388,34.8436,34.855,34.8743,35.0673,35.4237,35.407,35.4116,35.1336,35.3507,34.991,35.0165,35.1529,35.1368,35.248,35.2393,35.1017,35.0664,34.8135,34.7857,34.7447,34.8149,35.0829,35.1229,34.7796,34.6786,34.8595,34.783,34.7735,34.6327,33.8781,35.1474,34.6511], + "T":[31.5874,31.6519,31.7482,31.7856,31.7864,35.2857,35.2607,35.2695,34.93,35.4955,35.5748,35.463,35.3973,35.2732,35.2216,35.4814,35.3842,35.4626,35.5958,35.6074,35.6002,35.5315,35.5951,35.4782,35.4045,35.3112,35.3535,35.5531,35.538,35.3607,35.347,35.3206,35.392,35.3024,35.2876,35.288,35.469,35.4355,35.2452,35.2433,35.1889,34.8702,35.0138,35.1155,35.1567,34.7483], + "C":[31.6474,31.6396,31.7754,31.8059,31.8197,35.382,35.3296,35.3459,35.7253,35.6156,35.6202,35.6008,35.5569,35.6171,35.4983,35.3351,35.4399,35.4535,35.6175,35.5962,35.5956,35.5052,35.562,35.5001,35.4802,35.4274,35.4352,35.4928,35.4697,35.3313,35.2862,35.3946,35.2492,35.3685,35.362,35.305,35.3884,35.2054,35.3247,35.0867,35.2704,34.7263,35.0728,35.0187,34.9901,34.5319], + "G":[30.994,31.4042,31.6051,31.6401,31.6655,35.1563,35.0983,35.1032,35.1066,34.7424,35.1775,35.3779,35.3458,35.4484,35.3565,35.4238,35.3309,35.41,35.4303,35.4535,35.4857,35.4385,35.4623,35.4419,35.3814,35.2563,35.2569,35.4561,35.3765,35.2343,35.2151,35.2878,35.2521,35.2882,35.2342,35.1407,35.3602,35.1777,35.1811,35.0089,35.0724,34.8611,34.8826,34.8794,34.8105,34.685], + 
"mean":[31.0743,31.2937,31.4543,31.485,31.5004,35.02,34.9692,34.971,34.4948,34.7427,35.1241,35.0241,34.9936,35.0753,35.0282,35.1029,35.1163,35.1861,35.2667,35.274,35.2923,35.242,35.2635,35.199,35.1668,35.0124,35.0116,35.2275,35.1745,34.9748,34.9478,35.071,35.0514,35.0506,35.0396,35.1553,35.3492,35.2677,35.1959,34.9697,35.16,34.8248,34.9741,34.8271,35.0771,34.6511] + }, + "content_curves": { + "A":[0.261857,0.268095,0.211688,0.201831,0.201457,0.188008,0.204672,0.212125,0.065411,0.092826,0.141561,0.567159,0.552973,0.11433,0.113464,0.106365,0.107895,0.120065,0.110839,0.122845,0.125413,0.137053,0.140272,0.145609,0.15958,0.574379,0.559845,0.146432,0.154294,0.568209,0.552768,0.142272,0.133742,0.131647,0.144863,0.542575,0.132827,0.136031,0.149721,0.548434,0.138468,0.135123,0.143501,0.16222,0.566997,0], + "T":[0.241992,0.263236,0.214082,0.21003,0.212406,0.220349,0.234819,0.281611,0.556641,0.160461,0.147852,0.123155,0.164815,0.588928,0.584086,0.150477,0.557249,0.133987,0.133518,0.128561,0.145741,0.550515,0.128334,0.122974,0.126179,0.129226,0.127741,0.130084,0.132402,0.131377,0.134164,0.151636,0.548358,0.14436,0.136034,0.134135,0.149881,0.541099,0.13293,0.124204,0.126291,0.131345,0.130483,0.129414,0.137012,0.380379], + "C":[0.272413,0.215889,0.229589,0.264438,0.257089,0.260867,0.243539,0.190242,0.12897,0.146404,0.152226,0.177508,0.16473,0.158349,0.166135,0.606719,0.179492,0.580986,0.161947,0.15555,0.143524,0.142381,0.160735,0.581664,0.566584,0.144489,0.141001,0.140405,0.14183,0.146175,0.166327,0.565839,0.176678,0.580319,0.570976,0.161966,0.15826,0.173377,0.570445,0.175035,0.571023,0.182177,0.581659,0.575252,0.161032,0.363115], + 
"G":[0.216942,0.246339,0.338201,0.317263,0.32261,0.324327,0.31053,0.309584,0.24254,0.593871,0.551922,0.125738,0.111041,0.13195,0.129873,0.129998,0.148925,0.158524,0.587255,0.586595,0.578856,0.163609,0.564198,0.143307,0.141209,0.1454,0.164931,0.576626,0.565019,0.147773,0.14028,0.133802,0.134769,0.137225,0.141684,0.16131,0.559023,0.149488,0.146892,0.152326,0.164216,0.551324,0.144345,0.133098,0.134935,0.256432], + "N":[0.006796,0.006441,0.00644,0.006438,0.006438,0.006449,0.00644,0.006438,0.006438,0.006438,0.006439,0.00644,0.006441,0.006443,0.006442,0.006441,0.006439,0.006438,0.006441,0.006449,0.006466,0.006442,0.006461,0.006446,0.006448,0.006506,0.006482,0.006453,0.006455,0.006466,0.006461,0.006451,0.006453,0.006449,0.006443,1.41045e-05,9.07335e-06,5.04096e-06,1.3107e-05,1.00829e-06,2.01848e-06,3.23089e-05,1.21308e-05,1.62851e-05,2.42268e-05,7.40712e-05], + "GC":[0.489355,0.462228,0.56779,0.581701,0.579699,0.585194,0.554069,0.499826,0.37151,0.740275,0.704148,0.303246,0.275771,0.290299,0.296008,0.736717,0.328417,0.73951,0.749202,0.742145,0.72238,0.30599,0.724933,0.724971,0.707793,0.289889,0.305932,0.717031,0.706849,0.293948,0.306607,0.699641,0.311447,0.717544,0.71266,0.323276,0.717284,0.322865,0.717337,0.327361,0.735239,0.7335,0.726004,0.70835,0.295967,0.619547] + }, + "kmer_count": { + "AAAAA":55223, "AAAAT":34547, "AAAAC":26093, "AAAAG":24400, "AAATA":26058, "AAATT":25637, "AAATC":18572, "AAATG":29168, "AAACA":26239, "AAACT":21408, "AAACC":21078, "AAACG":6935, "AAAGA":23650, "AAAGT":22271, "AAAGC":19974, "AAAGG":23804, + "AATAA":22554, "AATAT":39601, "AATAC":15859, "AATAG":11613, "AATTA":19326, "AATTT":27811, "AATTC":623965, "AATTG":23892, "AATCA":16603, "AATCT":18208, "AATCC":20428, "AATCG":4840, "AATGA":17077, "AATGT":17382, "AATGC":16201, "AATGG":50661, + "AACAA":24529, "AACAT":18198, "AACAC":19622, "AACAG":18920, "AACTA":14428, "AACTT":17160, "AACTC":514868, "AACTG":21875, "AACCA":19983, "AACCT":20429, "AACCC":23604, "AACCG":7369, "AACGA":6285, "AACGT":6011, 
"AACGC":8988, "AACGG":7871, + "AAGAA":25562, "AAGAT":16706, "AAGAC":16120, "AAGAG":23269, "AAGTA":14183, "AAGTT":19949, "AAGTC":17665, "AAGTG":29348, "AAGCA":23623, "AAGCT":22927, "AAGCC":23918, "AAGCG":14431, "AAGGA":531891, "AAGGT":22558, "AAGGC":26058, "AAGGG":28271, + "ATAAA":23041, "ATAAT":17541, "ATAAC":10379, "ATAAG":11653, "ATATA":14677, "ATATT":19068, "ATATC":25228, "ATATG":19492, "ATACA":16417, "ATACT":14478, "ATACC":11131, "ATACG":4725, "ATAGA":9628, "ATAGT":12225, "ATAGC":11536, "ATAGG":12116, + "ATTAA":16504, "ATTAT":19022, "ATTAC":14205, "ATTAG":13156, "ATTTA":18483, "ATTTT":38693, "ATTTC":20098, "ATTTG":29632, "ATTCA":17817, "ATTCT":624362, "ATTCC":19174, "ATTCG":6470, "ATTGA":11418, "ATTGT":16318, "ATTGC":15796, "ATTGG":53560, + "ATCAA":15276, "ATCAT":15347, "ATCAC":17368, "ATCAG":13777, "ATCTA":12542, "ATCTT":19714, "ATCTC":29642, "ATCTG":21763, "ATCCA":20685, "ATCCT":21763, "ATCCC":24738, "ATCCG":8628, "ATCGA":5135, "ATCGT":5789, "ATCGC":8481, "ATCGG":7533, + "ATGAA":17596, "ATGAT":15870, "ATGAC":11553, "ATGAG":18127, "ATGTA":12373, "ATGTT":21508, "ATGTC":12858, "ATGTG":28088, "ATGCA":18294, "ATGCT":22940, "ATGCC":24502, "ATGCG":10366, "ATGGA":142965, "ATGGT":22893, "ATGGC":26245, "ATGGG":26040, + "ACAAA":28007, "ACAAT":17123, "ACAAC":15826, "ACAAG":19276, "ACATA":13069, "ACATT":19079, "ACATC":12613, "ACATG":25005, "ACACA":27195, "ACACT":19838, "ACACC":24140, "ACACG":10568, "ACAGA":22069, "ACAGT":24702, "ACAGC":26533, "ACAGG":27635, + "ACTAA":13820, "ACTAT":12654, "ACTAC":14681, "ACTAG":11335, "ACTTA":12028, "ACTTT":24353, "ACTTC":17640, "ACTTG":24807, "ACTCA":22583, "ACTCT":18707, "ACTCC":517231, "ACTCG":9284, "ACTGA":15887, "ACTGT":16278, "ACTGC":23006, "ACTGG":48473, + "ACCAA":20122, "ACCAT":21096, "ACCAC":25951, "ACCAG":24166, "ACCTA":13759, "ACCTT":22101, "ACCTC":27126, "ACCTG":32295, "ACCCA":27110, "ACCCT":24968, "ACCCC":28194, "ACCCG":18332, "ACCGA":7521, "ACCGT":8368, "ACCGC":17069, "ACCGG":11654, + "ACGAA":6843, "ACGAT":20578, 
"ACGAC":8975, "ACGAG":9792, "ACGTA":4394, "ACGTT":7984, "ACGTC":6315, "ACGTG":14559, "ACGCA":12090, "ACGCT":13007, "ACGCC":486805, "ACGCG":11420, "ACGGA":8816, "ACGGT":9158, "ACGGC":13456, "ACGGG":14626, + "AGAAA":28481, "AGAAT":20511, "AGAAC":15969, "AGAAG":22725, "AGATA":12273, "AGATT":16743, "AGATC":12925, "AGATG":24966, "AGACA":20304, "AGACT":17163, "AGACC":17802, "AGACG":9072, "AGAGA":25988, "AGAGT":23849, "AGAGC":24036, "AGAGG":32410, + "AGTAA":14291, "AGTAT":13368, "AGTAC":10385, "AGTAG":14769, "AGTTA":12904, "AGTTT":23612, "AGTTC":22459, "AGTTG":26660, "AGTCA":496175, "AGTCT":17780, "AGTCC":32302, "AGTCG":5872, "AGTGA":20525, "AGTGT":19842, "AGTGC":26523, "AGTGG":57726, + "AGCAA":23871, "AGCAT":22486, "AGCAC":24521, "AGCAG":32373, "AGCTA":21541, "AGCTT":26593, "AGCTC":25117, "AGCTG":44345, "AGCCA":34228, "AGCCT":43454, "AGCCC":37988, "AGCCG":21193, "AGCGA":15462, "AGCGT":14175, "AGCGC":24410, "AGCGG":20103, + "AGGAA":535101, "AGGAT":20197, "AGGAC":17593, "AGGAG":38355, "AGGTA":15526, "AGGTT":25980, "AGGTC":17425, "AGGTG":45422, "AGGCA":36601, "AGGCT":42129, "AGGCC":36280, "AGGCG":28626, "AGGGA":28642, "AGGGT":32152, "AGGGC":37922, "AGGGG":41749, + "TAAAA":27778, "TAAAT":20682, "TAAAC":13474, "TAAAG":15264, "TAATA":13511, "TAATT":20492, "TAATC":14672, "TAATG":19704, "TAACA":14478, "TAACT":14082, "TAACC":11400, "TAACG":5651, "TAAGA":13680, "TAAGT":14465, "TAAGC":13743, "TAAGG":16143, + "TATAA":14337, "TATAT":16827, "TATAC":10003, "TATAG":9624, "TATTA":13635, "TATTT":29547, "TATTC":13982, "TATTG":22622, "TATCA":11503, "TATCT":26182, "TATCC":13128, "TATCG":5363, "TATGA":11739, "TATGT":17052, "TATGC":14576, "TATGG":43613, + "TACAA":16263, "TACAT":15489, "TACAC":14312, "TACAG":25831, "TACTA":12355, "TACTT":17867, "TACTC":14971, "TACTG":21174, "TACCA":17699, "TACCT":19845, "TACCC":15186, "TACCG":7405, "TACGA":4960, "TACGT":6625, "TACGC":7389, "TACGG":7970, + "TAGAA":14700, "TAGAT":11129, "TAGAC":8577, "TAGAG":15821, "TAGTA":10838, "TAGTT":15873, "TAGTC":12728, 
"TAGTG":20570, "TAGCA":16185, "TAGCT":21497, "TAGCC":18677, "TAGCG":10544, "TAGGA":16220, "TAGGT":15912, "TAGGC":18717, "TAGGG":22573, + "TTAAA":23479, "TTAAT":17769, "TTAAC":12487, "TTAAG":14941, "TTATA":13954, "TTATT":23785, "TTATC":13594, "TTATG":21307, "TTACA":18116, "TTACT":16411, "TTACC":16110, "TTACG":5529, "TTAGA":12568, "TTAGT":15507, "TTAGC":16581, "TTAGG":17875, + "TTTAA":23308, "TTTAT":23585, "TTTAC":15439, "TTTAG":16446, "TTTTA":26959, "TTTTT":231675, "TTTTC":27328, "TTTTG":39731, "TTTCA":24188, "TTTCT":31504, "TTTCC":28384, "TTTCG":7712, "TTTGA":18362, "TTTGT":26430, "TTTGC":21832, "TTTGG":74998, + "TTCAA":21767, "TTCAT":19897, "TTCAC":21650, "TTCAG":23750, "TTCTA":23129, "TTCTT":29817, "TTCTC":621418, "TTCTG":31953, "TTCCA":27216, "TTCCT":32195, "TTCCC":32474, "TTCCG":11498, "TTCGA":5785, "TTCGT":6808, "TTCGC":9169, "TTCGG":10691, + "TTGAA":19443, "TTGAT":14643, "TTGAC":10795, "TTGAG":20864, "TTGTA":14447, "TTGTT":25042, "TTGTC":14986, "TTGTG":31359, "TTGCA":20299, "TTGCT":27827, "TTGCC":27693, "TTGCG":12518, "TTGGA":190740, "TTGGT":23636, "TTGGC":27637, "TTGGG":34514, + "TCAAA":21635, "TCAAT":15900, "TCAAC":13325, "TCAAG":21810, "TCATA":10935, "TCATT":18985, "TCATC":14051, "TCATG":21195, "TCACA":22013, "TCACT":25706, "TCACC":29214, "TCACG":486803, "TCAGA":20852, "TCAGT":18958, "TCAGC":26909, "TCAGG":25283, + "TCTAA":13321, "TCTAT":14094, "TCTAC":18502, "TCTAG":16086, "TCTTA":13142, "TCTTT":25929, "TCTTC":23016, "TCTTG":27552, "TCTCA":24185, "TCTCT":30903, "TCTCC":35162, "TCTCG":607957, "TCTGA":18462, "TCTGT":21662, "TCTGC":24477, "TCTGG":49989, + "TCCAA":20838, "TCCAT":22773, "TCCAC":25455, "TCCAG":513119, "TCCTA":14944, "TCCTT":27608, "TCCTC":31884, "TCCTG":39675, "TCCCA":42000, "TCCCT":34791, "TCCCC":38837, "TCCCG":24627, "TCCGA":22098, "TCCGT":9428, "TCCGC":20715, "TCCGG":16127, + "TCGAA":5968, "TCGAT":5653, "TCGAC":5195, "TCGAG":9898, "TCGTA":3647, "TCGTT":6922, "TCGTC":5945, "TCGTG":12590, "TCGCA":7880, "TCGCT":12952, "TCGCC":16597, 
"TCGCG":10913, "TCGGA":7559, "TCGGT":9723, "TCGGC":16094, "TCGGG":594588, + "TGAAA":21250, "TGAAT":20718, "TGAAC":15120, "TGAAG":16829, "TGATA":10203, "TGATT":16868, "TGATC":15183, "TGATG":21972, "TGACA":16981, "TGACT":17134, "TGACC":16566, "TGACG":6636, "TGAGA":21180, "TGAGT":19474, "TGAGC":25151, "TGAGG":31037, + "TGTAA":17462, "TGTAT":15760, "TGTAC":10890, "TGTAG":13266, "TGTTA":14013, "TGTTT":27543, "TGTTC":17401, "TGTTG":30744, "TGTCA":15963, "TGTCT":22150, "TGTCC":19858, "TGTCG":7704, "TGTGA":18777, "TGTGT":29039, "TGTGC":25742, "TGTGG":60798, + "TGCAA":22414, "TGCAT":20466, "TGCAC":23073, "TGCAG":31715, "TGCTA":17184, "TGCTT":29990, "TGCTC":23124, "TGCTG":48445, "TGCCA":569786, "TGCCT":46856, "TGCCC":38195, "TGCCG":17740, "TGCGA":9969, "TGCGT":14682, "TGCGC":20502, "TGCGG":18676, + "TGGAA":630261, "TGGAT":18801, "TGGAC":14118, "TGGAG":29964, "TGGTA":15609, "TGGTT":25359, "TGGTC":19268, "TGGTG":44823, "TGGCA":30392, "TGGCT":38145, "TGGCC":38538, "TGGCG":25253, "TGGGA":35090, "TGGGT":34602, "TGGGC":39605, "TGGGG":44512, + "CAAAA":30274, "CAAAT":22830, "CAAAC":19560, "CAAAG":25664, "CAATA":35724, "CAATT":20014, "CAATC":13655, "CAATG":28557, "CAACA":22947, "CAACT":19283, "CAACC":20281, "CAACG":8767, "CAAGA":22122, "CAAGT":25093, "CAAGC":25237, "CAAGG":550432, + "CATAA":12822, "CATAT":14999, "CATAC":11189, "CATAG":11331, "CATTA":13818, "CATTT":29453, "CATTC":18526, "CATTG":28533, "CATCA":15777, "CATCT":23093, "CATCC":20968, "CATCG":6711, "CATGA":16582, "CATGT":22666, "CATGC":23507, "CATGG":71747, + "CACAA":22440, "CACAT":22299, "CACAC":29770, "CACAG":30120, "CACTA":14297, "CACTT":27185, "CACTC":26548, "CACTG":40540, "CACCA":35285, "CACCT":36196, "CACCC":36291, "CACCG":18778, "CACGA":11889, "CACGT":12885, "CACGC":492868, "CACGG":17097, + "CAGAA":23939, "CAGAT":17269, "CAGAC":19700, "CAGAG":33691, "CAGTA":14132, "CAGTT":22652, "CAGTC":506526, "CAGTG":38787, "CAGCA":35685, "CAGCT":42816, "CAGCC":57090, "CAGCG":26376, "CAGGA":33839, "CAGGT":31441, "CAGGC":46277, 
"CAGGG":40292, + "CTAAA":16827, "CTAAT":15245, "CTAAC":11706, "CTAAG":14620, "CTATA":10862, "CTATT":18412, "CTATC":10905, "CTATG":23234, "CTACA":23601, "CTACT":21892, "CTACC":18373, "CTACG":8850, "CTAGA":13645, "CTAGT":15658, "CTAGC":18000, "CTAGG":22301, + "CTTAA":14475, "CTTAT":14705, "CTTAC":13391, "CTTAG":14010, "CTTTA":16602, "CTTTT":31572, "CTTTC":24367, "CTTTG":37161, "CTTCA":22202, "CTTCT":29743, "CTTCC":35151, "CTTCG":9888, "CTTGA":18180, "CTTGT":21766, "CTTGC":22475, "CTTGG":79049, + "CTCAA":23273, "CTCAT":17524, "CTCAC":28900, "CTCAG":35229, "CTCTA":15415, "CTCTT":23526, "CTCTC":32425, "CTCTG":39660, "CTCCA":517020, "CTCCT":40820, "CTCCC":54346, "CTCCG":23761, "CTCGA":9143, "CTCGT":13678, "CTCGC":18205, "CTCGG":605004, + "CTGAA":20060, "CTGAT":14406, "CTGAC":19167, "CTGAG":29895, "CTGTA":18034, "CTGTT":20631, "CTGTC":20668, "CTGTG":30926, "CTGCA":30993, "CTGCT":33104, "CTGCC":44179, "CTGCG":19441, "CTGGA":203162, "CTGGT":23048, "CTGGC":35364, "CTGGG":48625, + "CCAAA":26595, "CCAAT":76192, "CCAAC":21988, "CCAAG":551449, "CCATA":12671, "CCATT":26993, "CCATC":23226, "CCATG":45459, "CCACA":29577, "CCACT":35715, "CCACC":43528, "CCACG":20792, "CCAGA":26927, "CCAGT":507843, "CCAGC":56072, "CCAGG":51305, + "CCTAA":13607, "CCTAT":15887, "CCTAC":16811, "CCTAG":18100, "CCTTA":14433, "CCTTT":29102, "CCTTC":29351, "CCTTG":41094, "CCTCA":31312, "CCTCT":36045, "CCTCC":53848, "CCTCG":20018, "CCTGA":23658, "CCTGT":27633, "CCTGC":37128, "CCTGG":104012, + "CCCAA":30597, "CCCAT":29310, "CCCAC":39158, "CCCAG":59554, "CCCTA":15899, "CCCTT":30632, "CCCTC":35685, "CCCTG":54282, "CCCCA":45826, "CCCCT":37133, "CCCCC":41599, "CCCCG":36870, "CCCGA":16688, "CCCGT":18103, "CCCGC":45255, "CCCGG":38317, + "CCGAA":7795, "CCGAT":7933, "CCGAC":27348, "CCGAG":21329, "CCGTA":5029, "CCGTT":11320, "CCGTC":12724, "CCGTG":24105, "CCGCA":19706, "CCGCT":29088, "CCGCC":54750, "CCGCG":32697, "CCGGA":15047, "CCGGT":13067, "CCGGC":31844, "CCGGG":34316, + "CGAAA":6689, "CGAAT":6613, "CGAAC":7514, 
"CGAAG":9458, "CGATA":4113, "CGATT":8487, "CGATC":21759, "CGATG":15070, "CGACA":11568, "CGACT":9424, "CGACC":12210, "CGACG":22922, "CGAGA":12362, "CGAGT":13367, "CGAGC":17015, "CGAGG":24021, + "CGTAA":4222, "CGTAT":5837, "CGTAC":4800, "CGTAG":6245, "CGTTA":6142, "CGTTT":11239, "CGTTC":8845, "CGTTG":17667, "CGTCA":6863, "CGTCT":11002, "CGTCC":13963, "CGTCG":6103, "CGTGA":9959, "CGTGT":12995, "CGTGC":16397, "CGTGG":50964, + "CGCAA":11871, "CGCAT":13332, "CGCAC":14783, "CGCAG":24019, "CGCTA":10566, "CGCTT":20057, "CGCTC":21746, "CGCTG":36683, "CGCCA":483325, "CGCCT":32660, "CGCCC":41085, "CGCCG":34813, "CGCGA":11953, "CGCGT":15911, "CGCGC":35171, "CGCGG":28951, + "CGGAA":11198, "CGGAT":8250, "CGGAC":11312, "CGGAG":18027, "CGGTA":6818, "CGGTT":10985, "CGGTC":10507, "CGGTG":24345, "CGGCA":16149, "CGGCT":22832, "CGGCC":32538, "CGGCG":29175, "CGGGA":20815, "CGGGT":593174, "CGGGC":29840, "CGGGG":33211, + "GAAAA":24817, "GAAAT":20739, "GAAAC":17062, "GAAAG":18556, "GAATA":11875, "GAATT":632098, "GAATC":12606, "GAATG":20121, "GAACA":14577, "GAACT":517081, "GAACC":17164, "GAACG":6109, "GAAGA":18206, "GAAGT":16535, "GAAGC":20610, "GAAGG":22629, + "GATAA":10292, "GATAT":10852, "GATAC":7996, "GATAG":9691, "GATTA":13160, "GATTT":18560, "GATTC":14862, "GATTG":17938, "GATCA":16046, "GATCT":19389, "GATCC":20140, "GATCG":8837, "GATGA":14331, "GATGT":14664, "GATGC":15659, "GATGG":47696, + "GACAA":14671, "GACAT":12196, "GACAC":16969, "GACAG":20999, "GACTA":9495, "GACTT":15660, "GACTC":16682, "GACTG":19882, "GACCA":15374, "GACCT":17618, "GACCC":22910, "GACCG":9949, "GACGA":20673, "GACGT":6288, "GACGC":12396, "GACGG":11805, + "GAGAA":22236, "GAGAT":18674, "GAGAC":20606, "GAGAG":28092, "GAGTA":11847, "GAGTT":24827, "GAGTC":18574, "GAGTG":30765, "GAGCA":21670, "GAGCT":26523, "GAGCC":33293, "GAGCG":18176, "GAGGA":28457, "GAGGT":29950, "GAGGC":46973, "GAGGG":38666, + "GTAAA":12736, "GTAAT":16401, "GTAAC":9856, "GTAAG":11491, "GTATA":9543, "GTATT":16492, "GTATC":8714, "GTATG":17992, 
"GTACA":11164, "GTACT":12562, "GTACC":10782, "GTACG":5520, "GTAGA":11338, "GTAGT":13090, "GTAGC":14100, "GTAGG":13180, + "GTTAA":11897, "GTTAT":13401, "GTTAC":11243, "GTTAG":12784, "GTTTA":13873, "GTTTT":27741, "GTTTC":18673, "GTTTG":29490, "GTTCA":20459, "GTTCT":24846, "GTTCC":17722, "GTTCG":6865, "GTTGA":13305, "GTTGT":18113, "GTTGC":20166, "GTTGG":62287, + "GTCAA":11429, "GTCAT":12405, "GTCAC":496745, "GTCAG":16697, "GTCTA":10352, "GTCTT":17742, "GTCTC":23908, "GTCTG":22926, "GTCCA":16680, "GTCCT":20258, "GTCCC":26742, "GTCCG":24885, "GTCGA":5492, "GTCGT":5262, "GTCGC":10196, "GTCGG":9079, + "GTGAA":16405, "GTGAT":18835, "GTGAC":15708, "GTGAG":25447, "GTGTA":11527, "GTGTT":21718, "GTGTC":16644, "GTGTG":41502, "GTGCA":25764, "GTGCT":32606, "GTGCC":588167, "GTGCG":18233, "GTGGA":156486, "GTGGT":32497, "GTGGC":36802, "GTGGG":37668, + "GCAAA":20015, "GCAAT":20263, "GCAAC":18741, "GCAAG":23343, "GCATA":11407, "GCATT":22219, "GCATC":15223, "GCATG":37750, "GCACA":21639, "GCACT":26638, "GCACC":26735, "GCACG":15202, "GCAGA":22284, "GCAGT":28911, "GCAGC":45041, "GCAGG":40406, + "GCTAA":15136, "GCTAT":18284, "GCTAC":19990, "GCTAG":18093, "GCTTA":13983, "GCTTT":27592, "GCTTC":25286, "GCTTG":43193, "GCTCA":25633, "GCTCT":26900, "GCTCC":38171, "GCTCG":14899, "GCTGA":24042, "GCTGT":23618, "GCTGC":38663, "GCTGG":105129, + "GCCAA":601690, "GCCAT":33211, "GCCAC":36026, "GCCAG":36611, "GCCTA":17279, "GCCTT":32597, "GCCTC":48379, "GCCTG":67680, "GCCCA":39958, "GCCCT":39002, "GCCCC":50423, "GCCCG":37502, "GCCGA":15650, "GCCGT":15089, "GCCGC":47554, "GCCGG":27214, + "GCGAA":8846, "GCGAT":13749, "GCGAC":14582, "GCGAG":21402, "GCGTA":6966, "GCGTT":15587, "GCGTC":12333, "GCGTG":35552, "GCGCA":20418, "GCGCT":30547, "GCGCC":38792, "GCGCG":32830, "GCGGA":16573, "GCGGT":19194, "GCGGC":38456, "GCGGG":35615, + "GGAAA":23836, "GGAAT":633027, "GGAAC":521260, "GGAAG":25491, "GGATA":9722, "GGATT":20796, "GGATC":14193, "GGATG":26264, "GGACA":16457, "GGACT":18120, "GGACC":19201, "GGACG":11445, 
"GGAGA":26690, "GGAGT":26400, "GGAGC":30601, "GGAGG":53083, + "GGTAA":12273, "GGTAT":15668, "GGTAC":12166, "GGTAG":15281, "GGTTA":13345, "GGTTT":24730, "GGTTC":20147, "GGTTG":34735, "GGTCA":16415, "GGTCT":23372, "GGTCC":21522, "GGTCG":9147, "GGTGA":23533, "GGTGT":26195, "GGTGC":595350, "GGTGG":92124, + "GGCAA":22919, "GGCAT":26591, "GGCAC":26709, "GGCAG":43958, "GGCTA":18293, "GGCTT":29848, "GGCTC":35992, "GGCTG":60231, "GGCCA":34285, "GGCCT":41493, "GGCCC":46107, "GGCCG":31623, "GGCGA":18270, "GGCGT":23260, "GGCGC":37773, "GGCGG":41648, + "GGGAA":26512, "GGGAT":23710, "GGGAC":23254, "GGGAG":47972, "GGGTA":15757, "GGGTT":29311, "GGGTC":23396, "GGGTG":624958, "GGGCA":34457, "GGGCT":41166, "GGGCC":46250, "GGGCG":36118, "GGGGA":33765, "GGGGT":36877, "GGGGC":49693, "GGGGG":61798 + }, + "overrepresented_sequences": { + "AAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":62, + "AAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":44, + "AACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":34, + "AACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "AACGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":44, + "AACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":36, + "AAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":53, + "AAGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":32, + "AAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":55, + "AAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":61, + "AAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":40, + "AAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":35, + "AAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":43, + "AAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":59, + "AAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":64, + "AATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "AATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":25, + "AATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":51, + "AATTCTCGGGTGCCAAGGAA":46, + "AATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":64, + "ACAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "ACACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":29, + "ACAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "ACATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":65, + 
"ACCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":36, + "ACCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":19, + "ACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":24, + "ACCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":44, + "ACCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "ACGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "ACGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":45, + "ACGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "ACGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":41, + "ACTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":29, + "ACTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":20, + "ACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":24, + "ACTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":20, + "ACTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":44, + "AGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":33, + "AGACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":34, + "AGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":30, + "AGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":55, + "AGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":47, + "AGCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":30, + "AGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":24, + "AGCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "AGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "AGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":57, + "AGCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":26, + "AGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":52, + "AGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "AGCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "AGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "AGCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":29, + "AGCTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":15, + "AGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":37, + "AGGAATTCTCGGGTGCCAAG":118, + "AGGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":29, + "AGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":86, + "AGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":38, + "AGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + "AGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":28, + "AGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":22, + "AGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":38, + "AGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":68, + 
"AGGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":31, + "AGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":37, + "AGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":26, + "AGGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":40, + "AGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":29, + "AGGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":38, + "AGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":59, + "AGTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":48, + "AGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "AGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":32, + "AGTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":43, + "AGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":45, + "AGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":57, + "ATAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":45, + "ATACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "ATAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":43, + "ATATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":57, + "ATCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "ATCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":56, + "ATCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "ATGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":56, + "ATGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":69, + "ATGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":135, + "ATGGAATTCTCGGGTGCCAA":5179, + "ATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT":58, + "ATGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "ATGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":37, + "ATGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":79, + "ATGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":59, + "ATTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "ATTCTCGGGTGCCAAGGAAC":43, + "ATTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "ATTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":32, + "ATTGGAATTCTCGGGTGCCA":1375, + "ATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":30, + "ATTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":61, + "ATTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":85, + "CAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":75, + "CAAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":10, + "CAACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":62, + "CAAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":31, + "CAAGGAACTCCAGTCACGCC":86, + 
"CAAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":78, + "CAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":18, + "CACACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":23, + "CACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "CACCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":48, + "CACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":31, + "CACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":48, + "CACCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":38, + "CACGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "CACGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":85, + "CACTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":22, + "CACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":40, + "CAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":46, + "CAGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":49, + "CAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":40, + "CAGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":49, + "CAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":39, + "CAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":60, + "CAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":50, + "CAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":49, + "CAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":30, + "CAGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + "CAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":20, + "CATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":74, + "CATCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":28, + "CATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":18, + "CATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":40, + "CATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":20, + "CCAAGGAACTCCAGTCACGC":38, + "CCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":31, + "CCAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":28, + "CCACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "CCACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":23, + "CCACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":53, + "CCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":19, + "CCACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":53, + "CCAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":39, + "CCAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":35, + "CCAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":35, + "CCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":31, + "CCAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + 
"CCATCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":31, + "CCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":37, + "CCATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":64, + "CCCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":49, + "CCCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":69, + "CCCAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":32, + "CCCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":46, + "CCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "CCCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":35, + "CCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":54, + "CCCCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":27, + "CCCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":38, + "CCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":44, + "CCCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":64, + "CCCCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":14, + "CCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":19, + "CCCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":57, + "CCCGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":34, + "CCCGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":27, + "CCCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":39, + "CCCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":47, + "CCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":17, + "CCCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":44, + "CCCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":23, + "CCCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":39, + "CCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":47, + "CCCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":58, + "CCGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":50, + "CCGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":41, + "CCGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":36, + "CCGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":23, + "CCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "CCGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":53, + "CCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":25, + "CCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":43, + "CCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":33, + "CCTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":27, + "CCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":68, + "CCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":7, + "CCTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":70, + 
"CCTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "CCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":31, + "CCTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":45, + "CGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":47, + "CGACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":31, + "CGAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":19, + "CGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":18, + "CGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":53, + "CGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "CGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "CGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":49, + "CGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":38, + "CGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":48, + "CGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "CGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":18, + "CGCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":36, + "CGCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + "CGCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":26, + "CGCTGCGATCTATTGAAAGTCAGCCCTCGACACAAGGGTT":24, + "CGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":39, + "CGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":39, + "CGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":29, + "CGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":61, + "CGGGTGCCAAGGAACTCCAG":57, + "CGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":29, + "CGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":31, + "CGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "CGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":47, + "CGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":16, + "CGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":36, + "CGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":41, + "CTAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":60, + "CTACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":57, + "CTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":51, + "CTAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":62, + "CTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":19, + "CTCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":43, + "CTCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "CTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":47, + "CTCGGGTGCCAAGGAACTCC":39, + "CTCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":30, + 
"CTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":26, + "CTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":35, + "CTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":54, + "CTGGAATTCTCGGGTGCCAA":6923, + "CTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT":35, + "CTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":36, + "CTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":32, + "CTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "CTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":47, + "CTTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":22, + "CTTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "CTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":15, + "CTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":16, + "CTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":17, + "GAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "GAATTCTCGGGTGCCAAGGA":39, + "GAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":44, + "GACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":25, + "GAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":66, + "GAGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":30, + "GAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":50, + "GAGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":27, + "GAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":33, + "GAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":27, + "GAGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":30, + "GAGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":42, + "GAGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":42, + "GAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":52, + "GAGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":34, + "GAGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":35, + "GAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":32, + "GAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":16, + "GAGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":29, + "GAGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":23, + "GAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":99, + "GATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":39, + "GATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":13, + "GATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":53, + "GATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":62, + "GCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":117, + "GCAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":32, + 
"GCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":75, + "GCAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":73, + "GCAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":44, + "GCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":61, + "GCAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":31, + "GCATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":23, + "GCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":24, + "GCATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":32, + "GCCAAGGAACTCCAGTCACG":39, + "GCCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":44, + "GCCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":88, + "GCCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":41, + "GCCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":50, + "GCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":86, + "GCCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":29, + "GCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":57, + "GCCCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":29, + "GCCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":64, + "GCCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "GCCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":38, + "GCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":16, + "GCCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "GCCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":62, + "GCGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":25, + "GCGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "GCGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":69, + "GCGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":63, + "GCGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":37, + "GCGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":60, + "GCGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":56, + "GCGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":31, + "GCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":44, + "GCGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":29, + "GCGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "GCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":65, + "GCGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + "GCTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":45, + "GCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":18, + "GCTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":27, + "GCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":64, + "GCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATC":20, + 
"GCTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":29, + "GCTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":62, + "GCTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":31, + "GCTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "GCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCA":6, + "GCTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "GCTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "GGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "GGAATTCTCGGGTGCCAAGG":162, + "GGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCG":5, + "GGAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":37, + "GGAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":29, + "GGAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":39, + "GGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":28, + "GGAGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":24, + "GGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":21, + "GGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":58, + "GGCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":30, + "GGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":17, + "GGCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":43, + "GGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":48, + "GGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":55, + "GGCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":24, + "GGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":27, + "GGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":45, + "GGCGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":50, + "GGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":39, + "GGCGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":19, + "GGCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":55, + "GGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":18, + "GGCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":47, + "GGCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":37, + "GGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":24, + "GGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":60, + "GGGAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC":27, + "GGGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":39, + "GGGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":43, + "GGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":65, + "GGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "GGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":36, + "GGGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":28, + 
"GGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":52, + "GGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":23, + "GGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":28, + "GGGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":47, + "GGGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":21, + "GGGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":35, + "GGGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":40, + "GGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":36, + "GGGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":24, + "GGGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":42, + "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG":20, + "GGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":85, + "GGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":17, + "GGGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":61, + "GGGTGCCAAGGAACTCCAGT":120, + "GGGTGCCTGGAATTCTCGGG":14, + "GGGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":46, + "GGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":13, + "GGGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + "GGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":69, + "GGTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":43, + "GGTGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":29, + "GGTGCCAAGGAACTCCAGTC":76, + "GGTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":34, + "GGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":63, + "GGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":6, + "GGTGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":28, + "GGTGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":38, + "GGTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":52, + "GGTGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":30, + "GGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":76, + "GGTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":38, + "GGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":16, + "GTAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "GTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":48, + "GTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":63, + "GTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":31, + "GTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":44, + "GTGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "GTGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":39, + "GTGCCAAGGAACTCCAGTCA":22, + "GTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":67, + 
"GTGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":46, + "GTGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":70, + "GTGGAATTCTCGGGTGCCAA":4479, + "GTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT":47, + "GTGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":28, + "GTGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "GTGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":24, + "GTGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":29, + "GTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "GTGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":28, + "GTGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "GTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":57, + "GTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":67, + "GTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":45, + "GTTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":43, + "GTTGGAATTCTCGGGTGCCA":1640, + "GTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":15, + "GTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":64, + "GTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":102, + "NNNNNNNNNNNNNNNNNNNN":317, + "TAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":45, + "TAACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":38, + "TAAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":40, + "TAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":63, + "TACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":53, + "TACCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":28, + "TACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":43, + "TACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":53, + "TACGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":40, + "TACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":53, + "TAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":29, + "TAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":54, + "TAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":94, + "TAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":39, + "TAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":55, + "TAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":56, + "TATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":53, + "TATCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":39, + "TATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":20, + "TATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":65, + "TATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":78, + 
"TCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "TCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":25, + "TCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "TCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":39, + "TCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":40, + "TCCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":23, + "TCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "TCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":55, + "TCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":33, + "TCGGGTGCCAAGGAACTCCA":111, + "TCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":39, + "TCTCGGGTGCCAAGGAACTC":75, + "TCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":16, + "TCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":22, + "TGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":39, + "TGAATTCTCGGGTGCCAAGG":200, + "TGACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":36, + "TGAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":30, + "TGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":35, + "TGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":71, + "TGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "TGCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":23, + "TGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":76, + "TGCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":16, + "TGCCAAGGAACTCCAGTCAC":31, + "TGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "TGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":67, + "TGCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":30, + "TGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":60, + "TGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":67, + "TGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":63, + "TGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "TGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "TGGAATTCTCGGGTGCCAAG":145, + "TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTC":131, + "TGGAATTCTCGGGTGCCTGG":135, + "TGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":72, + "TGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "TGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":77, + "TGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":38, + "TGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":29, + "TGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "TGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":47, + 
"TGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + "TGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":55, + "TGGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":48, + "TGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":70, + "TGGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":32, + "TGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":45, + "TGTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":11, + "TGTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "TGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":62, + "TGTGGAATTCTCGGGTGCCA":1254, + "TGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":13, + "TGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "TGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":89, + "TTAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "TTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":48, + "TTAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":37, + "TTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":63, + "TTCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "TTCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + "TTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":61, + "TTCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":43, + "TTCTCGGGTGCCAAGGAACT":27, + "TTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":36, + "TTGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":45, + "TTGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":32, + "TTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":58, + "TTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":36, + "TTGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":49, + "TTGGAATTCTCGGGTGCCAA":1870, + "TTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT":79, + "TTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":79, + "TTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":77, + "TTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":58, + "TTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":61, + "TTTGGAATTCTCGGGTGCCA":1831, + "TTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":15, + "TTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":72, + "TTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":58, + "TTTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":21, + "TTTTTTTTTTTTTTTTTTTT":247 } + }, + "read1_after_filtering": { + "total_reads": 497796, + "total_bases": 20903897, + "q20_bases": 20456786, + "q30_bases": 
20301335, + "total_cycles": 46, + "quality_curves": { + "A":[31.2294,31.5038,31.6412,31.6343,31.6499,35.3846,35.3174,35.179,29.499,34.3508,35.4831,34.1509,33.7906,34.9636,35.1017,35.154,35.1372,35.2852,35.5428,35.5214,35.5195,35.2927,35.4608,35.1304,35.1124,33.931,33.9263,35.4028,35.393,34.2387,34.2948,35.0348,35.0266,34.9663,34.9694,33.8772,35.2847,35.0588,34.9437,34.0013,35.0035,34.9558,35.0212,34.3869,34.4151,34.9608], + "T":[31.6989,31.7269,31.813,31.8339,31.8321,35.4149,35.399,35.4282,34.9054,35.5516,35.627,35.5148,35.4606,35.3368,35.1416,35.5041,35.1962,35.496,35.6486,35.6607,35.6564,35.4945,35.6356,35.5023,35.4418,35.351,35.3955,35.6073,35.6008,35.4272,35.391,35.3695,35.2359,35.3385,35.3526,35.3373,35.5239,35.2352,35.2943,35.2832,35.2925,35.1967,35.1907,35.2366,35.2946,34.995], + "C":[31.7481,31.7312,31.848,31.8605,31.8688,35.4977,35.4929,35.4678,35.7508,35.6658,35.6665,35.6329,35.5937,35.6439,35.5291,35.2843,35.4746,35.1805,35.6665,35.6512,35.6542,35.5468,35.6178,35.3836,35.3007,35.478,35.487,35.5555,35.5347,35.3807,35.3303,35.0579,35.301,35.2075,35.1599,35.3702,35.4638,35.2103,34.9253,35.1284,34.9168,35.1121,35.0812,34.9454,35.2082,35.079], + "G":[31.0861,31.5289,31.7017,31.7247,31.7458,35.3052,35.2381,35.2488,35.158,34.9126,35.2894,35.4298,35.4105,35.4955,35.4179,35.4867,35.4115,35.4819,35.4671,35.5159,35.541,35.4888,35.458,35.4847,35.4304,35.3145,35.3219,35.4564,35.4278,35.3371,35.3271,35.3734,35.3208,35.3572,35.3076,35.2383,35.3131,35.28,35.2836,35.1727,35.1823,35.0648,35.0097,35.0101,34.9348,34.7628], + "mean":[31.4479,31.6159,31.7443,31.7661,31.7768,35.3928,35.3461,35.3116,34.6842,35.213,35.53,35.2118,35.1055,35.3879,35.3052,35.3542,35.3199,35.3499,35.5851,35.5893,35.5931,35.4567,35.5421,35.3729,35.3181,35.0235,35.0473,35.5045,35.4878,35.0899,35.091,35.2035,35.2252,35.2164,35.1948,34.9685,35.3989,35.1962,35.1021,34.9027,35.0913,35.0845,35.0762,34.9036,34.9953,34.9608] + }, + "content_curves": { + 
"A":[0.253976,0.279982,0.219371,0.193756,0.193537,0.181438,0.21788,0.215568,0.0995167,0.153953,0.225056,0.237668,0.231153,0.202726,0.215622,0.201175,0.205791,0.221936,0.2024,0.22951,0.234452,0.247313,0.244454,0.24638,0.249719,0.24693,0.239311,0.241437,0.244322,0.254153,0.24345,0.237582,0.235991,0.23746,0.240977,0.241306,0.232234,0.241307,0.246552,0.242179,0.23706,0.232102,0.230151,0.230668,0.206967,0], + "T":[0.270717,0.29173,0.226378,0.21781,0.222348,0.241611,0.254074,0.296756,0.223855,0.278699,0.261923,0.207517,0.262352,0.26774,0.274976,0.234259,0.249685,0.233869,0.242524,0.226946,0.241071,0.241635,0.228428,0.232161,0.230344,0.234725,0.232644,0.232192,0.232769,0.230492,0.231413,0.244757,0.239029,0.23599,0.234994,0.233683,0.240108,0.23518,0.235476,0.23652,0.234816,0.247934,0.245417,0.242449,0.255816,0.353318], + "C":[0.257282,0.179696,0.209518,0.258819,0.246808,0.244803,0.196088,0.112403,0.246193,0.277178,0.287929,0.336712,0.308271,0.289333,0.283016,0.343673,0.295523,0.304177,0.294947,0.292919,0.266674,0.259374,0.271264,0.279362,0.272125,0.26832,0.268933,0.266767,0.26895,0.270127,0.281413,0.279585,0.283248,0.288293,0.282408,0.272267,0.278011,0.281316,0.279326,0.281201,0.280404,0.280907,0.279059,0.271034,0.279558,0.37276], + "G":[0.217653,0.248588,0.344734,0.329615,0.337307,0.332136,0.331955,0.375272,0.430435,0.290171,0.225092,0.218101,0.198218,0.240195,0.226384,0.220892,0.249002,0.240018,0.260125,0.250615,0.257776,0.251678,0.255839,0.242084,0.247805,0.249971,0.259048,0.259591,0.253943,0.245212,0.24371,0.238069,0.241722,0.238245,0.241617,0.252732,0.249644,0.24219,0.238641,0.2401,0.247717,0.239026,0.245357,0.255844,0.257645,0.273864], + 
"N":[0.000371638,4.01771e-06,0,0,0,1.20531e-05,2.00886e-06,0,0,0,0,2.00886e-06,6.02657e-06,6.02657e-06,2.00886e-06,2.00886e-06,0,0,4.03924e-06,1.01722e-05,2.66651e-05,0,1.45914e-05,1.26088e-05,8.49686e-06,5.38341e-05,6.30625e-05,1.32141e-05,1.57277e-05,1.59085e-05,1.37857e-05,6.96578e-06,9.38427e-06,1.1852e-05,4.79222e-06,1.21396e-05,2.45901e-06,7.46026e-06,5.03127e-06,0,2.58286e-06,3.10263e-05,1.55618e-05,5.27796e-06,1.41118e-05,5.7578e-05], + "GC":[0.474936,0.428284,0.554251,0.588434,0.584115,0.576939,0.528044,0.487676,0.676629,0.567349,0.513021,0.554814,0.506489,0.529528,0.509399,0.564565,0.544524,0.544195,0.555072,0.543534,0.52445,0.511052,0.527104,0.521446,0.519929,0.518291,0.527981,0.526358,0.522893,0.515339,0.525123,0.517655,0.52497,0.526538,0.524025,0.524999,0.527655,0.523506,0.517967,0.521301,0.528121,0.519933,0.524416,0.526878,0.537203,0.646624] + }, + "kmer_count": { + "AAAAA":46610, "AAAAT":28700, "AAAAC":22259, "AAAAG":21713, "AAATA":22141, "AAATT":20228, "AAATC":15053, "AAATG":19750, "AAACA":22423, "AAACT":18258, "AAACC":19258, "AAACG":5737, "AAAGA":20990, "AAAGT":18922, "AAAGC":17171, "AAAGG":19887, + "AATAA":18806, "AATAT":15902, "AATAC":12767, "AATAG":10077, "AATTA":15378, "AATTT":22531, "AATTC":31680, "AATTG":11588, "AATCA":14061, "AATCT":15431, "AATCC":17925, "AATCG":4295, "AATGA":15186, "AATGT":14220, "AATGC":13397, "AATGG":18390, + "AACAA":20243, "AACAT":15321, "AACAC":16771, "AACAG":16847, "AACTA":11194, "AACTT":14454, "AACTC":24131, "AACTG":15592, "AACCA":16833, "AACCT":17621, "AACCC":21338, "AACCG":6483, "AACGA":4936, "AACGT":4404, "AACGC":7046, "AACGG":6738, + "AAGAA":22188, "AAGAT":13833, "AAGAC":14980, "AAGAG":20116, "AAGTA":12357, "AAGTT":15574, "AAGTC":14481, "AAGTG":20311, "AAGCA":20027, "AAGCT":17222, "AAGCC":19834, "AAGCG":12097, "AAGGA":25731, "AAGGT":15830, "AAGGC":21737, "AAGGG":22673, + "ATAAA":19596, "ATAAT":14281, "ATAAC":8797, "ATAAG":9875, "ATATA":12993, "ATATT":15456, "ATATC":9353, "ATATG":11971, "ATACA":13999, 
"ATACT":11217, "ATACC":9007, "ATACG":3773, "ATAGA":8705, "ATAGT":10015, "ATAGC":9503, "ATAGG":10283, + "ATTAA":14352, "ATTAT":14266, "ATTAC":12700, "ATTAG":11427, "ATTTA":16869, "ATTTT":33807, "ATTTC":18768, "ATTTG":17058, "ATTCA":15491, "ATTCT":36286, "ATTCC":17372, "ATTCG":5383, "ATTGA":9584, "ATTGT":12566, "ATTGC":12804, "ATTGG":15730, + "ATCAA":13114, "ATCAT":13355, "ATCAC":15417, "ATCAG":12870, "ATCTA":11341, "ATCTT":17514, "ATCTC":20677, "ATCTG":16957, "ATCCA":17661, "ATCCT":19003, "ATCCC":22037, "ATCCG":7655, "ATCGA":4428, "ATCGT":4811, "ATCGC":7740, "ATCGG":5571, + "ATGAA":14924, "ATGAT":13269, "ATGAC":10544, "ATGAG":15535, "ATGTA":11138, "ATGTT":17601, "ATGTC":11778, "ATGTG":18623, "ATGCA":15714, "ATGCT":17072, "ATGCC":18818, "ATGCG":8315, "ATGGA":19182, "ATGGT":18041, "ATGGC":21954, "ATGGG":21406, + "ACAAA":24025, "ACAAT":13192, "ACAAC":13179, "ACAAG":15778, "ACATA":11452, "ACATT":15516, "ACATC":11638, "ACATG":16677, "ACACA":23637, "ACACT":16325, "ACACC":20401, "ACACG":8829, "ACAGA":20723, "ACAGT":20088, "ACAGC":22826, "ACAGG":24717, + "ACTAA":11972, "ACTAT":10245, "ACTAC":11511, "ACTAG":9771, "ACTTA":10897, "ACTTT":21195, "ACTTC":16259, "ACTTG":15901, "ACTCA":17773, "ACTCT":17090, "ACTCC":32533, "ACTCG":7919, "ACTGA":15115, "ACTGT":14711, "ACTGC":21611, "ACTGG":19001, + "ACCAA":16387, "ACCAT":16264, "ACCAC":22252, "ACCAG":20647, "ACCTA":11658, "ACCTT":17505, "ACCTC":25472, "ACCTG":23793, "ACCCA":22880, "ACCCT":20882, "ACCCC":24071, "ACCCG":16051, "ACCGA":6595, "ACCGT":6268, "ACCGC":14547, "ACCGG":10635, + "ACGAA":5418, "ACGAT":18439, "ACGAC":5825, "ACGAG":7978, "ACGTA":3673, "ACGTT":5763, "ACGTC":5781, "ACGTG":8454, "ACGCA":8091, "ACGCT":9038, "ACGCC":21986, "ACGCG":9298, "ACGGA":7956, "ACGGT":7044, "ACGGC":11346, "ACGGG":12714, + "AGAAA":25899, "AGAAT":17478, "AGAAC":14613, "AGAAG":21007, "AGATA":10601, "AGATT":13657, "AGATC":12357, "AGATG":17096, "AGACA":18670, "AGACT":15440, "AGACC":17200, "AGACG":8409, "AGAGA":23922, "AGAGT":19714, "AGAGC":21347, 
"AGAGG":26342, + "AGTAA":11838, "AGTAT":10463, "AGTAC":9139, "AGTAG":13762, "AGTTA":11224, "AGTTT":19193, "AGTTC":20518, "AGTTG":14762, "AGTCA":20871, "AGTCT":15553, "AGTCC":30027, "AGTCG":5339, "AGTGA":18253, "AGTGT":15330, "AGTGC":21325, "AGTGG":26812, + "AGCAA":19938, "AGCAT":16045, "AGCAC":21974, "AGCAG":28163, "AGCTA":18094, "AGCTT":19010, "AGCTC":23546, "AGCTG":28150, "AGCCA":27996, "AGCCT":33938, "AGCCC":32563, "AGCCG":18905, "AGCGA":13060, "AGCGT":9214, "AGCGC":19956, "AGCGG":17475, + "AGGAA":30058, "AGGAT":16322, "AGGAC":16395, "AGGAG":32535, "AGGTA":13279, "AGGTT":19774, "AGGTC":16102, "AGGTG":27456, "AGGCA":31002, "AGGCT":32549, "AGGCC":30503, "AGGCG":23324, "AGGGA":24349, "AGGGT":21536, "AGGGC":30696, "AGGGG":32146, + "TAAAA":24785, "TAAAT":17082, "TAAAC":11942, "TAAAG":13445, "TAATA":11814, "TAATT":16753, "TAATC":13809, "TAATG":12262, "TAACA":12738, "TAACT":11629, "TAACC":9901, "TAACG":4019, "TAAGA":12543, "TAAGT":11781, "TAAGC":11272, "TAAGG":13235, + "TATAA":12923, "TATAT":14071, "TATAC":8722, "TATAG":8554, "TATTA":12249, "TATTT":25587, "TATTC":12835, "TATTG":12604, "TATCA":10382, "TATCT":13682, "TATCC":11269, "TATCG":3166, "TATGA":10336, "TATGT":13792, "TATGC":11707, "TATGG":15420, + "TACAA":14140, "TACAT":12669, "TACAC":12392, "TACAG":22386, "TACTA":10934, "TACTT":14821, "TACTC":13734, "TACTG":13573, "TACCA":14244, "TACCT":15294, "TACCC":12280, "TACCG":5970, "TACGA":4047, "TACGT":4755, "TACGC":5771, "TACGG":6942, + "TAGAA":13440, "TAGAT":9208, "TAGAC":7969, "TAGAG":13900, "TAGTA":9761, "TAGTT":12685, "TAGTC":11847, "TAGTG":13575, "TAGCA":13724, "TAGCT":16434, "TAGCC":15065, "TAGCG":8364, "TAGGA":13863, "TAGGT":12581, "TAGGC":15720, "TAGGG":16845, + "TTAAA":21481, "TTAAT":14938, "TTAAC":11141, "TTAAG":13177, "TTATA":12492, "TTATT":20214, "TTATC":10943, "TTATG":13136, "TTACA":16602, "TTACT":13829, "TTACC":13894, "TTACG":4527, "TTAGA":11431, "TTAGT":12788, "TTAGC":13894, "TTAGG":15463, + "TTTAA":21783, "TTTAT":20462, "TTTAC":13982, "TTTAG":14764, 
"TTTTA":25400, "TTTTT":225648, "TTTTC":25437, "TTTTG":25242, "TTTCA":23029, "TTTCT":28670, "TTTCC":25854, "TTTCG":6420, "TTTGA":16399, "TTTGT":22435, "TTTGC":18195, "TTTGG":27175, + "TTCAA":20120, "TTCAT":17467, "TTCAC":19485, "TTCAG":22133, "TTCTA":19535, "TTCTT":25812, "TTCTC":42134, "TTCTG":23445, "TTCCA":24484, "TTCCT":28389, "TTCCC":27973, "TTCCG":10223, "TTCGA":5001, "TTCGT":5284, "TTCGC":7642, "TTCGG":9481, + "TTGAA":16399, "TTGAT":11870, "TTGAC":9685, "TTGAG":18178, "TTGTA":13138, "TTGTT":20666, "TTGTC":13276, "TTGTG":20894, "TTGCA":17705, "TTGCT":21242, "TTGCC":21549, "TTGCG":10028, "TTGGA":22304, "TTGGT":18894, "TTGGC":23836, "TTGGG":29429, + "TCAAA":19715, "TCAAT":13326, "TCAAC":11951, "TCAAG":17816, "TCATA":10083, "TCATT":16654, "TCATC":13453, "TCATG":15489, "TCACA":19355, "TCACT":21913, "TCACC":25380, "TCACG":15348, "TCAGA":20107, "TCAGT":15810, "TCAGC":25142, "TCAGG":23154, + "TCTAA":12540, "TCTAT":12270, "TCTAC":17057, "TCTAG":12975, "TCTTA":12501, "TCTTT":23402, "TCTTC":22315, "TCTTG":19668, "TCTCA":22799, "TCTCT":28441, "TCTCC":33344, "TCTCG":28379, "TCTGA":17941, "TCTGT":20424, "TCTGC":23687, "TCTGG":23934, + "TCCAA":18476, "TCCAT":19120, "TCCAC":23004, "TCCAG":36145, "TCCTA":13893, "TCCTT":24137, "TCCTC":30683, "TCCTG":31647, "TCCCA":37728, "TCCCT":28634, "TCCCC":33189, "TCCCG":21087, "TCCGA":21141, "TCCGT":7556, "TCCGC":19017, "TCCGG":14917, + "TCGAA":5158, "TCGAT":4470, "TCGAC":4770, "TCGAG":8178, "TCGTA":3143, "TCGTT":5517, "TCGTC":5722, "TCGTG":7485, "TCGCA":6848, "TCGCT":10763, "TCGCC":15238, "TCGCG":9217, "TCGGA":6441, "TCGGT":7506, "TCGGC":15415, "TCGGG":27401, + "TGAAA":19389, "TGAAT":14153, "TGAAC":14565, "TGAAG":15676, "TGATA":9177, "TGATT":14092, "TGATC":14643, "TGATG":14860, "TGACA":15003, "TGACT":15222, "TGACC":15652, "TGACG":5750, "TGAGA":19955, "TGAGT":16143, "TGAGC":22307, "TGAGG":27341, + "TGTAA":16261, "TGTAT":13514, "TGTAC":9898, "TGTAG":12603, "TGTTA":12684, "TGTTT":23488, "TGTTC":16425, "TGTTG":19812, "TGTCA":14790, 
"TGTCT":19955, "TGTCC":18111, "TGTCG":6716, "TGTGA":17053, "TGTGT":24603, "TGTGC":21242, "TGTGG":28263, + "TGCAA":19191, "TGCAT":15849, "TGCAC":20935, "TGCAG":29596, "TGCTA":14471, "TGCTT":22835, "TGCTC":21761, "TGCTG":30730, "TGCCA":31120, "TGCCT":33007, "TGCCC":30040, "TGCCG":15063, "TGCGA":8303, "TGCGT":10346, "TGCGC":16806, "TGCGG":16820, + "TGGAA":35706, "TGGAT":14878, "TGGAC":13576, "TGGAG":27216, "TGGTA":13920, "TGGTT":19909, "TGGTC":18137, "TGGTG":30485, "TGGCA":26279, "TGGCT":30993, "TGGCC":33070, "TGGCG":20150, "TGGGA":31478, "TGGGT":26907, "TGGGC":34424, "TGGGG":36577, + "CAAAA":26109, "CAAAT":17885, "CAAAC":17690, "CAAAG":21870, "CAATA":13046, "CAATT":14803, "CAATC":11651, "CAATG":15342, "CAACA":19824, "CAACT":15405, "CAACC":17674, "CAACG":7129, "CAAGA":19322, "CAAGT":18176, "CAAGC":20384, "CAAGG":30405, + "CATAA":11131, "CATAT":11722, "CATAC":9414, "CATAG":10025, "CATTA":12039, "CATTT":24183, "CATTC":16743, "CATTG":15233, "CATCA":14423, "CATCT":20545, "CATCC":18433, "CATCG":6109, "CATGA":14816, "CATGT":18572, "CATGC":19499, "CATGG":26519, + "CACAA":19088, "CACAT":17429, "CACAC":25556, "CACAG":27190, "CACTA":12165, "CACTT":22176, "CACTC":23959, "CACTG":27362, "CACCA":29398, "CACCT":30522, "CACCC":30570, "CACCG":16322, "CACGA":8047, "CACGT":9326, "CACGC":23699, "CACGG":14064, + "CAGAA":22762, "CAGAT":14763, "CAGAC":18541, "CAGAG":31329, "CAGTA":12127, "CAGTT":17803, "CAGTC":29224, "CAGTG":27465, "CAGCA":31238, "CAGCT":34173, "CAGCC":49618, "CAGCG":22301, "CAGGA":30046, "CAGGT":24907, "CAGGC":40559, "CAGGG":34693, + "CTAAA":15016, "CTAAT":12181, "CTAAC":10341, "CTAAG":12862, "CTATA":9721, "CTATT":14457, "CTATC":9533, "CTATG":13267, "CTACA":19831, "CTACT":18063, "CTACC":14781, "CTACG":7414, "CTAGA":12412, "CTAGT":12312, "CTAGC":14135, "CTAGG":17522, + "CTTAA":13197, "CTTAT":11506, "CTTAC":11863, "CTTAG":12514, "CTTTA":15080, "CTTTT":26238, "CTTTC":22776, "CTTTG":22771, "CTTCA":20639, "CTTCT":25786, "CTTCC":31490, "CTTCG":8846, "CTTGA":16470, "CTTGT":17940, 
"CTTGC":18680, "CTTGG":29608, + "CTCAA":19638, "CTCAT":15430, "CTCAC":27011, "CTCAG":32182, "CTCTA":14620, "CTCTT":21445, "CTCTC":31107, "CTCTG":30769, "CTCCA":39573, "CTCCT":37874, "CTCCC":48458, "CTCCG":22568, "CTCGA":7637, "CTCGT":7729, "CTCGC":16494, "CTCGG":34956, + "CTGAA":18707, "CTGAT":12845, "CTGAC":18432, "CTGAG":28683, "CTGTA":17418, "CTGTT":18210, "CTGTC":19677, "CTGTG":23388, "CTGCA":29239, "CTGCT":29042, "CTGCC":39268, "CTGCG":17500, "CTGGA":31710, "CTGGT":19509, "CTGGC":31643, "CTGGG":44233, + "CCAAA":23048, "CCAAT":19199, "CCAAC":18853, "CCAAG":31340, "CCATA":10583, "CCATT":19585, "CCATC":20853, "CCATG":24545, "CCACA":25443, "CCACT":26414, "CCACC":37338, "CCACG":17195, "CCAGA":24920, "CCAGT":26701, "CCAGC":48517, "CCAGG":44845, + "CCTAA":12225, "CCTAT":11242, "CCTAC":14151, "CCTAG":15541, "CCTTA":12833, "CCTTT":22258, "CCTTC":26595, "CCTTG":23943, "CCTCA":29799, "CCTCT":31493, "CCTCC":50437, "CCTCG":18442, "CCTGA":22813, "CCTGT":24644, "CCTGC":34380, "CCTGG":44212, + "CCCAA":25769, "CCCAT":19165, "CCCAC":32187, "CCCAG":52975, "CCCTA":13159, "CCCTT":22385, "CCCTC":32403, "CCCTG":34001, "CCCCA":37400, "CCCCT":26872, "CCCCC":33284, "CCCCG":31802, "CCCGA":15336, "CCCGT":11231, "CCCGC":36821, "CCCGG":35060, + "CCGAA":7000, "CCGAT":5491, "CCGAC":26034, "CCGAG":19198, "CCGTA":4188, "CCGTT":7410, "CCGTC":11745, "CCGTG":12740, "CCGCA":16666, "CCGCT":20461, "CCGCC":47960, "CCGCG":28277, "CCGGA":13888, "CCGGT":9785, "CCGGC":29886, "CCGGG":32021, + "CGAAA":5498, "CGAAT":4131, "CGAAC":6988, "CGAAG":8142, "CGATA":3186, "CGATT":5937, "CGATC":21157, "CGATG":7894, "CGACA":8390, "CGACT":7768, "CGACC":11374, "CGACG":21761, "CGAGA":11069, "CGAGT":9397, "CGAGC":14369, "CGAGG":20123, + "CGTAA":3434, "CGTAT":3970, "CGTAC":4119, "CGTAG":5626, "CGTTA":5062, "CGTTT":7983, "CGTTC":7975, "CGTTG":8527, "CGTCA":6080, "CGTCT":9705, "CGTCC":12884, "CGTCG":5854, "CGTGA":8634, "CGTGT":9410, "CGTGC":12908, "CGTGG":17047, + "CGCAA":7508, "CGCAT":7821, "CGCAC":12923, "CGCAG":21499, 
"CGCTA":7773, "CGCTT":12876, "CGCTC":20036, "CGCTG":20404, "CGCCA":26646, "CGCCT":24959, "CGCCC":33524, "CGCCG":32687, "CGCGA":9837, "CGCGT":10266, "CGCGC":28604, "CGCGG":26706, + "CGGAA":8822, "CGGAT":6483, "CGGAC":11105, "CGGAG":17092, "CGGTA":6061, "CGGTT":7992, "CGGTC":9771, "CGGTG":15385, "CGGCA":13878, "CGGCT":19486, "CGGCC":30938, "CGGCG":27206, "CGGGA":18792, "CGGGT":26511, "CGGGC":28120, "CGGGG":29560, + "GAAAA":22580, "GAAAT":15706, "GAAAC":15752, "GAAAG":17198, "GAATA":9969, "GAATT":30680, "GAATC":11831, "GAATG":12974, "GAACA":13377, "GAACT":21410, "GAACC":16216, "GAACG":5652, "GAAGA":17064, "GAAGT":13719, "GAAGC":18698, "GAAGG":20090, + "GATAA":9037, "GATAT":8430, "GATAC":6928, "GATAG":8126, "GATTA":11953, "GATTT":15359, "GATTC":13672, "GATTG":9830, "GATCA":15337, "GATCT":18177, "GATCC":19329, "GATCG":8391, "GATGA":12658, "GATGT":11677, "GATGC":12634, "GATGG":19958, + "GACAA":12429, "GACAT":10466, "GACAC":15713, "GACAG":20014, "GACTA":8919, "GACTT":13978, "GACTC":15594, "GACTG":15260, "GACCA":14399, "GACCT":16345, "GACCC":21692, "GACCG":9466, "GACGA":19825, "GACGT":4818, "GACGC":11340, "GACGG":11090, + "GAGAA":21097, "GAGAT":15438, "GAGAC":19728, "GAGAG":24375, "GAGTA":10466, "GAGTT":19859, "GAGTC":17183, "GAGTG":19282, "GAGCA":18817, "GAGCT":21126, "GAGCC":29351, "GAGCG":15296, "GAGGA":24826, "GAGGT":22631, "GAGGC":38857, "GAGGG":31869, + "GTAAA":11339, "GTAAT":13749, "GTAAC":8193, "GTAAG":10008, "GTATA":8518, "GTATT":13267, "GTATC":8069, "GTATG":10575, "GTACA":10094, "GTACT":10455, "GTACC":9249, "GTACG":4593, "GTAGA":10759, "GTAGT":11420, "GTAGC":12888, "GTAGG":12073, + "GTTAA":10611, "GTTAT":10463, "GTTAC":9969, "GTTAG":11168, "GTTTA":12542, "GTTTT":23539, "GTTTC":17572, "GTTTG":16880, "GTTCA":19278, "GTTCT":21852, "GTTCC":16078, "GTTCG":6089, "GTTGA":11472, "GTTGT":14213, "GTTGC":16413, "GTTGG":20787, + "GTCAA":9999, "GTCAT":10568, "GTCAC":21277, "GTCAG":16005, "GTCTA":9541, "GTCTT":15514, "GTCTC":23250, "GTCTG":17158, "GTCCA":15198, "GTCCT":17652, 
"GTCCC":24201, "GTCCG":23293, "GTCGA":5026, "GTCGT":4307, "GTCGC":9445, "GTCGG":8212, + "GTGAA":14019, "GTGAT":15396, "GTGAC":13686, "GTGAG":22219, "GTGTA":10279, "GTGTT":16678, "GTGTC":15199, "GTGTG":27672, "GTGCA":21846, "GTGCT":22500, "GTGCC":32105, "GTGCG":14750, "GTGGA":22711, "GTGGT":25455, "GTGGC":30915, "GTGGG":32368, + "GCAAA":16613, "GCAAT":14298, "GCAAC":16571, "GCAAG":19561, "GCATA":9304, "GCATT":16132, "GCATC":13888, "GCATG":20766, "GCACA":19344, "GCACT":22576, "GCACC":24681, "GCACG":12852, "GCAGA":21105, "GCAGT":23444, "GCAGC":39585, "GCAGG":35807, + "GCTAA":12753, "GCTAT":12758, "GCTAC":16976, "GCTAG":14826, "GCTTA":11662, "GCTTT":20500, "GCTTC":22522, "GCTTG":21523, "GCTCA":24143, "GCTCT":23836, "GCTCC":35858, "GCTCG":13683, "GCTGA":22664, "GCTGT":19635, "GCTGC":35188, "GCTGG":42071, + "GCCAA":31067, "GCCAT":21704, "GCCAC":30482, "GCCAG":32297, "GCCTA":14022, "GCCTT":23298, "GCCTC":46026, "GCCTG":39839, "GCCCA":32473, "GCCCT":28084, "GCCCC":42598, "GCCCG":31818, "GCCGA":14039, "GCCGT":10760, "GCCGC":43337, "GCCGG":26172, + "GCGAA":7044, "GCGAT":9512, "GCGAC":13459, "GCGAG":17868, "GCGTA":5693, "GCGTT":10270, "GCGTC":11498, "GCGTG":18047, "GCGCA":16357, "GCGCT":20217, "GCGCC":33481, "GCGCG":27591, "GCGGA":15167, "GCGGT":14659, "GCGGC":36089, "GCGGG":31990, + "GGAAA":20953, "GGAAT":30925, "GGAAC":21852, "GGAAG":23252, "GGATA":8647, "GGATT":17460, "GGATC":13684, "GGATG":15969, "GGACA":16033, "GGACT":16323, "GGACC":18759, "GGACG":10984, "GGAGA":25065, "GGAGT":21520, "GGAGC":26844, "GGAGG":45156, + "GGTAA":10861, "GGTAT":12050, "GGTAC":10518, "GGTAG":13924, "GGTTA":11832, "GGTTT":19585, "GGTTC":18660, "GGTTG":18269, "GGTCA":15243, "GGTCT":21029, "GGTCC":19908, "GGTCG":8593, "GGTGA":19948, "GGTGT":19741, "GGTGC":35505, "GGTGG":40445, + "GGCAA":19543, "GGCAT":19169, "GGCAC":24469, "GGCAG":39056, "GGCTA":15507, "GGCTT":21183, "GGCTC":34439, "GGCTG":41812, "GGCCA":29972, "GGCCT":32815, "GGCCC":41317, "GGCCG":29331, "GGCGA":15397, "GGCGT":15050, 
"GGCGC":31836, "GGCGG":37829, + "GGGAA":22626, "GGGAT":18953, "GGGAC":22458, "GGGAG":40896, "GGGTA":13410, "GGGTT":21198, "GGGTC":21709, "GGGTG":43077, "GGGCA":29834, "GGGCT":31786, "GGGCC":41130, "GGGCG":29675, "GGGGA":29903, "GGGGT":26212, "GGGGC":40630, "GGGGG":39682 + }, + "overrepresented_sequences": { + "AATTCTCGGGTGCCAAGGAA":16, + "CAAGGAACTCCAGTCACGCC":46, + "CGCTGCGATCTATTGAAAGTCAGCCCTCGACACAAGGGTT":41, + "CGGGTGCCAAGGAACTCCAG":19, + "CTCGGGTGCCAAGGAACTCC":21, + "GCCAAGGAACTCCAGTCACG":12, + "GGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCG":2, + "GGGTGCCAAGGAACTCCAGT":31, + "GGTGCCAAGGAACTCCAGTC":42, + "TCGGGTGCCAAGGAACTCCA":21, + "TCTCGGGTGCCAAGGAACTC":49, + "TTCTCGGGTGCCAAGGAACT":19, + "TTTTTTTTTTTTTTTTTTTT":300 } + }, + "command": "fastp --overrepresentation_analysis --thread 1 --in1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R1.fastq --adapter_sequence TGGAATTCTCGGGTGCCAAGG --length_required 18 --html /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.html --json /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.json --report_title tutorial --stdout " +} \ No newline at end of file diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.txt b/docs/files/examples/tutorial/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.txt new file mode 100644 index 0000000..97a5c30 --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/fastp/tutorial_R1_fastp_adapter.txt @@ -0,0 +1,29 @@ +Streaming uncompressed output to STDOUT... 
+ +Read1 before filtering: +total reads: 1000000 +total bases: 37678766 +Q20 bases: 34709953(92.1207%) +Q30 bases: 34018346(90.2852%) + +Read1 after filtering: +total reads: 473502 +total bases: 16881839 +Q20 bases: 16178706(95.835%) +Q30 bases: 15892953(94.1423%) + +Filtering result: +reads passed filter: 473502 +reads failed due to low quality: 66799 +reads failed due to too many N: 105 +reads failed due to too short: 459594 +reads with adapter trimmed: 540203 +bases trimmed due to adapters: 14696777 + +Duplication rate (may be overestimated since this is SE data): 28.6979% + +JSON report: /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.json +HTML report: /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.html + +fastp --overrepresentation_analysis --thread 1 --in1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2.fastq --adapter_sequence GATCGTCGGACTGTAGAACTCTGAAC --length_required 18 --html /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.html --json /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.json --report_title tutorial --stdout +fastp v0.19.4, time used: 7 seconds diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.html b/docs/files/examples/tutorial/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.html new file mode 100644 index 0000000..54baf7f --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.html @@ -0,0 +1,2725 @@ +fastp report at 2019-11-27 13:55:42 + + + +
+

tutorial +
+ +
+
General
+
+ + + + + + +
fastp version:0.19.4 (https://github.com/OpenGene/fastp)
sequencing:single end (38 cycles)
mean length before filtering:37bp
mean length after filtering:35bp
duplication rate:28.697855% (may be overestimated since this is SE data)
+
+
Before filtering
+
+ + + + + + +
total reads:1000.000000 K
total bases:37.678766 M
Q20 bases:34.709953 M (92.120727%)
Q30 bases:34.018346 M (90.285191%)
GC content:51.863917%
+
+
After filtering
+
+ + + + + + +
total reads:473.502000 K
total bases:16.881839 M
Q20 bases:16.178706 M (95.834974%)
Q30 bases:15.892953 M (94.142309%)
GC content:52.756835%
+
+
Filtering result
+
+ + + + + +
reads passed filters:473.502000 K (47.350200%)
reads with low quality:66.799000 K (6.679900%)
reads with too many N:105 (0.010500%)
reads too short:459.594000 K (45.959400%)
+
+
+
+
+ +
+
Adapter or bad ligation of read1
+
+ + + + + + + + + + +
SequenceOccurrences
GATCGTCGG14299
GATCGTCGGAC6547
GATCGTCGGACT5538
GATCGTCGGACTG5881
GATCGTCGGACTGT17243
GATCGTCGGACTGTAGAACTCTGAACGTG7306
GATCGTCGGACTGTAGAACTCTGAACGTGT404556
other adapter sequences78833
+
+
+
+
+ +
+
+
+
+ + +
+
+
+ +
+ +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Darker background means larger counts. The count will be shown on mouse over.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AAATACAGTATTTCTGCACTCCCGGAGTGCGG
AAAAAAAAAAAATAAAACAAAAGAAATAAAATTAAATCAAATGAAACAAAACTAAACCAAACGAAAGAAAAGTAAAGCAAAGG
AATAATAAAATATAATACAATAGAATTAAATTTAATTCAATTGAATCAAATCTAATCCAATCGAATGAAATGTAATGCAATGG
AACAACAAAACATAACACAACAGAACTAAACTTAACTCAACTGAACCAAACCTAACCCAACCGAACGAAACGTAACGCAACGG
AAGAAGAAAAGATAAGACAAGAGAAGTAAAGTTAAGTCAAGTGAAGCAAAGCTAAGCCAAGCGAAGGAAAGGTAAGGCAAGGG
ATAATAAAATAATATAACATAAGATATAATATTATATCATATGATACAATACTATACCATACGATAGAATAGTATAGCATAGG
ATTATTAAATTATATTACATTAGATTTAATTTTATTTCATTTGATTCAATTCTATTCCATTCGATTGAATTGTATTGCATTGG
ATCATCAAATCATATCACATCAGATCTAATCTTATCTCATCTGATCCAATCCTATCCCATCCGATCGAATCGTATCGCATCGG
ATGATGAAATGATATGACATGAGATGTAATGTTATGTCATGTGATGCAATGCTATGCCATGCGATGGAATGGTATGGCATGGG
ACAACAAAACAATACAACACAAGACATAACATTACATCACATGACACAACACTACACCACACGACAGAACAGTACAGCACAGG
ACTACTAAACTATACTACACTAGACTTAACTTTACTTCACTTGACTCAACTCTACTCCACTCGACTGAACTGTACTGCACTGG
ACCACCAAACCATACCACACCAGACCTAACCTTACCTCACCTGACCCAACCCTACCCCACCCGACCGAACCGTACCGCACCGG
ACGACGAAACGATACGACACGAGACGTAACGTTACGTCACGTGACGCAACGCTACGCCACGCGACGGAACGGTACGGCACGGG
AGAAGAAAAGAATAGAACAGAAGAGATAAGATTAGATCAGATGAGACAAGACTAGACCAGACGAGAGAAGAGTAGAGCAGAGG
AGTAGTAAAGTATAGTACAGTAGAGTTAAGTTTAGTTCAGTTGAGTCAAGTCTAGTCCAGTCGAGTGAAGTGTAGTGCAGTGG
AGCAGCAAAGCATAGCACAGCAGAGCTAAGCTTAGCTCAGCTGAGCCAAGCCTAGCCCAGCCGAGCGAAGCGTAGCGCAGCGG
AGGAGGAAAGGATAGGACAGGAGAGGTAAGGTTAGGTCAGGTGAGGCAAGGCTAGGCCAGGCGAGGGAAGGGTAGGGCAGGGG
TAATAAAATAAATTAAACTAAAGTAATATAATTTAATCTAATGTAACATAACTTAACCTAACGTAAGATAAGTTAAGCTAAGG
TATTATAATATATTATACTATAGTATTATATTTTATTCTATTGTATCATATCTTATCCTATCGTATGATATGTTATGCTATGG
TACTACAATACATTACACTACAGTACTATACTTTACTCTACTGTACCATACCTTACCCTACCGTACGATACGTTACGCTACGG
TAGTAGAATAGATTAGACTAGAGTAGTATAGTTTAGTCTAGTGTAGCATAGCTTAGCCTAGCGTAGGATAGGTTAGGCTAGGG
TTATTAAATTAATTTAACTTAAGTTATATTATTTTATCTTATGTTACATTACTTTACCTTACGTTAGATTAGTTTAGCTTAGG
TTTTTTAATTTATTTTACTTTAGTTTTATTTTTTTTTCTTTTGTTTCATTTCTTTTCCTTTCGTTTGATTTGTTTTGCTTTGG
TTCTTCAATTCATTTCACTTCAGTTCTATTCTTTTCTCTTCTGTTCCATTCCTTTCCCTTCCGTTCGATTCGTTTCGCTTCGG
TTGTTGAATTGATTTGACTTGAGTTGTATTGTTTTGTCTTGTGTTGCATTGCTTTGCCTTGCGTTGGATTGGTTTGGCTTGGG
TCATCAAATCAATTCAACTCAAGTCATATCATTTCATCTCATGTCACATCACTTCACCTCACGTCAGATCAGTTCAGCTCAGG
TCTTCTAATCTATTCTACTCTAGTCTTATCTTTTCTTCTCTTGTCTCATCTCTTCTCCTCTCGTCTGATCTGTTCTGCTCTGG
TCCTCCAATCCATTCCACTCCAGTCCTATCCTTTCCTCTCCTGTCCCATCCCTTCCCCTCCCGTCCGATCCGTTCCGCTCCGG
TCGTCGAATCGATTCGACTCGAGTCGTATCGTTTCGTCTCGTGTCGCATCGCTTCGCCTCGCGTCGGATCGGTTCGGCTCGGG
TGATGAAATGAATTGAACTGAAGTGATATGATTTGATCTGATGTGACATGACTTGACCTGACGTGAGATGAGTTGAGCTGAGG
TGTTGTAATGTATTGTACTGTAGTGTTATGTTTTGTTCTGTTGTGTCATGTCTTGTCCTGTCGTGTGATGTGTTGTGCTGTGG
TGCTGCAATGCATTGCACTGCAGTGCTATGCTTTGCTCTGCTGTGCCATGCCTTGCCCTGCCGTGCGATGCGTTGCGCTGCGG
TGGTGGAATGGATTGGACTGGAGTGGTATGGTTTGGTCTGGTGTGGCATGGCTTGGCCTGGCGTGGGATGGGTTGGGCTGGGG
CAACAAAACAAATCAAACCAAAGCAATACAATTCAATCCAATGCAACACAACTCAACCCAACGCAAGACAAGTCAAGCCAAGG
CATCATAACATATCATACCATAGCATTACATTTCATTCCATTGCATCACATCTCATCCCATCGCATGACATGTCATGCCATGG
CACCACAACACATCACACCACAGCACTACACTTCACTCCACTGCACCACACCTCACCCCACCGCACGACACGTCACGCCACGG
CAGCAGAACAGATCAGACCAGAGCAGTACAGTTCAGTCCAGTGCAGCACAGCTCAGCCCAGCGCAGGACAGGTCAGGCCAGGG
CTACTAAACTAATCTAACCTAAGCTATACTATTCTATCCTATGCTACACTACTCTACCCTACGCTAGACTAGTCTAGCCTAGG
CTTCTTAACTTATCTTACCTTAGCTTTACTTTTCTTTCCTTTGCTTCACTTCTCTTCCCTTCGCTTGACTTGTCTTGCCTTGG
CTCCTCAACTCATCTCACCTCAGCTCTACTCTTCTCTCCTCTGCTCCACTCCTCTCCCCTCCGCTCGACTCGTCTCGCCTCGG
CTGCTGAACTGATCTGACCTGAGCTGTACTGTTCTGTCCTGTGCTGCACTGCTCTGCCCTGCGCTGGACTGGTCTGGCCTGGG
CCACCAAACCAATCCAACCCAAGCCATACCATTCCATCCCATGCCACACCACTCCACCCCACGCCAGACCAGTCCAGCCCAGG
CCTCCTAACCTATCCTACCCTAGCCTTACCTTTCCTTCCCTTGCCTCACCTCTCCTCCCCTCGCCTGACCTGTCCTGCCCTGG
CCCCCCAACCCATCCCACCCCAGCCCTACCCTTCCCTCCCCTGCCCCACCCCTCCCCCCCCCGCCCGACCCGTCCCGCCCCGG
CCGCCGAACCGATCCGACCCGAGCCGTACCGTTCCGTCCCGTGCCGCACCGCTCCGCCCCGCGCCGGACCGGTCCGGCCCGGG
CGACGAAACGAATCGAACCGAAGCGATACGATTCGATCCGATGCGACACGACTCGACCCGACGCGAGACGAGTCGAGCCGAGG
CGTCGTAACGTATCGTACCGTAGCGTTACGTTTCGTTCCGTTGCGTCACGTCTCGTCCCGTCGCGTGACGTGTCGTGCCGTGG
CGCCGCAACGCATCGCACCGCAGCGCTACGCTTCGCTCCGCTGCGCCACGCCTCGCCCCGCCGCGCGACGCGTCGCGCCGCGG
CGGCGGAACGGATCGGACCGGAGCGGTACGGTTCGGTCCGGTGCGGCACGGCTCGGCCCGGCGCGGGACGGGTCGGGCCGGGG
GAAGAAAAGAAATGAAACGAAAGGAATAGAATTGAATCGAATGGAACAGAACTGAACCGAACGGAAGAGAAGTGAAGCGAAGG
GATGATAAGATATGATACGATAGGATTAGATTTGATTCGATTGGATCAGATCTGATCCGATCGGATGAGATGTGATGCGATGG
GACGACAAGACATGACACGACAGGACTAGACTTGACTCGACTGGACCAGACCTGACCCGACCGGACGAGACGTGACGCGACGG
GAGGAGAAGAGATGAGACGAGAGGAGTAGAGTTGAGTCGAGTGGAGCAGAGCTGAGCCGAGCGGAGGAGAGGTGAGGCGAGGG
GTAGTAAAGTAATGTAACGTAAGGTATAGTATTGTATCGTATGGTACAGTACTGTACCGTACGGTAGAGTAGTGTAGCGTAGG
GTTGTTAAGTTATGTTACGTTAGGTTTAGTTTTGTTTCGTTTGGTTCAGTTCTGTTCCGTTCGGTTGAGTTGTGTTGCGTTGG
GTCGTCAAGTCATGTCACGTCAGGTCTAGTCTTGTCTCGTCTGGTCCAGTCCTGTCCCGTCCGGTCGAGTCGTGTCGCGTCGG
GTGGTGAAGTGATGTGACGTGAGGTGTAGTGTTGTGTCGTGTGGTGCAGTGCTGTGCCGTGCGGTGGAGTGGTGTGGCGTGGG
GCAGCAAAGCAATGCAACGCAAGGCATAGCATTGCATCGCATGGCACAGCACTGCACCGCACGGCAGAGCAGTGCAGCGCAGG
GCTGCTAAGCTATGCTACGCTAGGCTTAGCTTTGCTTCGCTTGGCTCAGCTCTGCTCCGCTCGGCTGAGCTGTGCTGCGCTGG
GCCGCCAAGCCATGCCACGCCAGGCCTAGCCTTGCCTCGCCTGGCCCAGCCCTGCCCCGCCCGGCCGAGCCGTGCCGCGCCGG
GCGGCGAAGCGATGCGACGCGAGGCGTAGCGTTGCGTCGCGTGGCGCAGCGCTGCGCCGCGCGGCGGAGCGGTGCGGCGCGGG
GGAGGAAAGGAATGGAACGGAAGGGATAGGATTGGATCGGATGGGACAGGACTGGACCGGACGGGAGAGGAGTGGAGCGGAGG
GGTGGTAAGGTATGGTACGGTAGGGTTAGGTTTGGTTCGGTTGGGTCAGGTCTGGTCCGGTCGGGTGAGGTGTGGTGCGGTGG
GGCGGCAAGGCATGGCACGGCAGGGCTAGGCTTGGCTCGGCTGGGCCAGGCCTGGCCCGGCCGGGCGAGGCGTGGCGCGGCGG
GGGGGGAAGGGATGGGACGGGAGGGGTAGGGTTGGGTCGGGTGGGGCAGGGCTGGGCCGGGCGGGGGAGGGGTGGGGCGGGGG
+
+ +
+
Sampling rate: 1 / 20
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
overrepresented sequencecount (% of bases)distribution: cycle 1 ~ cycle 38
AAAAAAAAAAAAAAAAAAAA78 (0.082805%)
AAAGATCGTCGGACTGTAGA233 (0.247354%)
AACGATCGTCGGACTGTAGA304 (0.322728%)
AAGGATCGTCGGACTGTAGA292 (0.309989%)
AATGATCGTCGGACTGTAGA322 (0.341837%)
ACAGATCGTCGGACTGTAGA183 (0.194274%)
ACCGATCGTCGGACTGTAGA203 (0.215506%)
ACCTGATCGTCGGACTGTAG144 (0.152871%)
ACGGATCGTCGGACTGTAGA198 (0.210198%)
ACTGATCGTCGGACTGTAGA162 (0.171980%)
ACTGTAGAACTCTGAACGTG18 (0.019109%)
AGATCGTCGGACTGTAGAAC38 (0.040341%)
AGCGATCGTCGGACTGTAGA427 (0.453306%)
AGCTGATCGTCGGACTGTAG152 (0.161364%)
AGGGATCGTCGGACTGTAGA291 (0.308927%)
AGTGATCGTCGGACTGTAGA232 (0.246293%)
ATAGATCGTCGGACTGTAGA294 (0.312112%)
ATCGATCGTCGGACTGTAGA188 (0.199582%)
ATCGTCGGACTGTAGAACTC36 (0.038218%)
ATGATCGTCGGACTGTAGAA17 (0.018047%)
ATGGATCGTCGGACTGTAGA204 (0.216568%)
ATTGATCGTCGGACTGTAGA306 (0.324851%)
CAAAGATCGTCGGACTGTAG136 (0.144378%)
CAAGATCGTCGGACTGTAGA191 (0.202767%)
CAAGGATCGTCGGACTGTAG138 (0.146502%)
CACGATCGTCGGACTGTAGA248 (0.263278%)
CACTGATCGTCGGACTGTAG182 (0.193212%)
CAGGATCGTCGGACTGTAGA105 (0.111469%)
CATCGATCGTCGGACTGTAG120 (0.127393%)
CATGATCGTCGGACTGTAGA191 (0.202767%)
CATGGATCGTCGGACTGTAG132 (0.140132%)
CCAAGATCGTCGGACTGTAG225 (0.238861%)
CCAGATCGTCGGACTGTAGA137 (0.145440%)
CCAGGATCGTCGGACTGTAG183 (0.194274%)
CCATGATCGTCGGACTGTAG187 (0.198520%)
CCCAGATCGTCGGACTGTAG212 (0.225060%)
CCCCTGATCGTCGGACTGTA138 (0.146502%)
CCCGATCGTCGGACTGTAGA334 (0.354576%)
CCCGGATCGTCGGACTGTAG115 (0.122085%)
CCCTGATCGTCGGACTGTAG251 (0.266463%)
CCGATCGTCGGACTGTAGAA92 (0.097668%)
CCGGATCGTCGGACTGTAGA108 (0.114653%)
CCTAGATCGTCGGACTGTAG178 (0.188966%)
CCTCGATCGTCGGACTGTAG171 (0.181535%)
CCTGGATCGTCGGACTGTAG256 (0.271771%)
CCTTGATCGTCGGACTGTAG195 (0.207013%)
CGAGATCGTCGGACTGTAGA136 (0.144378%)
CGATCGTCGGACTGTAGAAC26 (0.027602%)
CGCGATCGTCGGACTGTAGA212 (0.225060%)
CGCGGATCGTCGGACTGTAG115 (0.122085%)
CGCTGATCGTCGGACTGTAG134 (0.142255%)
CGGACTGTAGAACTCTGAAC77 (0.081744%)
CGGGATCGTCGGACTGTAGA243 (0.257970%)
CGGGGATCGTCGGACTGTAG125 (0.132701%)
CGTCGGACTGTAGAACTCTG68 (0.072189%)
CGTGATCGTCGGACTGTAGA250 (0.265401%)
CTAGATCGTCGGACTGTAGA168 (0.178350%)
CTAGGATCGTCGGACTGTAG113 (0.119961%)
CTCGATCGTCGGACTGTAGA322 (0.341837%)
CTCTGATCGTCGGACTGTAG144 (0.152871%)
CTGCGATCGTCGGACTGTAG124 (0.131639%)
CTGGATCGTCGGACTGTAGA162 (0.171980%)
CTGGGATCGTCGGACTGTAG227 (0.240985%)
CTGTGATCGTCGGACTGTAG134 (0.142255%)
CTTGATCGTCGGACTGTAGA205 (0.217629%)
CTTGGCACCCGAGAATTCCA146 (0.154994%)
CTTTGATCGTCGGACTGTAG123 (0.130578%)
GAAGATCGTCGGACTGTAGA264 (0.280264%)
GACGATCGTCGGACTGTAGA186 (0.197459%)
GACTGTAGAACTCTGAACGT20 (0.021232%)
GAGATCGTCGGACTGTAGAA95 (0.100853%)
GAGGATCGTCGGACTGTAGA116 (0.123146%)
GATCGTCGGACTGTAGAACT140 (0.148625%)
GATCGTCGGACTGTAGAACTCTGAACGTGTAGATCT247 (0.471990%)
GATGATCGTCGGACTGTAGA189 (0.200644%)
GCAAGATCGTCGGACTGTAG157 (0.166672%)
GCACGATCGTCGGACTGTAG149 (0.158179%)
GCAGATCGTCGGACTGTAGA194 (0.205952%)
GCAGGATCGTCGGACTGTAG165 (0.175165%)
GCATGATCGTCGGACTGTAG149 (0.158179%)
GCCAGATCGTCGGACTGTAG195 (0.207013%)
GCCGATCGTCGGACTGTAGA255 (0.270710%)
GCCTGATCGTCGGACTGTAG198 (0.210198%)
GCGCGATCGTCGGACTGTAG167 (0.177288%)
GCGGATCGTCGGACTGTAGA214 (0.227184%)
GCGGGATCGTCGGACTGTAG231 (0.245231%)
GCTAGATCGTCGGACTGTAG148 (0.157118%)
GCTGGATCGTCGGACTGTAG227 (0.240985%)
GCTTGATCGTCGGACTGTAG142 (0.150748%)
GGACTGTAGAACTCTGAACG77 (0.081744%)
GGAGATCGTCGGACTGTAGA161 (0.170919%)
GGAGGATCGTCGGACTGTAG136 (0.144378%)
GGATCGTCGGACTGTAGAAC19 (0.020171%)
GGCAGATCGTCGGACTGTAG150 (0.159241%)
GGCGATCGTCGGACTGTAGA267 (0.283449%)
GGCGGATCGTCGGACTGTAG210 (0.222937%)
GGCTGATCGTCGGACTGTAG193 (0.204890%)
GGGAGATCGTCGGACTGTAG164 (0.174103%)
GGGCGATCGTCGGACTGTAG241 (0.255847%)
GGGGATCGTCGGACTGTAGA111 (0.117838%)
GGGGGATCGTCGGACTGTAG273 (0.289818%)
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG607 (1.159911%)
GGTGATCGTCGGACTGTAGA358 (0.380055%)
GGTGGATCGTCGGACTGTAG163 (0.173042%)
GTAGATCGTCGGACTGTAGA294 (0.312112%)
GTCGATCGTCGGACTGTAGA151 (0.160302%)
GTCGGACTGTAGAACTCTGA90 (0.095545%)
GTGGATCGTCGGACTGTAGA224 (0.237800%)
GTGGGATCGTCGGACTGTAG135 (0.143317%)
GTTGATCGTCGGACTGTAGA232 (0.246293%)
NNNNNNNNNNNNNNNNNNNN829 (0.880071%)
TAAGATCGTCGGACTGTAGA318 (0.337591%)
TACGATCGTCGGACTGTAGA217 (0.230368%)
TAGATCGTCGGACTGTAGAA12 (0.012739%)
TAGGATCGTCGGACTGTAGA222 (0.235677%)
TATGATCGTCGGACTGTAGA234 (0.248416%)
TCAGATCGTCGGACTGTAGA147 (0.156056%)
TCCTGATCGTCGGACTGTAG146 (0.154994%)
TCGGACTGTAGAACTCTGAA85 (0.090237%)
TCGGATCGTCGGACTGTAGA211 (0.223999%)
TCGTCGGACTGTAGAACTCT51 (0.054142%)
TCTGATCGTCGGACTGTAGA171 (0.181535%)
TGAGATCGTCGGACTGTAGA201 (0.213383%)
TGATCGTCGGACTGTAGAAC30 (0.031848%)
TGCGATCGTCGGACTGTAGA264 (0.280264%)
TGCTGATCGTCGGACTGTAG149 (0.158179%)
TGGGATCGTCGGACTGTAGA195 (0.207013%)
TGGGGATCGTCGGACTGTAG181 (0.192151%)
TGTGATCGTCGGACTGTAGA216 (0.229307%)
TTAGATCGTCGGACTGTAGA258 (0.273894%)
TTCGATCGTCGGACTGTAGA189 (0.200644%)
TTGGATCGTCGGACTGTAGA318 (0.337591%)
TTTGATCGTCGGACTGTAGA146 (0.154994%)
+
+ +
+
+
+ +
+ +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Darker background means larger counts. The count will be shown on mouse over.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AAATACAGTATTTCTGCACTCCCGGAGTGCGG
AAAAAAAAAAAATAAAACAAAAGAAATAAAATTAAATCAAATGAAACAAAACTAAACCAAACGAAAGAAAAGTAAAGCAAAGG
AATAATAAAATATAATACAATAGAATTAAATTTAATTCAATTGAATCAAATCTAATCCAATCGAATGAAATGTAATGCAATGG
AACAACAAAACATAACACAACAGAACTAAACTTAACTCAACTGAACCAAACCTAACCCAACCGAACGAAACGTAACGCAACGG
AAGAAGAAAAGATAAGACAAGAGAAGTAAAGTTAAGTCAAGTGAAGCAAAGCTAAGCCAAGCGAAGGAAAGGTAAGGCAAGGG
ATAATAAAATAATATAACATAAGATATAATATTATATCATATGATACAATACTATACCATACGATAGAATAGTATAGCATAGG
ATTATTAAATTATATTACATTAGATTTAATTTTATTTCATTTGATTCAATTCTATTCCATTCGATTGAATTGTATTGCATTGG
ATCATCAAATCATATCACATCAGATCTAATCTTATCTCATCTGATCCAATCCTATCCCATCCGATCGAATCGTATCGCATCGG
ATGATGAAATGATATGACATGAGATGTAATGTTATGTCATGTGATGCAATGCTATGCCATGCGATGGAATGGTATGGCATGGG
ACAACAAAACAATACAACACAAGACATAACATTACATCACATGACACAACACTACACCACACGACAGAACAGTACAGCACAGG
ACTACTAAACTATACTACACTAGACTTAACTTTACTTCACTTGACTCAACTCTACTCCACTCGACTGAACTGTACTGCACTGG
ACCACCAAACCATACCACACCAGACCTAACCTTACCTCACCTGACCCAACCCTACCCCACCCGACCGAACCGTACCGCACCGG
ACGACGAAACGATACGACACGAGACGTAACGTTACGTCACGTGACGCAACGCTACGCCACGCGACGGAACGGTACGGCACGGG
AGAAGAAAAGAATAGAACAGAAGAGATAAGATTAGATCAGATGAGACAAGACTAGACCAGACGAGAGAAGAGTAGAGCAGAGG
AGTAGTAAAGTATAGTACAGTAGAGTTAAGTTTAGTTCAGTTGAGTCAAGTCTAGTCCAGTCGAGTGAAGTGTAGTGCAGTGG
AGCAGCAAAGCATAGCACAGCAGAGCTAAGCTTAGCTCAGCTGAGCCAAGCCTAGCCCAGCCGAGCGAAGCGTAGCGCAGCGG
AGGAGGAAAGGATAGGACAGGAGAGGTAAGGTTAGGTCAGGTGAGGCAAGGCTAGGCCAGGCGAGGGAAGGGTAGGGCAGGGG
TAATAAAATAAATTAAACTAAAGTAATATAATTTAATCTAATGTAACATAACTTAACCTAACGTAAGATAAGTTAAGCTAAGG
TATTATAATATATTATACTATAGTATTATATTTTATTCTATTGTATCATATCTTATCCTATCGTATGATATGTTATGCTATGG
TACTACAATACATTACACTACAGTACTATACTTTACTCTACTGTACCATACCTTACCCTACCGTACGATACGTTACGCTACGG
TAGTAGAATAGATTAGACTAGAGTAGTATAGTTTAGTCTAGTGTAGCATAGCTTAGCCTAGCGTAGGATAGGTTAGGCTAGGG
TTATTAAATTAATTTAACTTAAGTTATATTATTTTATCTTATGTTACATTACTTTACCTTACGTTAGATTAGTTTAGCTTAGG
TTTTTTAATTTATTTTACTTTAGTTTTATTTTTTTTTCTTTTGTTTCATTTCTTTTCCTTTCGTTTGATTTGTTTTGCTTTGG
TTCTTCAATTCATTTCACTTCAGTTCTATTCTTTTCTCTTCTGTTCCATTCCTTTCCCTTCCGTTCGATTCGTTTCGCTTCGG
TTGTTGAATTGATTTGACTTGAGTTGTATTGTTTTGTCTTGTGTTGCATTGCTTTGCCTTGCGTTGGATTGGTTTGGCTTGGG
TCATCAAATCAATTCAACTCAAGTCATATCATTTCATCTCATGTCACATCACTTCACCTCACGTCAGATCAGTTCAGCTCAGG
TCTTCTAATCTATTCTACTCTAGTCTTATCTTTTCTTCTCTTGTCTCATCTCTTCTCCTCTCGTCTGATCTGTTCTGCTCTGG
TCCTCCAATCCATTCCACTCCAGTCCTATCCTTTCCTCTCCTGTCCCATCCCTTCCCCTCCCGTCCGATCCGTTCCGCTCCGG
TCGTCGAATCGATTCGACTCGAGTCGTATCGTTTCGTCTCGTGTCGCATCGCTTCGCCTCGCGTCGGATCGGTTCGGCTCGGG
TGATGAAATGAATTGAACTGAAGTGATATGATTTGATCTGATGTGACATGACTTGACCTGACGTGAGATGAGTTGAGCTGAGG
TGTTGTAATGTATTGTACTGTAGTGTTATGTTTTGTTCTGTTGTGTCATGTCTTGTCCTGTCGTGTGATGTGTTGTGCTGTGG
TGCTGCAATGCATTGCACTGCAGTGCTATGCTTTGCTCTGCTGTGCCATGCCTTGCCCTGCCGTGCGATGCGTTGCGCTGCGG
TGGTGGAATGGATTGGACTGGAGTGGTATGGTTTGGTCTGGTGTGGCATGGCTTGGCCTGGCGTGGGATGGGTTGGGCTGGGG
CAACAAAACAAATCAAACCAAAGCAATACAATTCAATCCAATGCAACACAACTCAACCCAACGCAAGACAAGTCAAGCCAAGG
CATCATAACATATCATACCATAGCATTACATTTCATTCCATTGCATCACATCTCATCCCATCGCATGACATGTCATGCCATGG
CACCACAACACATCACACCACAGCACTACACTTCACTCCACTGCACCACACCTCACCCCACCGCACGACACGTCACGCCACGG
CAGCAGAACAGATCAGACCAGAGCAGTACAGTTCAGTCCAGTGCAGCACAGCTCAGCCCAGCGCAGGACAGGTCAGGCCAGGG
CTACTAAACTAATCTAACCTAAGCTATACTATTCTATCCTATGCTACACTACTCTACCCTACGCTAGACTAGTCTAGCCTAGG
CTTCTTAACTTATCTTACCTTAGCTTTACTTTTCTTTCCTTTGCTTCACTTCTCTTCCCTTCGCTTGACTTGTCTTGCCTTGG
CTCCTCAACTCATCTCACCTCAGCTCTACTCTTCTCTCCTCTGCTCCACTCCTCTCCCCTCCGCTCGACTCGTCTCGCCTCGG
CTGCTGAACTGATCTGACCTGAGCTGTACTGTTCTGTCCTGTGCTGCACTGCTCTGCCCTGCGCTGGACTGGTCTGGCCTGGG
CCACCAAACCAATCCAACCCAAGCCATACCATTCCATCCCATGCCACACCACTCCACCCCACGCCAGACCAGTCCAGCCCAGG
CCTCCTAACCTATCCTACCCTAGCCTTACCTTTCCTTCCCTTGCCTCACCTCTCCTCCCCTCGCCTGACCTGTCCTGCCCTGG
CCCCCCAACCCATCCCACCCCAGCCCTACCCTTCCCTCCCCTGCCCCACCCCTCCCCCCCCCGCCCGACCCGTCCCGCCCCGG
CCGCCGAACCGATCCGACCCGAGCCGTACCGTTCCGTCCCGTGCCGCACCGCTCCGCCCCGCGCCGGACCGGTCCGGCCCGGG
CGACGAAACGAATCGAACCGAAGCGATACGATTCGATCCGATGCGACACGACTCGACCCGACGCGAGACGAGTCGAGCCGAGG
CGTCGTAACGTATCGTACCGTAGCGTTACGTTTCGTTCCGTTGCGTCACGTCTCGTCCCGTCGCGTGACGTGTCGTGCCGTGG
CGCCGCAACGCATCGCACCGCAGCGCTACGCTTCGCTCCGCTGCGCCACGCCTCGCCCCGCCGCGCGACGCGTCGCGCCGCGG
CGGCGGAACGGATCGGACCGGAGCGGTACGGTTCGGTCCGGTGCGGCACGGCTCGGCCCGGCGCGGGACGGGTCGGGCCGGGG
GAAGAAAAGAAATGAAACGAAAGGAATAGAATTGAATCGAATGGAACAGAACTGAACCGAACGGAAGAGAAGTGAAGCGAAGG
GATGATAAGATATGATACGATAGGATTAGATTTGATTCGATTGGATCAGATCTGATCCGATCGGATGAGATGTGATGCGATGG
GACGACAAGACATGACACGACAGGACTAGACTTGACTCGACTGGACCAGACCTGACCCGACCGGACGAGACGTGACGCGACGG
GAGGAGAAGAGATGAGACGAGAGGAGTAGAGTTGAGTCGAGTGGAGCAGAGCTGAGCCGAGCGGAGGAGAGGTGAGGCGAGGG
GTAGTAAAGTAATGTAACGTAAGGTATAGTATTGTATCGTATGGTACAGTACTGTACCGTACGGTAGAGTAGTGTAGCGTAGG
GTTGTTAAGTTATGTTACGTTAGGTTTAGTTTTGTTTCGTTTGGTTCAGTTCTGTTCCGTTCGGTTGAGTTGTGTTGCGTTGG
GTCGTCAAGTCATGTCACGTCAGGTCTAGTCTTGTCTCGTCTGGTCCAGTCCTGTCCCGTCCGGTCGAGTCGTGTCGCGTCGG
GTGGTGAAGTGATGTGACGTGAGGTGTAGTGTTGTGTCGTGTGGTGCAGTGCTGTGCCGTGCGGTGGAGTGGTGTGGCGTGGG
GCAGCAAAGCAATGCAACGCAAGGCATAGCATTGCATCGCATGGCACAGCACTGCACCGCACGGCAGAGCAGTGCAGCGCAGG
GCTGCTAAGCTATGCTACGCTAGGCTTAGCTTTGCTTCGCTTGGCTCAGCTCTGCTCCGCTCGGCTGAGCTGTGCTGCGCTGG
GCCGCCAAGCCATGCCACGCCAGGCCTAGCCTTGCCTCGCCTGGCCCAGCCCTGCCCCGCCCGGCCGAGCCGTGCCGCGCCGG
GCGGCGAAGCGATGCGACGCGAGGCGTAGCGTTGCGTCGCGTGGCGCAGCGCTGCGCCGCGCGGCGGAGCGGTGCGGCGCGGG
GGAGGAAAGGAATGGAACGGAAGGGATAGGATTGGATCGGATGGGACAGGACTGGACCGGACGGGAGAGGAGTGGAGCGGAGG
GGTGGTAAGGTATGGTACGGTAGGGTTAGGTTTGGTTCGGTTGGGTCAGGTCTGGTCCGGTCGGGTGAGGTGTGGTGCGGTGG
GGCGGCAAGGCATGGCACGGCAGGGCTAGGCTTGGCTCGGCTGGGCCAGGCCTGGCCCGGCCGGGCGAGGCGTGGCGCGGCGG
GGGGGGAAGGGATGGGACGGGAGGGGTAGGGTTGGGTCGGGTGGGGCAGGGCTGGGCCGGGCGGGGGAGGGGTGGGGCGGGGG
+
+ +
+
Sampling rate: 1 / 20
+ + + + + + +
overrepresented sequencecount (% of bases)distribution: cycle 1 ~ cycle 38
AAAAAAAAAAAAAAAAAAAA39 (0.092407%)
CTTGGCACCCGAGAATTCCA162 (0.383844%)
GTCGGACTGTAGAACTCTGA33 (0.078191%)
TCGGACTGTAGAACTCTGAA62 (0.146903%)
+
+ +
+
+ +

+ \ No newline at end of file diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.json b/docs/files/examples/tutorial/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.json new file mode 100644 index 0000000..9481ab5 --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.json @@ -0,0 +1,358 @@ +{ + "summary": { + "before_filtering": { + "total_reads":1000000, + "total_bases":37678766, + "q20_bases":34709953, + "q30_bases":34018346, + "q20_rate":0.921207, + "q30_rate":0.902852, + "read1_mean_length":37, + "gc_content":0.518639 + }, + "after_filtering": { + "total_reads":473502, + "total_bases":16881839, + "q20_bases":16178706, + "q30_bases":15892953, + "q20_rate":0.95835, + "q30_rate":0.941423, + "read1_mean_length":35, + "gc_content":0.527568 + } + }, + "filtering_result": { + "passed_filter_reads": 473502, + "low_quality_reads": 66799, + "too_many_N_reads": 105, + "too_short_reads": 459594, + "too_long_reads": 0 + }, + "duplication": { + "rate": 0.286979, + "histogram": [437812,16546,8852,5892,3938,2938,2123,1537,1168,918,718,552,445,368,315,274,220,184,152,137,120,87,94,90,79,61,49,39,34,34,295], + "mean_gc": [0.449334,0.458295,0.463191,0.461576,0.462608,0.464017,0.468745,0.467297,0.462957,0.463121,0.46369,0.469295,0.4685,0.457651,0.458724,0.475097,0.462531,0.457651,0.462126,0.454845,0.463007,0.469732,0.473467,0.462789,0.461703,0.471681,0.455542,0.464957,0.471742,0.467013,0.471545] + }, + "adapter_cutting": { + "adapter_trimmed_reads": 540203, + "adapter_trimmed_bases": 14696777, + "read1_adapter_sequence": "GATCGTCGGACTGTAGAACTCTGAAC", + "read1_adapter_counts": {"GATCGTCGG":14299, "GATCGTCGGAC":6547, "GATCGTCGGACT":5538, "GATCGTCGGACTG":5881, "GATCGTCGGACTGT":17243, "GATCGTCGGACTGTAGAACTCTGAACGTG":7306, "GATCGTCGGACTGTAGAACTCTGAACGTGT":404556, "others":78833} + }, + "read1_before_filtering": { + "total_reads": 1000000, + "total_bases": 37678766, + 
"q20_bases": 34709953, + "q30_bases": 34018346, + "total_cycles": 38, + "quality_curves": { + "A":[30.1023,29.9909,30.3424,30.3731,30.3951,33.7563,33.8171,33.7964,33.6769,34.2461,32.0416,32.2345,33.3671,31.4002,32.1319,33.9531,33.8651,34.3952,32.5273,31.4524,33.4457,31.626,34.0713,33.6001,34.1499,34.3216,32.0106,31.1379,31.4294,31.0495,33.149,34.0174,34.3201,32.2479,33.1344,30.9166,32.6897,34.06], + "T":[30.9601,30.7757,30.8734,30.8686,30.8783,34.3984,34.4052,34.4866,34.7508,34.6364,34.4516,34.4478,34.6493,34.0627,34.3648,34.7592,34.7462,34.717,34.5812,34.1128,34.6857,34.1296,34.6354,34.7421,34.6366,34.6568,34.4945,34.254,34.1913,33.9201,34.6827,34.6854,34.6539,34.5645,34.5903,33.921,34.6256,34.0429], + "C":[31.0626,30.9065,30.9966,31.0003,31.0258,34.5989,34.5798,34.6138,34.8344,34.713,34.6296,34.3718,34.7863,34.5195,34.3702,34.7335,34.8514,34.793,34.5575,34.5447,34.835,34.5791,34.7124,34.8038,34.7131,34.6841,34.6826,34.4275,34.1128,34.3422,34.7444,34.718,34.6531,34.6465,34.7181,34.5319,34.7889,34.3973], + "G":[30.5229,30.3815,30.5511,30.5992,30.5904,34.0087,34.0169,34.0197,33.9322,33.8345,34.3164,34.2746,33.6745,34.1624,34.1708,33.8907,34.0021,33.9368,34.3762,34.1284,33.6978,34.1745,33.7549,33.9866,33.8954,33.9569,34.3227,34.1765,34.0206,33.835,33.573,33.7451,33.8577,34.2241,33.6588,33.9831,33.671,34.0425], + "mean":[30.1568,30.0866,30.2723,30.2905,30.2976,33.7199,33.7054,33.7193,33.6292,33.7876,33.6127,33.5872,33.4341,33.3069,33.5387,33.6331,33.6975,33.9122,33.7498,33.3164,33.4659,33.324,33.6714,33.629,33.7399,33.8571,33.7149,33.356,33.2514,33.0513,33.3197,33.645,33.8238,33.7259,33.3852,33.6214,33.8441,34.06] + }, + "content_curves": { + "A":[0.314654,0.237083,0.219982,0.220298,0.215974,0.21456,0.233767,0.22907,0.137049,0.552003,0.141955,0.130698,0.12543,0.128143,0.132591,0.129813,0.141034,0.548306,0.138182,0.136471,0.133899,0.150379,0.552425,0.159106,0.562527,0.555956,0.14243,0.132202,0.133747,0.14431,0.14802,0.555425,0.5493,0.13679,0.12267,0.120523,0.10077,0], 
+ "T":[0.194407,0.21312,0.219779,0.222531,0.216976,0.212131,0.226769,0.238401,0.123604,0.128531,0.541415,0.130619,0.137744,0.545369,0.124522,0.118915,0.120976,0.120502,0.130283,0.54418,0.13973,0.534456,0.124401,0.118367,0.121606,0.121829,0.132758,0.539629,0.136758,0.534435,0.132143,0.119065,0.121794,0.131477,0.14243,0.561881,0.156361,0.675335], + "C":[0.185744,0.249803,0.268543,0.269994,0.273495,0.28595,0.241631,0.219478,0.135682,0.129244,0.140163,0.554351,0.137444,0.137638,0.552921,0.145261,0.138754,0.143537,0.556207,0.136811,0.133065,0.129864,0.127386,0.132757,0.127862,0.147126,0.550559,0.148018,0.540736,0.131348,0.131496,0.134551,0.139989,0.54064,0.141824,0.132245,0.137366,0.157434], + "G":[0.290505,0.285286,0.276869,0.272243,0.27844,0.272174,0.282565,0.297718,0.58827,0.174806,0.16106,0.168924,0.583945,0.173419,0.174542,0.590603,0.583827,0.172221,0.159932,0.167123,0.577926,0.169892,0.180362,0.574324,0.172539,0.159606,0.1588,0.164682,0.173275,0.174395,0.572784,0.17539,0.173341,0.175547,0.57753,0.184419,0.604527,0.166027], + "N":[0.01469,0.014708,0.014827,0.014934,0.015115,0.015185,0.015268,0.015333,0.015395,0.015416,0.015407,0.015408,0.015437,0.015431,0.015424,0.015408,0.015409,0.015434,0.015396,0.015415,0.01538,0.015409,0.015426,0.015446,0.015466,0.015483,0.015453,0.015469,0.015484,0.015512,0.015557,0.015569,0.015576,0.015546,0.015546,0.000932911,0.000975171,0.00120431], + "GC":[0.476249,0.535089,0.545412,0.542237,0.551935,0.558124,0.524196,0.517196,0.723952,0.30405,0.301223,0.723275,0.721389,0.311057,0.727463,0.735864,0.722581,0.315758,0.716139,0.303934,0.710991,0.299756,0.307748,0.707081,0.300401,0.306732,0.709359,0.3127,0.714011,0.305743,0.70428,0.309941,0.31333,0.716187,0.719354,0.316664,0.741893,0.323461] + }, + "kmer_count": { + "AAAAA":144484, "AAAAT":41966, "AAAAC":29408, "AAAAG":30579, "AAATA":34195, "AAATT":26346, "AAATC":20102, "AAATG":25950, "AAACA":28214, "AAACT":23063, "AAACC":18676, "AAACG":11668, "AAAGA":36542, "AAAGT":23736, "AAAGC":19180, 
"AAAGG":26112, + "AATAA":29042, "AATAT":19418, "AATAC":13874, "AATAG":15272, "AATTA":19776, "AATTT":22841, "AATTC":43828, "AATTG":15119, "AATCA":15966, "AATCT":16819, "AATCC":16341, "AATCG":8257, "AATGA":25097, "AATGT":16807, "AATGC":14720, "AATGG":21075, + "AACAA":28818, "AACAT":19558, "AACAC":21331, "AACAG":18072, "AACTA":17082, "AACTT":17610, "AACTC":478972, "AACTG":19626, "AACCA":20754, "AACCT":20108, "AACCC":17788, "AACCG":7868, "AACGA":19520, "AACGT":455928, "AACGC":8679, "AACGG":10109, + "AAGAA":34505, "AAGAT":56306, "AAGAC":14603, "AAGAG":20627, "AAGTA":14735, "AAGTT":14634, "AAGTC":12450, "AAGTG":24529, "AAGCA":20377, "AAGCT":18340, "AAGCC":18960, "AAGCG":15372, "AAGGA":32982, "AAGGT":16410, "AAGGC":21495, "AAGGG":25562, + "ATAAA":29887, "ATAAT":16564, "ATAAC":13585, "ATAAG":13298, "ATATA":16168, "ATATT":16758, "ATATC":9312, "ATATG":13262, "ATACA":13990, "ATACT":11327, "ATACC":9943, "ATACG":4845, "ATAGA":19789, "ATAGT":12867, "ATAGC":10801, "ATAGG":13052, + "ATTAA":17682, "ATTAT":15122, "ATTAC":12235, "ATTAG":13620, "ATTTA":18097, "ATTTT":31976, "ATTTC":15787, "ATTTG":18503, "ATTCA":14109, "ATTCT":19452, "ATTCC":42087, "ATTCG":5973, "ATTGA":18343, "ATTGT":12522, "ATTGC":12604, "ATTGG":14767, + "ATCAA":13574, "ATCAT":13972, "ATCAC":13852, "ATCAG":13163, "ATCTA":10128, "ATCTT":15752, "ATCTC":20782, "ATCTG":18000, "ATCCA":13674, "ATCCT":16667, "ATCCC":15830, "ATCCG":6392, "ATCGA":15635, "ATCGT":542380, "ATCGC":7831, "ATCGG":8802, + "ATGAA":20171, "ATGAT":43413, "ATGAC":8922, "ATGAG":14408, "ATGTA":12627, "ATGTT":15273, "ATGTC":9768, "ATGTG":17801, "ATGCA":12654, "ATGCT":14772, "ATGCC":16021, "ATGCG":10654, "ATGGA":25691, "ATGGT":16030, "ATGGC":20975, "ATGGG":22150, + "ACAAA":29661, "ACAAT":14955, "ACAAC":17578, "ACAAG":17948, "ACATA":13881, "ACATT":15026, "ACATC":12294, "ACATG":18400, "ACACA":23204, "ACACT":18952, "ACACC":15764, "ACACG":10563, "ACAGA":26210, "ACAGT":18081, "ACAGC":17191, "ACAGG":22974, + "ACTAA":14398, "ACTAT":13566, "ACTAC":11620, 
"ACTAG":13307, "ACTTA":12049, "ACTTT":19765, "ACTTC":13895, "ACTTG":19480, "ACTCA":20479, "ACTCT":474997, "ACTCC":18489, "ACTCG":12108, "ACTGA":31461, "ACTGT":501830, "ACTGC":19241, "ACTGG":21126, + "ACCAA":17631, "ACCAT":16658, "ACCAC":24372, "ACCAG":20007, "ACCTA":11027, "ACCTT":14985, "ACCTC":22339, "ACCTG":26926, "ACCCA":20332, "ACCCT":18523, "ACCCC":21183, "ACCCG":33538, "ACCGA":11391, "ACCGT":5875, "ACCGC":11227, "ACCGG":10457, + "ACGAA":6533, "ACGAT":32428, "ACGAC":6008, "ACGAG":11228, "ACGTA":5034, "ACGTT":5024, "ACGTC":8464, "ACGTG":454222, "ACGCA":8560, "ACGCT":7954, "ACGCC":12704, "ACGCG":11123, "ACGGA":17532, "ACGGT":6356, "ACGGC":9578, "ACGGG":15203, + "AGAAA":34811, "AGAAT":47405, "AGAAC":496183, "AGAAG":26114, "AGATA":13075, "AGATT":13456, "AGATC":142833, "AGATG":20030, "AGACA":17801, "AGACT":14556, "AGACC":16518, "AGACG":10848, "AGAGA":28114, "AGAGT":15215, "AGAGC":21363, "AGAGG":31025, + "AGTAA":13148, "AGTAT":10798, "AGTAC":9212, "AGTAG":21211, "AGTTA":10454, "AGTTT":17371, "AGTTC":12673, "AGTTG":15031, "AGTCA":12538, "AGTCT":15214, "AGTCC":13646, "AGTCG":10460, "AGTGA":23819, "AGTGT":17235, "AGTGC":19604, "AGTGG":27459, + "AGCAA":18677, "AGCAT":15375, "AGCAC":18872, "AGCAG":28416, "AGCTA":15518, "AGCTT":15962, "AGCTC":19558, "AGCTG":35001, "AGCCA":27263, "AGCCT":28649, "AGCCC":25098, "AGCCG":19834, "AGCGA":19442, "AGCGT":8863, "AGCGC":18993, "AGCGG":25644, + "AGGAA":27049, "AGGAT":53354, "AGGAC":17886, "AGGAG":35210, "AGGTA":13563, "AGGTT":15994, "AGGTC":14255, "AGGTG":31386, "AGGCA":27981, "AGGCT":32224, "AGGCC":27173, "AGGCG":27600, "AGGGA":34585, "AGGGT":19220, "AGGGC":27068, "AGGGG":33235, + "TAAAA":35765, "TAAAT":19978, "TAAAC":14741, "TAAAG":17503, "TAATA":14160, "TAATT":16385, "TAATC":12247, "TAATG":13524, "TAACA":13962, "TAACT":12178, "TAACC":11011, "TAACG":6945, "TAAGA":21010, "TAAGT":10696, "TAAGC":11047, "TAAGG":14546, + "TATAA":15923, "TATAT":14329, "TATAC":8229, "TATAG":11206, "TATTA":13082, "TATTT":23598, "TATTC":10683, 
"TATTG":12148, "TATCA":9550, "TATCT":11282, "TATCC":8123, "TATCG":5489, "TATGA":17899, "TATGT":11193, "TATGC":8471, "TATGG":11673, + "TACAA":15459, "TACAT":10976, "TACAC":9495, "TACAG":14922, "TACTA":10237, "TACTT":13192, "TACTC":10826, "TACTG":12531, "TACCA":13386, "TACCT":12619, "TACCC":10580, "TACCG":5625, "TACGA":8716, "TACGT":3521, "TACGC":4513, "TACGG":5276, + "TAGAA":486704, "TAGAT":50743, "TAGAC":9056, "TAGAG":15013, "TAGTA":10696, "TAGTT":10449, "TAGTC":10523, "TAGTG":13167, "TAGCA":12903, "TAGCT":16527, "TAGCC":13996, "TAGCG":10166, "TAGGA":24174, "TAGGT":11336, "TAGGC":12723, "TAGGG":16871, + "TTAAA":24791, "TTAAT":14839, "TTAAC":10950, "TTAAG":14884, "TTATA":12889, "TTATT":20219, "TTATC":9461, "TTATG":12478, "TTACA":14145, "TTACT":12139, "TTACC":9976, "TTACG":4560, "TTAGA":18793, "TTAGT":11845, "TTAGC":12834, "TTAGG":14132, + "TTTAA":22090, "TTTAT":20901, "TTTAC":11268, "TTTAG":16069, "TTTTA":25993, "TTTTT":60972, "TTTTC":24680, "TTTTG":29302, "TTTCA":18908, "TTTCT":28255, "TTTCC":19687, "TTTCG":7174, "TTTGA":25543, "TTTGT":23016, "TTTGC":16130, "TTTGG":24354, + "TTCAA":15176, "TTCAT":14683, "TTCAC":13714, "TTCAG":17777, "TTCTA":14080, "TTCTT":23680, "TTCTC":21945, "TTCTG":23295, "TTCCA":45751, "TTCCT":22879, "TTCCC":20434, "TTCCG":9444, "TTCGA":8778, "TTCGT":6114, "TTCGC":6978, "TTCGG":9940, + "TTGAA":18355, "TTGAT":45462, "TTGAC":8681, "TTGAG":19065, "TTGTA":12727, "TTGTT":19727, "TTGTC":11709, "TTGTG":18936, "TTGCA":14867, "TTGCT":19533, "TTGCC":18106, "TTGCG":9890, "TTGGA":25599, "TTGGT":16045, "TTGGC":26079, "TTGGG":29382, + "TCAAA":18353, "TCAAT":9679, "TCAAC":12756, "TCAAG":17624, "TCATA":10834, "TCATT":14871, "TCATC":13295, "TCATG":15829, "TCACA":15955, "TCACT":18164, "TCACC":17458, "TCACG":9679, "TCAGA":23524, "TCAGT":14863, "TCAGC":18975, "TCAGG":22473, + "TCTAA":12056, "TCTAT":9173, "TCTAC":12029, "TCTAG":13691, "TCTTA":13304, "TCTTT":22567, "TCTTC":17909, "TCTTG":21402, "TCTCA":20348, "TCTCT":24309, "TCTCC":23147, "TCTCG":16985, 
"TCTGA":475138, "TCTGT":20211, "TCTGC":19048, "TCTGG":26603, + "TCCAA":22689, "TCCAT":19579, "TCCAC":22221, "TCCAG":30271, "TCCTA":12565, "TCCTT":17927, "TCCTC":23859, "TCCTG":31538, "TCCCA":25760, "TCCCT":22421, "TCCCC":24363, "TCCCG":18320, "TCCGA":9469, "TCCGT":6532, "TCCGC":12919, "TCCGG":14376, + "TCGAA":5924, "TCGAT":31835, "TCGAC":7635, "TCGAG":6851, "TCGTA":6275, "TCGTT":6663, "TCGTC":537241, "TCGTG":9662, "TCGCA":7552, "TCGCT":12966, "TCGCC":14597, "TCGCG":11833, "TCGGA":525350, "TCGGT":7960, "TCGGC":14622, "TCGGG":19774, + "TGAAA":24321, "TGAAT":14532, "TGAAC":469207, "TGAAG":24172, "TGATA":10338, "TGATT":13043, "TGATC":156719, "TGATG":14406, "TGACA":12265, "TGACT":12871, "TGACC":12689, "TGACG":6558, "TGAGA":25142, "TGAGT":15122, "TGAGC":21379, "TGAGG":29725, + "TGTAA":15457, "TGTAT":13675, "TGTAC":9233, "TGTAG":492200, "TGTTA":11708, "TGTTT":22400, "TGTTC":12383, "TGTTG":21040, "TGTCA":13672, "TGTCT":17210, "TGTCC":13547, "TGTCG":9491, "TGTGA":25390, "TGTGT":22935, "TGTGC":17483, "TGTGG":29891, + "TGCAA":15014, "TGCAT":13369, "TGCAC":16269, "TGCAG":26314, "TGCTA":11770, "TGCTT":19227, "TGCTC":17846, "TGCTG":33514, "TGCCA":21421, "TGCCT":29200, "TGCCC":25386, "TGCCG":14064, "TGCGA":16927, "TGCGT":7591, "TGCGC":16293, "TGCGG":19505, + "TGGAA":21938, "TGGAT":58712, "TGGAC":13925, "TGGAG":32296, "TGGTA":12989, "TGGTT":16961, "TGGTC":14221, "TGGTG":33546, "TGGCA":32232, "TGGCT":28035, "TGGCC":27238, "TGGCG":25986, "TGGGA":47654, "TGGGT":23316, "TGGGC":34607, "TGGGG":51519, + "CAAAA":30778, "CAAAT":18999, "CAAAC":16397, "CAAAG":24929, "CAATA":12939, "CAATT":11902, "CAATC":9848, "CAATG":17016, "CAACA":19853, "CAACT":15200, "CAACC":15639, "CAACG":11713, "CAAGA":32613, "CAAGT":14317, "CAAGC":17737, "CAAGG":22974, + "CATAA":13804, "CATAT":11895, "CATAC":9191, "CATAG":13990, "CATTA":11124, "CATTT":20116, "CATTC":12744, "CATTG":16368, "CATCA":13796, "CATCT":16488, "CATCC":13844, "CATCG":11363, "CATGA":25256, "CATGT":15346, "CATGC":16680, "CATGG":25409, + 
"CACAA":19972, "CACAT":16497, "CACAC":22796, "CACAG":24247, "CACTA":13222, "CACTT":18160, "CACTC":18355, "CACTG":32387, "CACCA":26053, "CACCT":24879, "CACCC":43667, "CACCG":14124, "CACGA":17513, "CACGT":7582, "CACGC":14114, "CACGG":14190, + "CAGAA":27364, "CAGAT":44774, "CAGAC":14725, "CAGAG":26086, "CAGTA":15799, "CAGTT":14343, "CAGTC":12840, "CAGTG":25398, "CAGCA":24473, "CAGCT":25164, "CAGCC":32868, "CAGCG":20972, "CAGGA":38895, "CAGGT":21356, "CAGGC":33821, "CAGGG":32654, + "CTAAA":16867, "CTAAT":11421, "CTAAC":10215, "CTAAG":14513, "CTATA":9905, "CTATT":10352, "CTATC":8389, "CTATG":12157, "CTACA":11399, "CTACT":13513, "CTACC":13186, "CTACG":6518, "CTAGA":25801, "CTAGT":8946, "CTAGC":12681, "CTAGG":17817, + "CTTAA":13732, "CTTAT":9928, "CTTAC":9538, "CTTAG":14233, "CTTTA":13262, "CTTTT":23679, "CTTTC":17284, "CTTTG":23566, "CTTCA":14519, "CTTCT":21017, "CTTCC":22333, "CTTCG":10417, "CTTGA":28851, "CTTGT":14784, "CTTGC":16771, "CTTGG":31941, + "CTCAA":19159, "CTCAT":15460, "CTCAC":21267, "CTCAG":30998, "CTCTA":14618, "CTCTT":20935, "CTCTC":24583, "CTCTG":485513, "CTCCA":21979, "CTCCT":31252, "CTCCC":36017, "CTCCG":16441, "CTCGA":18862, "CTCGT":8477, "CTCGC":17156, "CTCGG":23080, + "CTGAA":474969, "CTGAT":63544, "CTGAC":14219, "CTGAG":30506, "CTGTA":485196, "CTGTT":16720, "CTGTC":18520, "CTGTG":30020, "CTGCA":23205, "CTGCT":26450, "CTGCC":33331, "CTGCG":22352, "CTGGA":42266, "CTGGT":20479, "CTGGC":28242, "CTGGG":56393, + "CCAAA":23564, "CCAAT":13976, "CCAAC":16729, "CCAAG":28028, "CCATA":12688, "CCATT":16964, "CCATC":17603, "CCATG":26951, "CCACA":22590, "CCACT":24119, "CCACC":35529, "CCACG":16219, "CCAGA":33512, "CCAGT":16617, "CCAGC":35182, "CCAGG":44487, + "CCTAA":12604, "CCTAT":9065, "CCTAC":10059, "CCTAG":18930, "CCTTA":10949, "CCTTT":18851, "CCTTC":18265, "CCTTG":28086, "CCTCA":23627, "CCTCT":25981, "CCTCC":39791, "CCTCG":22434, "CCTGA":40004, "CCTGT":23234, "CCTGC":29170, "CCTGG":45845, + "CCCAA":21475, "CCCAT":17419, "CCCAC":24631, "CCCAG":43081, 
"CCCTA":13021, "CCCTT":20057, "CCCTC":27609, "CCCTG":37908, "CCCCA":29578, "CCCCT":27947, "CCCCC":33446, "CCCCG":26916, "CCCGA":40594, "CCCGT":9701, "CCCGC":25536, "CCCGG":30954, + "CCGAA":8112, "CCGAT":27287, "CCGAC":8455, "CCGAG":39957, "CCGTA":4284, "CCGTT":5544, "CCGTC":9547, "CCGTG":12875, "CCGCA":11481, "CCGCT":14521, "CCGCC":31892, "CCGCG":25505, "CCGGA":19515, "CCGGT":9665, "CCGGC":23388, "CCGGG":36482, + "CGAAA":6599, "CGAAT":5102, "CGAAC":7480, "CGAAG":9217, "CGATA":3224, "CGATT":3987, "CGATC":125904, "CGATG":6083, "CGACA":5475, "CGACT":6628, "CGACC":7864, "CGACG":10368, "CGAGA":39391, "CGAGT":10070, "CGAGC":12549, "CGAGG":17717, + "CGTAA":3963, "CGTAT":3720, "CGTAC":4630, "CGTAG":8584, "CGTTA":3478, "CGTTT":5739, "CGTTC":6542, "CGTTG":8596, "CGTCA":5439, "CGTCT":8598, "CGTCC":10725, "CGTCG":536775, "CGTGA":13893, "CGTGT":441413, "CGTGC":11462, "CGTGG":19487, + "CGCAA":7677, "CGCAT":6306, "CGCAC":13616, "CGCAG":17264, "CGCTA":6760, "CGCTT":10521, "CGCTC":14783, "CGCTG":24804, "CGCCA":16927, "CGCCT":21105, "CGCCC":27555, "CGCCG":28845, "CGCGA":17133, "CGCGT":8505, "CGCGC":26433, "CGCGG":34746, + "CGGAA":11680, "CGGAT":36972, "CGGAC":524782, "CGGAG":22495, "CGGTA":5664, "CGGTT":6823, "CGGTC":9780, "CGGTG":18619, "CGGCA":12751, "CGGCT":18403, "CGGCC":27257, "CGGCG":35972, "CGGGA":32334, "CGGGT":16928, "CGGGC":34426, "CGGGG":42861, + "GAAAA":28339, "GAAAT":22224, "GAAAC":17802, "GAAAG":26091, "GAATA":12092, "GAATT":42685, "GAATC":12536, "GAATG":18029, "GAACA":20902, "GAACT":482442, "GAACC":17777, "GAACG":464115, "GAAGA":31870, "GAAGT":15608, "GAAGC":20720, "GAAGG":28769, + "GATAA":11243, "GATAT":8100, "GATAC":6900, "GATAG":12579, "GATTA":11030, "GATTT":13539, "GATTC":10523, "GATTG":11687, "GATCA":12332, "GATCT":20365, "GATCC":12448, "GATCG":551311, "GATGA":16077, "GATGT":10792, "GATGC":11066, "GATGG":22922, + "GACAA":11935, "GACAT":10538, "GACAC":11824, "GACAG":23459, "GACTA":9618, "GACTT":12878, "GACTC":16779, "GACTG":512459, "GACCA":13961, "GACCT":14418, 
"GACCC":16237, "GACCG":9650, "GACGA":9157, "GACGT":6081, "GACGC":10169, "GACGG":17273, + "GAGAA":51564, "GAGAT":34097, "GAGAC":19581, "GAGAG":28918, "GAGTA":10263, "GAGTT":14000, "GAGTC":14195, "GAGTG":21729, "GAGCA":17611, "GAGCT":21426, "GAGCC":29376, "GAGCG":21901, "GAGGA":33883, "GAGGT":23792, "GAGGC":41912, "GAGGG":35618, + "GTAAA":13870, "GTAAT":12639, "GTAAC":8367, "GTAAG":12070, "GTATA":9086, "GTATT":11476, "GTATC":6684, "GTATG":10288, "GTACA":9535, "GTACT":8800, "GTACC":7805, "GTACG":5421, "GTAGA":499569, "GTAGT":10927, "GTAGC":14928, "GTAGG":18502, + "GTTAA":9963, "GTTAT":8381, "GTTAC":7267, "GTTAG":11241, "GTTTA":10542, "GTTTT":21786, "GTTTC":15355, "GTTTG":16413, "GTTCA":11950, "GTTCT":13584, "GTTCC":12937, "GTTCG":7459, "GTTGA":16891, "GTTGT":12445, "GTTGC":14921, "GTTGG":21693, + "GTCAA":8590, "GTCAT":10032, "GTCAC":11620, "GTCAG":15974, "GTCTA":6879, "GTCTT":14125, "GTCTC":19044, "GTCTG":17605, "GTCCA":10073, "GTCCT":14016, "GTCCC":17661, "GTCCG":10873, "GTCGA":8143, "GTCGT":7433, "GTCGC":13387, "GTCGG":540040, + "GTGAA":16778, "GTGAT":42240, "GTGAC":12400, "GTGAG":24846, "GTGTA":18142, "GTGTT":15422, "GTGTC":13707, "GTGTG":27617, "GTGCA":16964, "GTGCT":19439, "GTGCC":19810, "GTGCG":15220, "GTGGA":31188, "GTGGT":23955, "GTGGC":31938, "GTGGG":45251, + "GCAAA":16852, "GCAAT":12398, "GCAAC":14332, "GCAAG":21205, "GCATA":9940, "GCATT":12753, "GCATC":11716, "GCATG":21074, "GCACA":18789, "GCACT":19802, "GCACC":37336, "GCACG":15962, "GCAGA":28589, "GCAGT":19226, "GCAGC":31695, "GCAGG":37819, + "GCTAA":12663, "GCTAT":8957, "GCTAC":11088, "GCTAG":17326, "GCTTA":10006, "GCTTT":16609, "GCTTC":17957, "GCTTG":22702, "GCTCA":20075, "GCTCT":21409, "GCTCC":23657, "GCTCG":18296, "GCTGA":36860, "GCTGT":24422, "GCTGC":37634, "GCTGG":53769, + "GCCAA":18324, "GCCAT":19780, "GCCAC":25653, "GCCAG":34002, "GCCTA":12605, "GCCTT":19693, "GCCTC":37066, "GCCTG":41948, "GCCCA":26858, "GCCCT":27215, "GCCCC":35640, "GCCCG":26600, "GCCGA":20034, "GCCGT":9793, "GCCGC":32741, 
"GCCGG":33450, + "GCGAA":7034, "GCGAT":48678, "GCGAC":8299, "GCGAG":18556, "GCGTA":4535, "GCGTT":6984, "GCGTC":10868, "GCGTG":20100, "GCGCA":14680, "GCGCT":19659, "GCGCC":31402, "GCGCG":36411, "GCGGA":32288, "GCGGT":16489, "GCGGC":45598, "GCGGG":53076, + "GGAAA":25299, "GGAAT":17488, "GGAAC":15925, "GGAAG":32817, "GGATA":10515, "GGATT":15431, "GGATC":171012, "GGATG":19170, "GGACA":19682, "GGACT":522209, "GGACC":16086, "GGACG":14159, "GGAGA":37217, "GGAGT":22386, "GGAGC":33141, "GGAGG":53899, + "GGTAA":12198, "GGTAT":9070, "GGTAC":7691, "GGTAG":18427, "GGTTA":9300, "GGTTT":17854, "GGTTC":14222, "GGTTG":19222, "GGTCA":12380, "GGTCT":15849, "GGTCC":14689, "GGTCG":15766, "GGTGA":30707, "GGTGT":20157, "GGTGC":21795, "GGTGG":49925, + "GGCAA":19231, "GGCAT":17746, "GGCAC":36519, "GGCAG":40897, "GGCTA":12960, "GGCTT":18733, "GGCTC":28693, "GGCTG":56521, "GGCCA":28442, "GGCCT":27986, "GGCCC":34082, "GGCCG":32329, "GGCGA":27323, "GGCGT":15620, "GGCGC":35073, "GGCGG":64595, + "GGGAA":28548, "GGGAT":67168, "GGGAC":21828, "GGGAG":53607, "GGGTA":12539, "GGGTT":20153, "GGGTC":20951, "GGGTG":38420, "GGGCA":29542, "GGGCT":33928, "GGGCC":37599, "GGGCG":49766, "GGGGA":52154, "GGGGT":31489, "GGGGC":49488, "GGGGG":530960 + }, + "overrepresented_sequences": { + "AAAAAAAAAAAAAAAAAAAA":78, + "AAAGATCGTCGGACTGTAGA":233, + "AACGATCGTCGGACTGTAGA":304, + "AAGGATCGTCGGACTGTAGA":292, + "AATGATCGTCGGACTGTAGA":322, + "ACAGATCGTCGGACTGTAGA":183, + "ACCGATCGTCGGACTGTAGA":203, + "ACCTGATCGTCGGACTGTAG":144, + "ACGGATCGTCGGACTGTAGA":198, + "ACTGATCGTCGGACTGTAGA":162, + "ACTGTAGAACTCTGAACGTG":18, + "AGATCGTCGGACTGTAGAAC":38, + "AGCGATCGTCGGACTGTAGA":427, + "AGCTGATCGTCGGACTGTAG":152, + "AGGGATCGTCGGACTGTAGA":291, + "AGTGATCGTCGGACTGTAGA":232, + "ATAGATCGTCGGACTGTAGA":294, + "ATCGATCGTCGGACTGTAGA":188, + "ATCGTCGGACTGTAGAACTC":36, + "ATGATCGTCGGACTGTAGAA":17, + "ATGGATCGTCGGACTGTAGA":204, + "ATTGATCGTCGGACTGTAGA":306, + "CAAAGATCGTCGGACTGTAG":136, + "CAAGATCGTCGGACTGTAGA":191, + 
"CAAGGATCGTCGGACTGTAG":138, + "CACGATCGTCGGACTGTAGA":248, + "CACTGATCGTCGGACTGTAG":182, + "CAGGATCGTCGGACTGTAGA":105, + "CATCGATCGTCGGACTGTAG":120, + "CATGATCGTCGGACTGTAGA":191, + "CATGGATCGTCGGACTGTAG":132, + "CCAAGATCGTCGGACTGTAG":225, + "CCAGATCGTCGGACTGTAGA":137, + "CCAGGATCGTCGGACTGTAG":183, + "CCATGATCGTCGGACTGTAG":187, + "CCCAGATCGTCGGACTGTAG":212, + "CCCCTGATCGTCGGACTGTA":138, + "CCCGATCGTCGGACTGTAGA":334, + "CCCGGATCGTCGGACTGTAG":115, + "CCCTGATCGTCGGACTGTAG":251, + "CCGATCGTCGGACTGTAGAA":92, + "CCGGATCGTCGGACTGTAGA":108, + "CCTAGATCGTCGGACTGTAG":178, + "CCTCGATCGTCGGACTGTAG":171, + "CCTGGATCGTCGGACTGTAG":256, + "CCTTGATCGTCGGACTGTAG":195, + "CGAGATCGTCGGACTGTAGA":136, + "CGATCGTCGGACTGTAGAAC":26, + "CGCGATCGTCGGACTGTAGA":212, + "CGCGGATCGTCGGACTGTAG":115, + "CGCTGATCGTCGGACTGTAG":134, + "CGGACTGTAGAACTCTGAAC":77, + "CGGGATCGTCGGACTGTAGA":243, + "CGGGGATCGTCGGACTGTAG":125, + "CGTCGGACTGTAGAACTCTG":68, + "CGTGATCGTCGGACTGTAGA":250, + "CTAGATCGTCGGACTGTAGA":168, + "CTAGGATCGTCGGACTGTAG":113, + "CTCGATCGTCGGACTGTAGA":322, + "CTCTGATCGTCGGACTGTAG":144, + "CTGCGATCGTCGGACTGTAG":124, + "CTGGATCGTCGGACTGTAGA":162, + "CTGGGATCGTCGGACTGTAG":227, + "CTGTGATCGTCGGACTGTAG":134, + "CTTGATCGTCGGACTGTAGA":205, + "CTTGGCACCCGAGAATTCCA":146, + "CTTTGATCGTCGGACTGTAG":123, + "GAAGATCGTCGGACTGTAGA":264, + "GACGATCGTCGGACTGTAGA":186, + "GACTGTAGAACTCTGAACGT":20, + "GAGATCGTCGGACTGTAGAA":95, + "GAGGATCGTCGGACTGTAGA":116, + "GATCGTCGGACTGTAGAACT":140, + "GATCGTCGGACTGTAGAACTCTGAACGTGTAGATCT":247, + "GATGATCGTCGGACTGTAGA":189, + "GCAAGATCGTCGGACTGTAG":157, + "GCACGATCGTCGGACTGTAG":149, + "GCAGATCGTCGGACTGTAGA":194, + "GCAGGATCGTCGGACTGTAG":165, + "GCATGATCGTCGGACTGTAG":149, + "GCCAGATCGTCGGACTGTAG":195, + "GCCGATCGTCGGACTGTAGA":255, + "GCCTGATCGTCGGACTGTAG":198, + "GCGCGATCGTCGGACTGTAG":167, + "GCGGATCGTCGGACTGTAGA":214, + "GCGGGATCGTCGGACTGTAG":231, + "GCTAGATCGTCGGACTGTAG":148, + "GCTGGATCGTCGGACTGTAG":227, + "GCTTGATCGTCGGACTGTAG":142, + "GGACTGTAGAACTCTGAACG":77, + 
"GGAGATCGTCGGACTGTAGA":161, + "GGAGGATCGTCGGACTGTAG":136, + "GGATCGTCGGACTGTAGAAC":19, + "GGCAGATCGTCGGACTGTAG":150, + "GGCGATCGTCGGACTGTAGA":267, + "GGCGGATCGTCGGACTGTAG":210, + "GGCTGATCGTCGGACTGTAG":193, + "GGGAGATCGTCGGACTGTAG":164, + "GGGCGATCGTCGGACTGTAG":241, + "GGGGATCGTCGGACTGTAGA":111, + "GGGGGATCGTCGGACTGTAG":273, + "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG":607, + "GGTGATCGTCGGACTGTAGA":358, + "GGTGGATCGTCGGACTGTAG":163, + "GTAGATCGTCGGACTGTAGA":294, + "GTCGATCGTCGGACTGTAGA":151, + "GTCGGACTGTAGAACTCTGA":90, + "GTGGATCGTCGGACTGTAGA":224, + "GTGGGATCGTCGGACTGTAG":135, + "GTTGATCGTCGGACTGTAGA":232, + "NNNNNNNNNNNNNNNNNNNN":829, + "TAAGATCGTCGGACTGTAGA":318, + "TACGATCGTCGGACTGTAGA":217, + "TAGATCGTCGGACTGTAGAA":12, + "TAGGATCGTCGGACTGTAGA":222, + "TATGATCGTCGGACTGTAGA":234, + "TCAGATCGTCGGACTGTAGA":147, + "TCCTGATCGTCGGACTGTAG":146, + "TCGGACTGTAGAACTCTGAA":85, + "TCGGATCGTCGGACTGTAGA":211, + "TCGTCGGACTGTAGAACTCT":51, + "TCTGATCGTCGGACTGTAGA":171, + "TGAGATCGTCGGACTGTAGA":201, + "TGATCGTCGGACTGTAGAAC":30, + "TGCGATCGTCGGACTGTAGA":264, + "TGCTGATCGTCGGACTGTAG":149, + "TGGGATCGTCGGACTGTAGA":195, + "TGGGGATCGTCGGACTGTAG":181, + "TGTGATCGTCGGACTGTAGA":216, + "TTAGATCGTCGGACTGTAGA":258, + "TTCGATCGTCGGACTGTAGA":189, + "TTGGATCGTCGGACTGTAGA":318, + "TTTGATCGTCGGACTGTAGA":146 } + }, + "read1_after_filtering": { + "total_reads": 473502, + "total_bases": 16881839, + "q20_bases": 16178706, + "q30_bases": 15892953, + "total_cycles": 38, + "quality_curves": { + "A":[30.6713,30.569,30.9089,30.913,30.9673,34.4454,34.3849,34.3322,34.7057,33.6297,33.5612,33.597,34.5249,33.0673,33.591,34.9484,34.8865,33.9735,33.8815,33.1075,34.6402,33.2496,32.4667,34.6565,33.084,33.7631,33.4267,33.0139,32.9276,32.7339,34.3503,32.8285,33.5863,33.5355,34.4883,32.7721,34.1664,34.5141], + 
"T":[31.31,31.1969,31.3023,31.3223,31.338,35.0047,35.0168,35.0214,35.2614,35.2646,34.5183,35.0285,35.1898,34.3468,34.9167,35.2779,35.2738,35.2752,35.1698,34.3305,35.203,34.2805,35.1388,35.2424,35.1874,35.183,35.0639,34.2394,34.7854,33.9934,35.1507,35.2173,35.1709,35.0959,35.1152,34.2133,35.1085,33.9155], + "C":[31.3285,31.2611,31.3642,31.3644,31.3905,35.0607,35.0708,35.107,35.3615,35.2602,35.2521,34.4025,35.3071,35.1436,34.1441,35.2865,35.3554,35.3445,34.5179,35.0765,35.332,35.1131,35.2514,35.3063,35.1792,35.2334,34.4872,34.9965,33.8475,34.8599,35.2382,35.157,35.2116,34.5276,35.1893,35.0442,35.2739,34.8959], + "G":[31.0278,30.9267,31.1261,31.1742,31.1564,34.7749,34.7778,34.766,34.3075,34.8652,35.0166,35.0326,34.1252,34.9221,34.9676,34.3339,34.3618,34.9276,35.1183,34.9084,34.2166,34.8925,34.7809,34.3116,34.8699,34.882,35.012,34.8599,34.7999,34.6789,33.9708,34.788,34.82,34.9695,34.027,34.7688,34.0838,34.7812], + "mean":[30.9994,30.978,31.1764,31.1972,31.2168,34.8289,34.8151,34.8032,34.8714,34.7243,34.5981,34.5248,34.7485,34.3973,34.4091,34.9334,34.9425,34.8716,34.6718,34.3892,34.8224,34.4161,34.4087,34.8554,34.5677,34.7538,34.4988,34.3057,34.105,34.1008,34.6506,34.4881,34.6932,34.5402,34.6847,34.2513,34.6906,34.5141] + }, + "content_curves": { + "A":[0.368039,0.263851,0.24372,0.235712,0.231946,0.230527,0.23293,0.243142,0.227186,0.26194,0.241021,0.236347,0.233264,0.231477,0.229463,0.225608,0.228546,0.250761,0.233627,0.23229,0.235824,0.236274,0.246287,0.237368,0.250838,0.253052,0.243299,0.239327,0.236056,0.236394,0.233299,0.248975,0.246062,0.235566,0.22024,0.223857,0.179453,0], + "T":[0.203505,0.255824,0.259811,0.257351,0.24406,0.236341,0.227009,0.234339,0.225095,0.223938,0.249583,0.222975,0.236799,0.258204,0.225125,0.226666,0.225194,0.222113,0.218612,0.247265,0.22763,0.235416,0.225858,0.225684,0.225908,0.225194,0.223146,0.235225,0.225901,0.236901,0.233142,0.227017,0.22854,0.230398,0.234023,0.248826,0.265889,0.349438], + 
"C":[0.151583,0.22907,0.241247,0.244299,0.256132,0.26473,0.254584,0.24289,0.2456,0.237642,0.240331,0.265855,0.235509,0.24104,0.280666,0.25953,0.257188,0.254992,0.280981,0.252884,0.252362,0.254375,0.249059,0.253404,0.241773,0.249389,0.263877,0.251336,0.260841,0.252284,0.250995,0.250514,0.253489,0.267365,0.260156,0.25354,0.269998,0.362529], + "G":[0.276867,0.251256,0.255222,0.262639,0.267862,0.268402,0.285477,0.279629,0.302115,0.276463,0.269065,0.27482,0.294417,0.269268,0.264744,0.288187,0.289067,0.272126,0.266778,0.267545,0.284182,0.273928,0.27879,0.283537,0.281459,0.272352,0.269659,0.274086,0.277164,0.274372,0.282503,0.273437,0.271825,0.266595,0.285551,0.273712,0.284603,0.287846], + "N":[6.33577e-06,0,0,0,0,0,0,0,4.22385e-06,1.68954e-05,0,2.11192e-06,1.05596e-05,1.05596e-05,2.11192e-06,8.44769e-06,4.22385e-06,8.44769e-06,2.12327e-06,1.49732e-05,2.1574e-06,6.52707e-06,6.59644e-06,6.65668e-06,2.24391e-05,1.36513e-05,1.84418e-05,2.57565e-05,3.82753e-05,4.85115e-05,6.13363e-05,5.7072e-05,8.2911e-05,7.59955e-05,3.03982e-05,6.47944e-05,5.57543e-05,0.000187018], + "GC":[0.42845,0.480325,0.496469,0.506938,0.523994,0.533132,0.540061,0.522519,0.547715,0.514106,0.509396,0.540676,0.529926,0.510308,0.545411,0.547717,0.546255,0.527117,0.547759,0.52043,0.536544,0.528304,0.527849,0.536941,0.523231,0.521741,0.533537,0.525423,0.538005,0.526656,0.533498,0.52395,0.525314,0.53396,0.545706,0.527253,0.554602,0.650375] + }, + "kmer_count": { + "AAAAA":67086, "AAAAT":31720, "AAAAC":19968, "AAAAG":22128, "AAATA":24467, "AAATT":21861, "AAATC":14241, "AAATG":20080, "AAACA":20003, "AAACT":17595, "AAACC":14698, "AAACG":6356, "AAAGA":21280, "AAAGT":18326, "AAAGC":15147, "AAAGG":19108, + "AATAA":19581, "AATAT":14838, "AATAC":10932, "AATAG":10622, "AATTA":15620, "AATTT":19950, "AATTC":40099, "AATTG":11046, "AATCA":12191, "AATCT":12752, "AATCC":14258, "AATCG":4360, "AATGA":14363, "AATGT":13258, "AATGC":11666, "AATGG":15263, + "AACAA":18712, "AACAT":14857, "AACAC":13181, "AACAG":14378, 
"AACTA":11393, "AACTT":13849, "AACTC":18598, "AACTG":13878, "AACCA":15965, "AACCT":15317, "AACCC":13902, "AACCG":4990, "AACGA":6130, "AACGT":9429, "AACGC":5820, "AACGG":5814, + "AAGAA":23853, "AAGAT":15226, "AAGAC":12477, "AAGAG":18055, "AAGTA":11693, "AAGTT":13038, "AAGTC":10943, "AAGTG":18688, "AAGCA":16034, "AAGCT":14363, "AAGCC":15250, "AAGCG":9069, "AAGGA":19833, "AAGGT":13548, "AAGGC":17071, "AAGGG":18338, + "ATAAA":20134, "ATAAT":12779, "ATAAC":9472, "ATAAG":9159, "ATATA":12510, "ATATT":14234, "ATATC":7073, "ATATG":9495, "ATACA":11273, "ATACT":9097, "ATACC":7745, "ATACG":2282, "ATAGA":9719, "ATAGT":9173, "ATAGC":7968, "ATAGG":8710, + "ATTAA":13695, "ATTAT":12774, "ATTAC":10206, "ATTAG":10129, "ATTTA":15749, "ATTTT":29559, "ATTTC":14439, "ATTTG":15042, "ATTCA":12326, "ATTCT":17982, "ATTCC":38165, "ATTCG":3623, "ATTGA":9989, "ATTGT":10645, "ATTGC":10073, "ATTGG":10398, + "ATCAA":9775, "ATCAT":10999, "ATCAC":11275, "ATCAG":10723, "ATCTA":7982, "ATCTT":13252, "ATCTC":13639, "ATCTG":12978, "ATCCA":11181, "ATCCT":13796, "ATCCC":13733, "ATCCG":4978, "ATCGA":3989, "ATCGT":13696, "ATCGC":5533, "ATCGG":5486, + "ATGAA":15105, "ATGAT":11664, "ATGAC":7599, "ATGAG":12352, "ATGTA":9634, "ATGTT":13887, "ATGTC":8811, "ATGTG":13915, "ATGCA":10092, "ATGCT":11497, "ATGCC":12951, "ATGCG":4998, "ATGGA":15163, "ATGGT":13435, "ATGGC":16735, "ATGGG":15332, + "ACAAA":20173, "ACAAT":10672, "ACAAC":11624, "ACAAG":12065, "ACATA":10093, "ACATT":12479, "ACATC":9306, "ACATG":13340, "ACACA":17245, "ACACT":12503, "ACACC":12162, "ACACG":5934, "ACAGA":17833, "ACAGT":13374, "ACAGC":14858, "ACAGG":19560, + "ACTAA":10387, "ACTAT":9373, "ACTAC":9760, "ACTAG":8748, "ACTTA":9543, "ACTTT":16940, "ACTTC":12091, "ACTTG":14509, "ACTCA":14006, "ACTCT":19301, "ACTCC":15798, "ACTCG":7354, "ACTGA":15660, "ACTGT":17869, "ACTGC":16272, "ACTGG":15325, + "ACCAA":12704, "ACCAT":13032, "ACCAC":19161, "ACCAG":14927, "ACCTA":8242, "ACCTT":11993, "ACCTC":17487, "ACCTG":19081, "ACCCA":15490, "ACCCT":13472, 
"ACCCC":15788, "ACCCG":29765, "ACCGA":5143, "ACCGT":4486, "ACCGC":9085, "ACCGG":7576, + "ACGAA":4347, "ACGAT":4168, "ACGAC":3959, "ACGAG":6534, "ACGTA":3130, "ACGTT":3667, "ACGTC":4367, "ACGTG":11300, "ACGCA":5880, "ACGCT":5369, "ACGCC":9726, "ACGCG":6966, "ACGGA":7339, "ACGGT":4840, "ACGGC":7844, "ACGGG":9356, + "AGAAA":26088, "AGAAT":43963, "AGAAC":22001, "AGAAG":21156, "AGATA":10332, "AGATT":12123, "AGATC":13562, "AGATG":17442, "AGACA":15551, "AGACT":12889, "AGACC":14919, "AGACG":8629, "AGAGA":23762, "AGAGT":14115, "AGAGC":19442, "AGAGG":27950, + "AGTAA":10898, "AGTAT":8960, "AGTAC":7673, "AGTAG":14337, "AGTTA":9177, "AGTTT":16216, "AGTTC":12091, "AGTTG":12608, "AGTCA":11296, "AGTCT":14145, "AGTCC":12900, "AGTCG":7163, "AGTGA":17435, "AGTGT":14198, "AGTGC":17744, "AGTGG":22209, + "AGCAA":14140, "AGCAT":11999, "AGCAC":14424, "AGCAG":23594, "AGCTA":12534, "AGCTT":13200, "AGCTC":16602, "AGCTG":27125, "AGCCA":22406, "AGCCT":23982, "AGCCC":21151, "AGCCG":16832, "AGCGA":8068, "AGCGT":6399, "AGCGC":14513, "AGCGG":18260, + "AGGAA":23109, "AGGAT":15781, "AGGAC":14033, "AGGAG":31550, "AGGTA":11011, "AGGTT":14275, "AGGTC":12903, "AGGTG":25466, "AGGCA":23097, "AGGCT":28196, "AGGCC":23746, "AGGCG":21038, "AGGGA":23922, "AGGGT":17083, "AGGGC":21939, "AGGGG":23428, + "TAAAA":24818, "TAAAT":16181, "TAAAC":10895, "TAAAG":12747, "TAATA":10720, "TAATT":13989, "TAATC":10151, "TAATG":9761, "TAACA":10517, "TAACT":9374, "TAACC":8739, "TAACG":3445, "TAAGA":11294, "TAAGT":9277, "TAAGC":8394, "TAAGG":10782, + "TATAA":11537, "TATAT":11990, "TATAC":6603, "TATAG":7953, "TATTA":10779, "TATTT":21596, "TATTC":9324, "TATTG":9023, "TATCA":7606, "TATCT":9355, "TATCC":6735, "TATCG":2330, "TATGA":8833, "TATGT":9775, "TATGC":6322, "TATGG":8467, + "TACAA":11552, "TACAT":9147, "TACAC":7476, "TACAG":12967, "TACTA":8406, "TACTT":11576, "TACTC":8815, "TACTG":9463, "TACCA":11003, "TACCT":10257, "TACCC":8367, "TACCG":3813, "TACGA":2811, "TACGT":2635, "TACGC":3100, "TACGG":3495, + "TAGAA":18785, 
"TAGAT":8781, "TAGAC":7396, "TAGAG":12917, "TAGTA":8685, "TAGTT":9300, "TAGTC":7860, "TAGTG":9800, "TAGCA":9699, "TAGCT":12947, "TAGCC":10997, "TAGCG":4951, "TAGGA":11979, "TAGGT":9046, "TAGGC":9459, "TAGGG":11504, + "TTAAA":20237, "TTAAT":12629, "TTAAC":8538, "TTAAG":11479, "TTATA":10737, "TTATT":18347, "TTATC":7888, "TTATG":9437, "TTACA":12235, "TTACT":10535, "TTACC":8302, "TTACG":2408, "TTAGA":10600, "TTAGT":10662, "TTAGC":10525, "TTAGG":11005, + "TTTAA":19446, "TTTAT":18913, "TTTAC":9919, "TTTAG":13151, "TTTTA":23853, "TTTTT":56398, "TTTTC":23361, "TTTTG":25553, "TTTCA":17602, "TTTCT":26859, "TTTCC":18474, "TTTCG":5128, "TTTGA":17325, "TTTGT":21337, "TTTGC":13900, "TTTGG":20582, + "TTCAA":13191, "TTCAT":13304, "TTCAC":12093, "TTCAG":16469, "TTCTA":13015, "TTCTT":22359, "TTCTC":20598, "TTCTG":20873, "TTCCA":41145, "TTCCT":20903, "TTCCC":18422, "TTCCG":8255, "TTCGA":3656, "TTCGT":4845, "TTCGC":5448, "TTCGG":7946, + "TTGAA":16201, "TTGAT":11679, "TTGAC":7650, "TTGAG":16617, "TTGTA":11059, "TTGTT":18169, "TTGTC":10483, "TTGTG":15086, "TTGCA":12612, "TTGCT":16404, "TTGCC":15007, "TTGCG":5373, "TTGGA":15149, "TTGGT":13723, "TTGGC":22196, "TTGGG":22377, + "TCAAA":14554, "TCAAT":7495, "TCAAC":9464, "TCAAG":13195, "TCATA":8625, "TCATT":13429, "TCATC":10563, "TCATG":12065, "TCACA":13118, "TCACT":15643, "TCACC":14736, "TCACG":6328, "TCAGA":15351, "TCAGT":14041, "TCAGC":17443, "TCAGG":20166, + "TCTAA":9715, "TCTAT":7973, "TCTAC":10766, "TCTAG":9963, "TCTTA":11368, "TCTTT":21100, "TCTTC":15881, "TCTTG":17313, "TCTCA":17862, "TCTCT":21917, "TCTCC":21225, "TCTCG":10223, "TCTGA":19100, "TCTGT":18832, "TCTGC":17399, "TCTGG":21943, + "TCCAA":19166, "TCCAT":16860, "TCCAC":18898, "TCCAG":26531, "TCCTA":10535, "TCCTT":16005, "TCCTC":20234, "TCCTG":25923, "TCCCA":22552, "TCCCT":18665, "TCCCC":20631, "TCCCG":14430, "TCCGA":5398, "TCCGT":5782, "TCCGC":11830, "TCCGG":12879, + "TCGAA":4103, "TCGAT":3835, "TCGAC":3560, "TCGAG":5399, "TCGTA":3183, "TCGTT":4845, "TCGTC":14502, 
"TCGTG":6590, "TCGCA":5193, "TCGCT":10465, "TCGCC":11427, "TCGCG":7794, "TCGGA":18132, "TCGGT":6916, "TCGGC":12378, "TCGGG":14327, + "TGAAA":19062, "TGAAT":12388, "TGAAC":19829, "TGAAG":17109, "TGATA":8365, "TGATT":11821, "TGATC":14872, "TGATG":11961, "TGACA":10759, "TGACT":11272, "TGACC":11316, "TGACG":4528, "TGAGA":18859, "TGAGT":14310, "TGAGC":18933, "TGAGG":26251, + "TGTAA":13283, "TGTAT":11392, "TGTAC":7544, "TGTAG":16702, "TGTTA":10302, "TGTTT":21097, "TGTTC":11791, "TGTTG":17021, "TGTCA":12436, "TGTCT":16210, "TGTCC":12932, "TGTCG":7035, "TGTGA":15500, "TGTGT":20679, "TGTGC":15163, "TGTGG":21776, + "TGCAA":12030, "TGCAT":10766, "TGCAC":12667, "TGCAG":23209, "TGCTA":9236, "TGCTT":16368, "TGCTC":14898, "TGCTG":25499, "TGCCA":17226, "TGCCT":23853, "TGCCC":20589, "TGCCG":10478, "TGCGA":5814, "TGCGT":5728, "TGCGC":12134, "TGCGG":13942, + "TGGAA":18389, "TGGAT":14922, "TGGAC":10823, "TGGAG":28144, "TGGTA":10064, "TGGTT":14833, "TGGTC":12504, "TGGTG":25842, "TGGCA":26718, "TGGCT":22727, "TGGCC":22820, "TGGCG":19059, "TGGGA":29121, "TGGGT":19905, "TGGGC":26089, "TGGGG":33332, + "CAAAA":21228, "CAAAT":14422, "CAAAC":12178, "CAAAG":17007, "CAATA":9072, "CAATT":9277, "CAATC":7250, "CAATG":11097, "CAACA":14522, "CAACT":10898, "CAACC":11731, "CAACG":5713, "CAAGA":15215, "CAAGT":12224, "CAAGC":13503, "CAAGG":16280, + "CATAA":9806, "CATAT":9378, "CATAC":6869, "CATAG":8353, "CATTA":8535, "CATTT":17915, "CATTC":11093, "CATTG":11281, "CATCA":11013, "CATCT":13545, "CATCC":11575, "CATCG":4981, "CATGA":11584, "CATGT":13127, "CATGC":13045, "CATGG":18324, + "CACAA":13494, "CACAT":12407, "CACAC":17016, "CACAG":18772, "CACTA":9531, "CACTT":14834, "CACTC":13860, "CACTG":21565, "CACCA":19600, "CACCT":18595, "CACCC":36828, "CACCG":9728, "CACGA":5506, "CACGT":5794, "CACGC":10276, "CACGG":9110, + "CAGAA":19864, "CAGAT":12845, "CAGAC":13059, "CAGAG":24313, "CAGTA":10839, "CAGTT":13294, "CAGTC":12054, "CAGTG":22387, "CAGCA":20737, "CAGCT":22172, "CAGCC":30430, "CAGCG":15417, "CAGGA":25069, 
"CAGGT":19313, "CAGGC":30747, "CAGGG":28661, + "CTAAA":12593, "CTAAT":8971, "CTAAC":7907, "CTAAG":9671, "CTATA":7541, "CTATT":8660, "CTATC":6218, "CTATG":7914, "CTACA":9692, "CTACT":12058, "CTACC":11493, "CTACG":4359, "CTAGA":10660, "CTAGT":7228, "CTAGC":9307, "CTAGG":12080, + "CTTAA":10754, "CTTAT":7990, "CTTAC":7744, "CTTAG":9999, "CTTTA":11100, "CTTTT":21598, "CTTTC":15775, "CTTTG":18616, "CTTCA":12894, "CTTCT":19096, "CTTCC":20044, "CTTCG":6856, "CTTGA":14312, "CTTGT":12457, "CTTGC":13344, "CTTGG":25072, + "CTCAA":14327, "CTCAT":12220, "CTCAC":17129, "CTCAG":24726, "CTCTA":11107, "CTCTT":17463, "CTCTC":20071, "CTCTG":29064, "CTCCA":18922, "CTCCT":25591, "CTCCC":28984, "CTCCG":13809, "CTCGA":5706, "CTCGT":6256, "CTCGC":13402, "CTCGG":17374, + "CTGAA":22033, "CTGAT":11658, "CTGAC":12220, "CTGAG":26738, "CTGTA":18644, "CTGTT":14812, "CTGTC":17221, "CTGTG":22972, "CTGCA":20681, "CTGCT":22192, "CTGCC":28267, "CTGCG":16875, "CTGGA":23011, "CTGGT":17320, "CTGGC":25022, "CTGGG":43377, + "CCAAA":17881, "CCAAT":10764, "CCAAC":12810, "CCAAG":18842, "CCATA":9291, "CCATT":13790, "CCATC":13558, "CCATG":19246, "CCACA":17397, "CCACT":18673, "CCACC":27096, "CCACG":10847, "CCAGA":17950, "CCAGT":14697, "CCAGC":30539, "CCAGG":36610, + "CCTAA":9615, "CCTAT":7075, "CCTAC":8551, "CCTAG":11188, "CCTTA":8231, "CCTTT":15848, "CCTTC":15924, "CCTTG":19536, "CCTCA":19378, "CCTCT":19717, "CCTCC":31772, "CCTCG":13912, "CCTGA":17657, "CCTGT":19582, "CCTGC":24680, "CCTGG":34855, + "CCCAA":15683, "CCCAT":12572, "CCCAC":18517, "CCCAG":33025, "CCCTA":8868, "CCCTT":14966, "CCCTC":20660, "CCCTG":23960, "CCCCA":21066, "CCCCT":18564, "CCCCC":22623, "CCCCG":19960, "CCCGA":30788, "CCCGT":8194, "CCCGC":21627, "CCCGG":25015, + "CCGAA":6613, "CCGAT":4521, "CCGAC":6585, "CCGAG":37595, "CCGTA":3184, "CCGTT":4565, "CCGTC":8357, "CCGTG":9838, "CCGCA":9361, "CCGCT":11809, "CCGCC":27447, "CCGCG":21585, "CCGGA":11914, "CCGGT":8376, "CCGGC":21689, "CCGGG":29697, + "CGAAA":4700, "CGAAT":3916, "CGAAC":4966, 
"CGAAG":6615, "CGATA":1967, "CGATT":3266, "CGATC":7941, "CGATG":4574, "CGACA":4052, "CGACT":4804, "CGACC":6680, "CGACG":5298, "CGAGA":34089, "CGAGT":6662, "CGAGC":10978, "CGAGG":14584, + "CGTAA":2555, "CGTAT":2364, "CGTAC":2915, "CGTAG":4294, "CGTTA":2393, "CGTTT":4607, "CGTTC":5997, "CGTTG":5146, "CGTCA":4112, "CGTCT":6941, "CGTCC":9099, "CGTCG":15622, "CGTGA":6866, "CGTGT":8348, "CGTGC":9072, "CGTGG":13132, + "CGCAA":4682, "CGCAT":4078, "CGCAC":9181, "CGCAG":13063, "CGCTA":4426, "CGCTT":8240, "CGCTC":11568, "CGCTG":17143, "CGCCA":11479, "CGCCT":15797, "CGCCC":20639, "CGCCG":24099, "CGCGA":6847, "CGCGT":6655, "CGCGC":20645, "CGCGG":25522, + "CGGAA":7863, "CGGAT":6015, "CGGAC":19480, "CGGAG":20120, "CGGTA":3826, "CGGTT":5664, "CGGTC":8752, "CGGTG":14252, "CGGCA":9937, "CGGCT":15864, "CGGCC":23995, "CGGCG":30448, "CGGGA":17246, "CGGGT":13472, "CGGGC":26115, "CGGGG":28394, + "GAAAA":21721, "GAAAT":16887, "GAAAC":14984, "GAAAG":18536, "GAATA":9729, "GAATT":39670, "GAATC":10965, "GAATG":13205, "GAACA":13771, "GAACT":18920, "GAACC":14484, "GAACG":12024, "GAAGA":19232, "GAAGT":13706, "GAAGC":17325, "GAAGG":22201, + "GATAA":9173, "GATAT":6694, "GATAC":5818, "GATAG":7292, "GATTA":9886, "GATTT":12521, "GATTC":9657, "GATTG":9027, "GATCA":10539, "GATCT":11519, "GATCC":10820, "GATCG":16701, "GATGA":10606, "GATGT":9729, "GATGC":9481, "GATGG":18414, + "GACAA":9427, "GACAT":8800, "GACAC":9787, "GACAG":17068, "GACTA":8014, "GACTT":11302, "GACTC":14105, "GACTG":19845, "GACCA":12246, "GACCT":12645, "GACCC":14201, "GACCG":7652, "GACGA":4194, "GACGT":4563, "GACGC":8805, "GACGG":11099, + "GAGAA":48016, "GAGAT":14851, "GAGAC":18447, "GAGAG":25901, "GAGTA":9288, "GAGTT":13284, "GAGTC":13565, "GAGTG":19652, "GAGCA":15659, "GAGCT":19074, "GAGCC":27182, "GAGCG":17393, "GAGGA":25779, "GAGGT":21312, "GAGGC":38665, "GAGGG":28122, + "GTAAA":10837, "GTAAT":10707, "GTAAC":6808, "GTAAG":8647, "GTATA":6821, "GTATT":9863, "GTATC":5364, "GTATG":7088, "GTACA":7449, "GTACT":6890, "GTACC":6514, 
"GTACG":3316, "GTAGA":16340, "GTAGT":9241, "GTAGC":11429, "GTAGG":10916, + "GTTAA":8362, "GTTAT":7073, "GTTAC":6321, "GTTAG":8624, "GTTTA":9462, "GTTTT":20190, "GTTTC":14550, "GTTTG":14352, "GTTCA":11336, "GTTCT":13070, "GTTCC":12339, "GTTCG":6428, "GTTGA":9647, "GTTGT":10928, "GTTGC":12870, "GTTGG":15970, + "GTCAA":7097, "GTCAT":8765, "GTCAC":10304, "GTCAG":13949, "GTCTA":6040, "GTCTT":13167, "GTCTC":17847, "GTCTG":15436, "GTCCA":9259, "GTCCT":13199, "GTCCC":16744, "GTCCG":9326, "GTCGA":3431, "GTCGT":4722, "GTCGC":11247, "GTCGG":21719, + "GTGAA":14526, "GTGAT":13536, "GTGAC":11233, "GTGAG":21184, "GTGTA":8985, "GTGTT":14013, "GTGTC":12770, "GTGTG":22226, "GTGCA":14547, "GTGCT":16888, "GTGCC":17486, "GTGCG":11240, "GTGGA":18433, "GTGGT":19643, "GTGGC":26118, "GTGGG":28792, + "GCAAA":12051, "GCAAT":8811, "GCAAC":10067, "GCAAG":12792, "GCATA":6727, "GCATT":9991, "GCATC":8735, "GCATG":13228, "GCACA":13387, "GCACT":14018, "GCACC":31268, "GCACG":8523, "GCAGA":18553, "GCAGT":17541, "GCAGC":27815, "GCAGG":29962, + "GCTAA":9605, "GCTAT":6823, "GCTAC":9608, "GCTAG":9284, "GCTTA":7508, "GCTTT":14176, "GCTTC":15915, "GCTTG":15140, "GCTCA":16798, "GCTCT":17440, "GCTCC":20171, "GCTCG":12147, "GCTGA":20034, "GCTGT":19227, "GCTGC":31879, "GCTGG":39633, + "GCCAA":13624, "GCCAT":15124, "GCCAC":19429, "GCCAG":25067, "GCCTA":9398, "GCCTT":15586, "GCCTC":28822, "GCCTG":30950, "GCCCA":20998, "GCCCT":19936, "GCCCC":25612, "GCCCG":22011, "GCCGA":12381, "GCCGT":8216, "GCCGC":29424, "GCCGG":28080, + "GCGAA":5206, "GCGAT":7188, "GCGAC":7291, "GCGAG":14838, "GCGTA":2812, "GCGTT":5674, "GCGTC":9259, "GCGTG":12984, "GCGCA":10605, "GCGCT":14842, "GCGCC":24948, "GCGCG":24814, "GCGGA":15888, "GCGGT":13044, "GCGGC":39771, "GCGGG":33255, + "GGAAA":20475, "GGAAT":13686, "GGAAC":12389, "GGAAG":25169, "GGATA":7769, "GGATT":13659, "GGATC":16418, "GGATG":14504, "GGACA":13485, "GGACT":24031, "GGACC":14058, "GGACG":10155, "GGAGA":27156, "GGAGT":20898, "GGAGC":29993, "GGAGG":44756, + "GGTAA":9610, 
"GGTAT":6801, "GGTAC":6455, "GGTAG":11305, "GGTTA":7738, "GGTTT":16237, "GGTTC":13589, "GGTTG":14333, "GGTCA":11228, "GGTCT":14848, "GGTCC":14101, "GGTCG":11102, "GGTGA":19470, "GGTGT":16894, "GGTGC":19285, "GGTGG":34442, + "GGCAA":12597, "GGCAT":12360, "GGCAC":29061, "GGCAG":31690, "GGCTA":9272, "GGCTT":15052, "GGCTC":23987, "GGCTG":42745, "GGCCA":22331, "GGCCT":21935, "GGCCC":27840, "GGCCG":27485, "GGCGA":13388, "GGCGT":12103, "GGCGC":28614, "GGCGG":45674, + "GGGAA":22282, "GGGAT":17410, "GGGAC":18391, "GGGAG":41334, "GGGTA":9119, "GGGTT":17546, "GGGTC":18240, "GGGTG":26433, "GGGCA":20798, "GGGCT":25578, "GGGCC":31116, "GGGCG":30143, "GGGGA":28425, "GGGGT":21563, "GGGGC":34750, "GGGGG":33171 + }, + "overrepresented_sequences": { + "AAAAAAAAAAAAAAAAAAAA":39, + "CTTGGCACCCGAGAATTCCA":162, + "GTCGGACTGTAGAACTCTGA":33, + "TCGGACTGTAGAACTCTGAA":62 } + }, + "command": "fastp --overrepresentation_analysis --thread 1 --in1 /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastq/tutorial_R2.fastq --adapter_sequence GATCGTCGGACTGTAGAACTCTGAAC --length_required 18 --html /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.html --json /project/shefflab/processed/proseq/peppro_tutorial/pe/11-27-19/results_pipeline/tutorial/fastp/tutorial_R2_fastp_adapter.json --report_title tutorial --stdout " +} \ No newline at end of file diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_processed_fastqc.html b/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_processed_fastqc.html new file mode 100644 index 0000000..5a233b8 --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_processed_fastqc.html @@ -0,0 +1,187 @@ +tutorial_R1_processed.fastq FastQC Report
FastQCFastQC Report
Wed 27 Nov 2019
tutorial_R1_processed.fastq

[OK]Basic Statistics

MeasureValue
Filenametutorial_R1_processed.fastq
File typeConventional base calls
EncodingSanger / Illumina 1.9
Total Sequences497796
Sequences flagged as poor quality0
Sequence length18-30
%GC53

[OK]Per base sequence quality

Per base quality graph

[OK]Per tile sequence quality

Per base quality graph

[OK]Per sequence quality scores

Per Sequence quality graph

[FAIL]Per base sequence content

Per base sequence content

[OK]Per sequence GC content

Per sequence GC content graph

[OK]Per base N content

N content graph

[WARN]Sequence Length Distribution

Sequence length distribution

[OK]Sequence Duplication Levels

Duplication level graph

[WARN]Overrepresented sequences

SequenceCountPercentagePossible Source
GCCGAATCCTAACCACTAGACCACCAGGGA6120.12294192801870646No Hit

[OK]Adapter Content

Adapter graph

[FAIL]Kmer Content

Kmer graph

SequenceCountPValueObs/Exp MaxMax Obs/Exp Position
GATCATT1100.021.69711315
ATCATTA1200.019.04514716
CATCGAT350.006366337618.81975224
ACGCGGT501.482526E-518.682937
CCGAACC350.007634100518.23680322
ACCGAGA1300.017.06442310
CGGTCTG553.3671233E-516.98277510
AGGATCA1400.016.76879713
CGGAAGG1400.016.679519
CACCGAG1400.016.679519
ATGGATC503.1313632E-416.3492076
GGATCAT1400.016.05521814
GCGGAAG1400.015.8471288
TCGGGCC555.163341E-415.30901816
TGCGGAA1450.015.3006757
GCACCGA1450.015.3006758
GGTCCAA658.73694E-515.19109819
ACCTGCG1500.014.7921394
ATCGCAG607.428703E-414.59205920
GATCGCA853.5939302E-614.52090319
\ No newline at end of file diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.html b/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.html new file mode 100644 index 0000000..471ce9e --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.html @@ -0,0 +1,3511 @@ +fastp report at 2019-07-31 20:43:29 + + + +
+

tutorial +
+ +
+
General
+
+ + + + + + +
fastp version:0.19.4 (https://github.com/OpenGene/fastp)
sequencing:single end (46 cycles)
mean length before filtering:45bp
mean length after filtering:43bp
duplication rate:25.115256% (may be overestimated since this is SE data)
+
+
Before filtering
+
+ + + + + + +
total reads:1000.000000 K
total bases:45.253865 M
Q20 bases:43.967646 M (97.157770%)
Q30 bases:43.636909 M (96.426922%)
GC content:54.417765%
+
+
After filtering
+
+ + + + + + +
total reads:464.390000 K
total bases:20.168949 M
Q20 bases:19.734976 M (97.848311%)
Q30 bases:19.583954 M (97.099527%)
GC content:53.295717%
+
+
Filtering result
+
+ + + + + +
reads passed filters:464.390000 K (46.439000%)
reads with low quality:13.053000 K (1.305300%)
reads with too many N:8 (0.000800%)
reads too short:522.549000 K (52.254900%)
+
+
+
+
+ +
+
Adapter or bad ligation of read1
+
+ + + + + + + + + + +
SequenceOccurrences
TGGAATTCTCGGGTGCC13226
TGGAATTCTCGGGTGCCA9608
TGGAATTCTCGGGTGCCAAGG10676
TGGAATTCTCGGGTGCCAAGGAACTCC7569
TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC8351
TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCA383032
TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAAT18321
other adapter sequences148213
+
+
+
+
+ +
+
+
+
+ + +
+
+
+ +
+ +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Darker background means larger counts. The count will be shown on mouse over.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AAATACAGTATTTCTGCACTCCCGGAGTGCGG
AAAAAAAAAAAATAAAACAAAAGAAATAAAATTAAATCAAATGAAACAAAACTAAACCAAACGAAAGAAAAGTAAAGCAAAGG
AATAATAAAATATAATACAATAGAATTAAATTTAATTCAATTGAATCAAATCTAATCCAATCGAATGAAATGTAATGCAATGG
AACAACAAAACATAACACAACAGAACTAAACTTAACTCAACTGAACCAAACCTAACCCAACCGAACGAAACGTAACGCAACGG
AAGAAGAAAAGATAAGACAAGAGAAGTAAAGTTAAGTCAAGTGAAGCAAAGCTAAGCCAAGCGAAGGAAAGGTAAGGCAAGGG
ATAATAAAATAATATAACATAAGATATAATATTATATCATATGATACAATACTATACCATACGATAGAATAGTATAGCATAGG
ATTATTAAATTATATTACATTAGATTTAATTTTATTTCATTTGATTCAATTCTATTCCATTCGATTGAATTGTATTGCATTGG
ATCATCAAATCATATCACATCAGATCTAATCTTATCTCATCTGATCCAATCCTATCCCATCCGATCGAATCGTATCGCATCGG
ATGATGAAATGATATGACATGAGATGTAATGTTATGTCATGTGATGCAATGCTATGCCATGCGATGGAATGGTATGGCATGGG
ACAACAAAACAATACAACACAAGACATAACATTACATCACATGACACAACACTACACCACACGACAGAACAGTACAGCACAGG
ACTACTAAACTATACTACACTAGACTTAACTTTACTTCACTTGACTCAACTCTACTCCACTCGACTGAACTGTACTGCACTGG
ACCACCAAACCATACCACACCAGACCTAACCTTACCTCACCTGACCCAACCCTACCCCACCCGACCGAACCGTACCGCACCGG
ACGACGAAACGATACGACACGAGACGTAACGTTACGTCACGTGACGCAACGCTACGCCACGCGACGGAACGGTACGGCACGGG
AGAAGAAAAGAATAGAACAGAAGAGATAAGATTAGATCAGATGAGACAAGACTAGACCAGACGAGAGAAGAGTAGAGCAGAGG
AGTAGTAAAGTATAGTACAGTAGAGTTAAGTTTAGTTCAGTTGAGTCAAGTCTAGTCCAGTCGAGTGAAGTGTAGTGCAGTGG
AGCAGCAAAGCATAGCACAGCAGAGCTAAGCTTAGCTCAGCTGAGCCAAGCCTAGCCCAGCCGAGCGAAGCGTAGCGCAGCGG
AGGAGGAAAGGATAGGACAGGAGAGGTAAGGTTAGGTCAGGTGAGGCAAGGCTAGGCCAGGCGAGGGAAGGGTAGGGCAGGGG
TAATAAAATAAATTAAACTAAAGTAATATAATTTAATCTAATGTAACATAACTTAACCTAACGTAAGATAAGTTAAGCTAAGG
TATTATAATATATTATACTATAGTATTATATTTTATTCTATTGTATCATATCTTATCCTATCGTATGATATGTTATGCTATGG
TACTACAATACATTACACTACAGTACTATACTTTACTCTACTGTACCATACCTTACCCTACCGTACGATACGTTACGCTACGG
TAGTAGAATAGATTAGACTAGAGTAGTATAGTTTAGTCTAGTGTAGCATAGCTTAGCCTAGCGTAGGATAGGTTAGGCTAGGG
TTATTAAATTAATTTAACTTAAGTTATATTATTTTATCTTATGTTACATTACTTTACCTTACGTTAGATTAGTTTAGCTTAGG
TTTTTTAATTTATTTTACTTTAGTTTTATTTTTTTTTCTTTTGTTTCATTTCTTTTCCTTTCGTTTGATTTGTTTTGCTTTGG
TTCTTCAATTCATTTCACTTCAGTTCTATTCTTTTCTCTTCTGTTCCATTCCTTTCCCTTCCGTTCGATTCGTTTCGCTTCGG
TTGTTGAATTGATTTGACTTGAGTTGTATTGTTTTGTCTTGTGTTGCATTGCTTTGCCTTGCGTTGGATTGGTTTGGCTTGGG
TCATCAAATCAATTCAACTCAAGTCATATCATTTCATCTCATGTCACATCACTTCACCTCACGTCAGATCAGTTCAGCTCAGG
TCTTCTAATCTATTCTACTCTAGTCTTATCTTTTCTTCTCTTGTCTCATCTCTTCTCCTCTCGTCTGATCTGTTCTGCTCTGG
TCCTCCAATCCATTCCACTCCAGTCCTATCCTTTCCTCTCCTGTCCCATCCCTTCCCCTCCCGTCCGATCCGTTCCGCTCCGG
TCGTCGAATCGATTCGACTCGAGTCGTATCGTTTCGTCTCGTGTCGCATCGCTTCGCCTCGCGTCGGATCGGTTCGGCTCGGG
TGATGAAATGAATTGAACTGAAGTGATATGATTTGATCTGATGTGACATGACTTGACCTGACGTGAGATGAGTTGAGCTGAGG
TGTTGTAATGTATTGTACTGTAGTGTTATGTTTTGTTCTGTTGTGTCATGTCTTGTCCTGTCGTGTGATGTGTTGTGCTGTGG
TGCTGCAATGCATTGCACTGCAGTGCTATGCTTTGCTCTGCTGTGCCATGCCTTGCCCTGCCGTGCGATGCGTTGCGCTGCGG
TGGTGGAATGGATTGGACTGGAGTGGTATGGTTTGGTCTGGTGTGGCATGGCTTGGCCTGGCGTGGGATGGGTTGGGCTGGGG
CAACAAAACAAATCAAACCAAAGCAATACAATTCAATCCAATGCAACACAACTCAACCCAACGCAAGACAAGTCAAGCCAAGG
CATCATAACATATCATACCATAGCATTACATTTCATTCCATTGCATCACATCTCATCCCATCGCATGACATGTCATGCCATGG
CACCACAACACATCACACCACAGCACTACACTTCACTCCACTGCACCACACCTCACCCCACCGCACGACACGTCACGCCACGG
CAGCAGAACAGATCAGACCAGAGCAGTACAGTTCAGTCCAGTGCAGCACAGCTCAGCCCAGCGCAGGACAGGTCAGGCCAGGG
CTACTAAACTAATCTAACCTAAGCTATACTATTCTATCCTATGCTACACTACTCTACCCTACGCTAGACTAGTCTAGCCTAGG
CTTCTTAACTTATCTTACCTTAGCTTTACTTTTCTTTCCTTTGCTTCACTTCTCTTCCCTTCGCTTGACTTGTCTTGCCTTGG
CTCCTCAACTCATCTCACCTCAGCTCTACTCTTCTCTCCTCTGCTCCACTCCTCTCCCCTCCGCTCGACTCGTCTCGCCTCGG
CTGCTGAACTGATCTGACCTGAGCTGTACTGTTCTGTCCTGTGCTGCACTGCTCTGCCCTGCGCTGGACTGGTCTGGCCTGGG
CCACCAAACCAATCCAACCCAAGCCATACCATTCCATCCCATGCCACACCACTCCACCCCACGCCAGACCAGTCCAGCCCAGG
CCTCCTAACCTATCCTACCCTAGCCTTACCTTTCCTTCCCTTGCCTCACCTCTCCTCCCCTCGCCTGACCTGTCCTGCCCTGG
CCCCCCAACCCATCCCACCCCAGCCCTACCCTTCCCTCCCCTGCCCCACCCCTCCCCCCCCCGCCCGACCCGTCCCGCCCCGG
CCGCCGAACCGATCCGACCCGAGCCGTACCGTTCCGTCCCGTGCCGCACCGCTCCGCCCCGCGCCGGACCGGTCCGGCCCGGG
CGACGAAACGAATCGAACCGAAGCGATACGATTCGATCCGATGCGACACGACTCGACCCGACGCGAGACGAGTCGAGCCGAGG
CGTCGTAACGTATCGTACCGTAGCGTTACGTTTCGTTCCGTTGCGTCACGTCTCGTCCCGTCGCGTGACGTGTCGTGCCGTGG
CGCCGCAACGCATCGCACCGCAGCGCTACGCTTCGCTCCGCTGCGCCACGCCTCGCCCCGCCGCGCGACGCGTCGCGCCGCGG
CGGCGGAACGGATCGGACCGGAGCGGTACGGTTCGGTCCGGTGCGGCACGGCTCGGCCCGGCGCGGGACGGGTCGGGCCGGGG
GAAGAAAAGAAATGAAACGAAAGGAATAGAATTGAATCGAATGGAACAGAACTGAACCGAACGGAAGAGAAGTGAAGCGAAGG
GATGATAAGATATGATACGATAGGATTAGATTTGATTCGATTGGATCAGATCTGATCCGATCGGATGAGATGTGATGCGATGG
GACGACAAGACATGACACGACAGGACTAGACTTGACTCGACTGGACCAGACCTGACCCGACCGGACGAGACGTGACGCGACGG
GAGGAGAAGAGATGAGACGAGAGGAGTAGAGTTGAGTCGAGTGGAGCAGAGCTGAGCCGAGCGGAGGAGAGGTGAGGCGAGGG
GTAGTAAAGTAATGTAACGTAAGGTATAGTATTGTATCGTATGGTACAGTACTGTACCGTACGGTAGAGTAGTGTAGCGTAGG
GTTGTTAAGTTATGTTACGTTAGGTTTAGTTTTGTTTCGTTTGGTTCAGTTCTGTTCCGTTCGGTTGAGTTGTGTTGCGTTGG
GTCGTCAAGTCATGTCACGTCAGGTCTAGTCTTGTCTCGTCTGGTCCAGTCCTGTCCCGTCCGGTCGAGTCGTGTCGCGTCGG
GTGGTGAAGTGATGTGACGTGAGGTGTAGTGTTGTGTCGTGTGGTGCAGTGCTGTGCCGTGCGGTGGAGTGGTGTGGCGTGGG
GCAGCAAAGCAATGCAACGCAAGGCATAGCATTGCATCGCATGGCACAGCACTGCACCGCACGGCAGAGCAGTGCAGCGCAGG
GCTGCTAAGCTATGCTACGCTAGGCTTAGCTTTGCTTCGCTTGGCTCAGCTCTGCTCCGCTCGGCTGAGCTGTGCTGCGCTGG
GCCGCCAAGCCATGCCACGCCAGGCCTAGCCTTGCCTCGCCTGGCCCAGCCCTGCCCCGCCCGGCCGAGCCGTGCCGCGCCGG
GCGGCGAAGCGATGCGACGCGAGGCGTAGCGTTGCGTCGCGTGGCGCAGCGCTGCGCCGCGCGGCGGAGCGGTGCGGCGCGGG
GGAGGAAAGGAATGGAACGGAAGGGATAGGATTGGATCGGATGGGACAGGACTGGACCGGACGGGAGAGGAGTGGAGCGGAGG
GGTGGTAAGGTATGGTACGGTAGGGTTAGGTTTGGTTCGGTTGGGTCAGGTCTGGTCCGGTCGGGTGAGGTGTGGTGCGGTGG
GGCGGCAAGGCATGGCACGGCAGGGCTAGGCTTGGCTCGGCTGGGCCAGGCCTGGCCCGGCCGGGCGAGGCGTGGCGCGGCGG
GGGGGGAAGGGATGGGACGGGAGGGGTAGGGTTGGGTCGGGTGGGGCAGGGCTGGGCCGGGCGGGGGAGGGGTGGGGCGGGGG
+
+ +
+
Sampling rate: 1 / 20
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
overrepresented sequencecount (% of bases)distribution: cycle 1 ~ cycle 46
AAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC62 (0.109604%)
AAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC44 (0.077783%)
AACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC34 (0.060105%)
AACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
AACGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC44 (0.077783%)
AACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC36 (0.063641%)
AAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC53 (0.093694%)
AAGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC32 (0.056570%)
AAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC55 (0.097229%)
AAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC61 (0.107836%)
AAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC40 (0.070712%)
AAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC35 (0.061873%)
AAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC43 (0.076016%)
AAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC59 (0.104300%)
AAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC64 (0.113140%)
AATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
AATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA25 (0.044195%)
AATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC51 (0.090158%)
AATTCTCGGGTGCCAAGGAA46 (0.040660%)
AATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC64 (0.113140%)
ACAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
ACACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC29 (0.051266%)
ACAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
ACATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC65 (0.114907%)
ACCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC36 (0.063641%)
ACCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC19 (0.033588%)
ACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC24 (0.042427%)
ACCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC44 (0.077783%)
ACCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
ACGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
ACGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC45 (0.079551%)
ACGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
ACGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC41 (0.072480%)
ACTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC29 (0.051266%)
ACTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC20 (0.035356%)
ACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA24 (0.042427%)
ACTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC20 (0.035356%)
ACTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC44 (0.077783%)
AGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC33 (0.058338%)
AGACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC34 (0.060105%)
AGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC30 (0.053034%)
AGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC55 (0.097229%)
AGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC47 (0.083087%)
AGCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC30 (0.053034%)
AGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC24 (0.042427%)
AGCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
AGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
AGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC57 (0.100765%)
AGCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC26 (0.045963%)
AGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC52 (0.091926%)
AGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
AGCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
AGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
AGCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC29 (0.051266%)
AGCTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC15 (0.026517%)
AGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC37 (0.065409%)
AGGAATTCTCGGGTGCCAAG118 (0.104300%)
AGGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC29 (0.051266%)
AGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC86 (0.152031%)
AGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC38 (0.067177%)
AGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
AGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC28 (0.049499%)
AGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC22 (0.038892%)
AGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC38 (0.067177%)
AGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC68 (0.120211%)
AGGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG31 (0.054802%)
AGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC37 (0.065409%)
AGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC26 (0.045963%)
AGGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC40 (0.070712%)
AGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC29 (0.051266%)
AGGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC38 (0.067177%)
AGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC59 (0.104300%)
AGTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC48 (0.084855%)
AGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
AGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA32 (0.056570%)
AGTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC43 (0.076016%)
AGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC45 (0.079551%)
AGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC57 (0.100765%)
ATAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC45 (0.079551%)
ATACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
ATAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC43 (0.076016%)
ATATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC57 (0.100765%)
ATCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
ATCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC56 (0.098997%)
ATCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
ATGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC56 (0.098997%)
ATGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC69 (0.121979%)
ATGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC135 (0.238654%)
ATGGAATTCTCGGGTGCCAA5179 (4.577731%)
ATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT58 (0.112786%)
ATGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
ATGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC37 (0.065409%)
ATGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC79 (0.139657%)
ATGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC59 (0.104300%)
ATTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
ATTCTCGGGTGCCAAGGAAC43 (0.038008%)
ATTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
ATTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC32 (0.056570%)
ATTGGAATTCTCGGGTGCCA1375 (1.215366%)
ATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA30 (0.053034%)
ATTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC61 (0.107836%)
ATTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC85 (0.150263%)
CAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC75 (0.132585%)
CAAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC10 (0.017678%)
CAACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC62 (0.109604%)
CAAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC31 (0.054802%)
CAAGGAACTCCAGTCACGCC86 (0.076016%)
CAAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC78 (0.137889%)
CAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC18 (0.031820%)
CACACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC23 (0.040660%)
CACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
CACCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC48 (0.084855%)
CACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC31 (0.054802%)
CACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC48 (0.084855%)
CACCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC38 (0.067177%)
CACGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
CACGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC85 (0.150263%)
CACTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC22 (0.038892%)
CACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC40 (0.070712%)
CAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC46 (0.081319%)
CAGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC49 (0.086622%)
CAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC40 (0.070712%)
CAGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC49 (0.086622%)
CAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC39 (0.068944%)
CAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC60 (0.106068%)
CAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC50 (0.088390%)
CAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC49 (0.086622%)
CAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC30 (0.053034%)
CAGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
CAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC20 (0.035356%)
CATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC74 (0.130818%)
CATCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC28 (0.049499%)
CATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA18 (0.031820%)
CATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC40 (0.070712%)
CATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC20 (0.035356%)
CCAAGGAACTCCAGTCACGC38 (0.033588%)
CCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC31 (0.054802%)
CCAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC28 (0.049499%)
CCACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
CCACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG23 (0.040660%)
CCACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC53 (0.093694%)
CCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC19 (0.033588%)
CCACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC53 (0.093694%)
CCAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG39 (0.068944%)
CCAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC35 (0.061873%)
CCAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC35 (0.061873%)
CCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC31 (0.054802%)
CCAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
CCATCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC31 (0.054802%)
CCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC37 (0.065409%)
CCATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC64 (0.113140%)
CCCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC49 (0.086622%)
CCCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC69 (0.121979%)
CCCAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG32 (0.056570%)
CCCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC46 (0.081319%)
CCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
CCCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC35 (0.061873%)
CCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC54 (0.095461%)
CCCCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG27 (0.047731%)
CCCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG38 (0.067177%)
CCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC44 (0.077783%)
CCCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC64 (0.113140%)
CCCCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG14 (0.024749%)
CCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC19 (0.033588%)
CCCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC57 (0.100765%)
CCCGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG34 (0.060105%)
CCCGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG27 (0.047731%)
CCCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC39 (0.068944%)
CCCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC47 (0.083087%)
CCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC17 (0.030053%)
CCCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC44 (0.077783%)
CCCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC23 (0.040660%)
CCCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC39 (0.068944%)
CCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC47 (0.083087%)
CCCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC58 (0.102533%)
CCGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC50 (0.088390%)
CCGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC41 (0.072480%)
CCGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC36 (0.063641%)
CCGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC23 (0.040660%)
CCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
CCGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC53 (0.093694%)
CCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC25 (0.044195%)
CCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC43 (0.076016%)
CCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC33 (0.058338%)
CCTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC27 (0.047731%)
CCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC68 (0.120211%)
CCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA7 (0.012375%)
CCTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC70 (0.123746%)
CCTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
CCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC31 (0.054802%)
CCTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC45 (0.079551%)
CGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC47 (0.083087%)
CGACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC31 (0.054802%)
CGAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC19 (0.033588%)
CGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC18 (0.031820%)
CGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC53 (0.093694%)
CGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
CGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
CGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC49 (0.086622%)
CGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC38 (0.067177%)
CGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC48 (0.084855%)
CGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
CGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC18 (0.031820%)
CGCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC36 (0.063641%)
CGCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
CGCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC26 (0.045963%)
CGCTGCGATCTATTGAAAGTCAGCCCTCGACACAAGGGTT24 (0.042427%)
CGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC39 (0.068944%)
CGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC39 (0.068944%)
CGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC29 (0.051266%)
CGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC61 (0.107836%)
CGGGTGCCAAGGAACTCCAG57 (0.050382%)
CGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC29 (0.051266%)
CGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC31 (0.054802%)
CGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
CGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC47 (0.083087%)
CGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA16 (0.028285%)
CGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC36 (0.063641%)
CGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC41 (0.072480%)
CTAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC60 (0.106068%)
CTACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC57 (0.100765%)
CTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC51 (0.090158%)
CTAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC62 (0.109604%)
CTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC19 (0.033588%)
CTCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC43 (0.076016%)
CTCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
CTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC47 (0.083087%)
CTCGGGTGCCAAGGAACTCC39 (0.034472%)
CTCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC30 (0.053034%)
CTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC26 (0.045963%)
CTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC35 (0.061873%)
CTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC54 (0.095461%)
CTGGAATTCTCGGGTGCCAA6923 (6.119256%)
CTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT35 (0.068060%)
CTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC36 (0.063641%)
CTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC32 (0.056570%)
CTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
CTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC47 (0.083087%)
CTTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC22 (0.038892%)
CTTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
CTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA15 (0.026517%)
CTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC16 (0.028285%)
CTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC17 (0.030053%)
GAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
GAATTCTCGGGTGCCAAGGA39 (0.034472%)
GAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC44 (0.077783%)
GACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC25 (0.044195%)
GAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC66 (0.116675%)
GAGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC30 (0.053034%)
GAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC50 (0.088390%)
GAGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC27 (0.047731%)
GAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC33 (0.058338%)
GAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC27 (0.047731%)
GAGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG30 (0.053034%)
GAGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG42 (0.074248%)
GAGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG42 (0.074248%)
GAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC52 (0.091926%)
GAGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG34 (0.060105%)
GAGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG35 (0.061873%)
GAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC32 (0.056570%)
GAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC16 (0.028285%)
GAGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC29 (0.051266%)
GAGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC23 (0.040660%)
GAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC99 (0.175013%)
GATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC39 (0.068944%)
GATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA13 (0.022981%)
GATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC53 (0.093694%)
GATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC62 (0.109604%)
GCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC117 (0.206833%)
GCAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC32 (0.056570%)
GCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC75 (0.132585%)
GCAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC73 (0.129050%)
GCAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC44 (0.077783%)
GCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC61 (0.107836%)
GCAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC31 (0.054802%)
GCATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC23 (0.040660%)
GCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC24 (0.042427%)
GCATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC32 (0.056570%)
GCCAAGGAACTCCAGTCACG39 (0.034472%)
GCCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC44 (0.077783%)
GCCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC88 (0.155567%)
GCCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC41 (0.072480%)
GCCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC50 (0.088390%)
GCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC86 (0.152031%)
GCCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG29 (0.051266%)
GCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC57 (0.100765%)
GCCCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG29 (0.051266%)
GCCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC64 (0.113140%)
GCCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
GCCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC38 (0.067177%)
GCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC16 (0.028285%)
GCCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
GCCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC62 (0.109604%)
GCGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC25 (0.044195%)
GCGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
GCGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC69 (0.121979%)
GCGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC63 (0.111372%)
GCGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG37 (0.065409%)
GCGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC60 (0.106068%)
GCGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC56 (0.098997%)
GCGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC31 (0.054802%)
GCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC44 (0.077783%)
GCGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC29 (0.051266%)
GCGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
GCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC65 (0.114907%)
GCGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
GCTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC45 (0.079551%)
GCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC18 (0.031820%)
GCTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC27 (0.047731%)
GCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC64 (0.113140%)
GCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATC20 (0.038892%)
GCTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC29 (0.051266%)
GCTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC62 (0.109604%)
GCTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC31 (0.054802%)
GCTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
GCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCA6 (0.010607%)
GCTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
GCTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
GGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
GGAATTCTCGGGTGCCAAGG162 (0.143192%)
GGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCG5 (0.009723%)
GGAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC37 (0.065409%)
GGAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG29 (0.051266%)
GGAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG39 (0.068944%)
GGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC28 (0.049499%)
GGAGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG24 (0.042427%)
GGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC21 (0.037124%)
GGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC58 (0.102533%)
GGCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC30 (0.053034%)
GGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC17 (0.030053%)
GGCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC43 (0.076016%)
GGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC48 (0.084855%)
GGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC55 (0.097229%)
GGCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC24 (0.042427%)
GGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC27 (0.047731%)
GGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC45 (0.079551%)
GGCGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC50 (0.088390%)
GGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC39 (0.068944%)
GGCGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG19 (0.033588%)
GGCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC55 (0.097229%)
GGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC18 (0.031820%)
GGCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC47 (0.083087%)
GGCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC37 (0.065409%)
GGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC24 (0.042427%)
GGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC60 (0.106068%)
GGGAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC27 (0.047731%)
GGGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG39 (0.068944%)
GGGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC43 (0.076016%)
GGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC65 (0.114907%)
GGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
GGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC36 (0.063641%)
GGGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG28 (0.049499%)
GGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC52 (0.091926%)
GGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC23 (0.040660%)
GGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC28 (0.049499%)
GGGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC47 (0.083087%)
GGGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG21 (0.037124%)
GGGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG35 (0.061873%)
GGGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG40 (0.070712%)
GGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC36 (0.063641%)
GGGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG24 (0.042427%)
GGGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG42 (0.074248%)
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG20 (0.035356%)
GGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC85 (0.150263%)
GGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC17 (0.030053%)
GGGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC61 (0.107836%)
GGGTGCCAAGGAACTCCAGT120 (0.106068%)
GGGTGCCTGGAATTCTCGGG14 (0.012375%)
GGGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC46 (0.081319%)
GGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC13 (0.022981%)
GGGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
GGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC69 (0.121979%)
GGTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC43 (0.076016%)
GGTGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC29 (0.051266%)
GGTGCCAAGGAACTCCAGTC76 (0.067177%)
GGTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG34 (0.060105%)
GGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC63 (0.111372%)
GGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA6 (0.010607%)
GGTGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG28 (0.049499%)
GGTGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG38 (0.067177%)
GGTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC52 (0.091926%)
GGTGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG30 (0.053034%)
GGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC76 (0.134353%)
GGTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC38 (0.067177%)
GGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC16 (0.028285%)
GTAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
GTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC48 (0.084855%)
GTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC63 (0.111372%)
GTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC31 (0.054802%)
GTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC44 (0.077783%)
GTGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
GTGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC39 (0.068944%)
GTGCCAAGGAACTCCAGTCA22 (0.019446%)
GTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC67 (0.118443%)
GTGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC46 (0.081319%)
GTGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC70 (0.123746%)
GTGGAATTCTCGGGTGCCAA4479 (3.958999%)
GTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT47 (0.091396%)
GTGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG28 (0.049499%)
GTGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
GTGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG24 (0.042427%)
GTGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC29 (0.051266%)
GTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
GTGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC28 (0.049499%)
GTGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
GTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC57 (0.100765%)
GTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC67 (0.118443%)
GTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC45 (0.079551%)
GTTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC43 (0.076016%)
GTTGGAATTCTCGGGTGCCA1640 (1.449600%)
GTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA15 (0.026517%)
GTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC64 (0.113140%)
GTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC102 (0.180316%)
NNNNNNNNNNNNNNNNNNNN317 (0.280197%)
TAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC45 (0.079551%)
TAACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC38 (0.067177%)
TAAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC40 (0.070712%)
TAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC63 (0.111372%)
TACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC53 (0.093694%)
TACCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC28 (0.049499%)
TACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC43 (0.076016%)
TACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC53 (0.093694%)
TACGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC40 (0.070712%)
TACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC53 (0.093694%)
TAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC29 (0.051266%)
TAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC54 (0.095461%)
TAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC94 (0.166174%)
TAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC39 (0.068944%)
TAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC55 (0.097229%)
TAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC56 (0.098997%)
TATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC53 (0.093694%)
TATCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC39 (0.068944%)
TATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA20 (0.035356%)
TATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC65 (0.114907%)
TATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC78 (0.137889%)
TCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC35 (0.061873%)
TCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC25 (0.044195%)
TCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
TCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC39 (0.068944%)
TCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC40 (0.070712%)
TCCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC23 (0.040660%)
TCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
TCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC55 (0.097229%)
TCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC33 (0.058338%)
TCGGGTGCCAAGGAACTCCA111 (0.098113%)
TCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC39 (0.068944%)
TCTCGGGTGCCAAGGAACTC75 (0.066293%)
TCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC16 (0.028285%)
TCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA22 (0.038892%)
TGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC39 (0.068944%)
TGAATTCTCGGGTGCCAAGG200 (0.176780%)
TGACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC36 (0.063641%)
TGAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC30 (0.053034%)
TGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC35 (0.061873%)
TGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC71 (0.125514%)
TGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
TGCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC23 (0.040660%)
TGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC76 (0.134353%)
TGCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC16 (0.028285%)
TGCCAAGGAACTCCAGTCAC31 (0.027401%)
TGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
TGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC67 (0.118443%)
TGCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC30 (0.053034%)
TGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC60 (0.106068%)
TGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC67 (0.118443%)
TGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC63 (0.111372%)
TGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
TGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC33 (0.058338%)
TGGAATTCTCGGGTGCCAAG145 (0.128166%)
TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTC131 (0.254741%)
TGGAATTCTCGGGTGCCTGG135 (0.119327%)
TGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC72 (0.127282%)
TGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC34 (0.060105%)
TGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC77 (0.136121%)
TGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC38 (0.067177%)
TGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC29 (0.051266%)
TGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC51 (0.090158%)
TGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC47 (0.083087%)
TGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
TGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC55 (0.097229%)
TGGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC48 (0.084855%)
TGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC70 (0.123746%)
TGGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC32 (0.056570%)
TGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC45 (0.079551%)
TGTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC11 (0.019446%)
TGTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
TGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC62 (0.109604%)
TGTGGAATTCTCGGGTGCCA1254 (1.108414%)
TGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA13 (0.022981%)
TGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC42 (0.074248%)
TGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC89 (0.157335%)
TTAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC52 (0.091926%)
TTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC48 (0.084855%)
TTAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC37 (0.065409%)
TTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC63 (0.111372%)
TTCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC49 (0.086622%)
TTCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC42 (0.074248%)
TTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC61 (0.107836%)
TTCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC43 (0.076016%)
TTCTCGGGTGCCAAGGAACT27 (0.023865%)
TTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC36 (0.063641%)
TTGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC45 (0.079551%)
TTGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG32 (0.056570%)
TTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC58 (0.102533%)
TTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC36 (0.063641%)
TTGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC49 (0.086622%)
TTGGAATTCTCGGGTGCCAA1870 (1.652897%)
TTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT79 (0.153622%)
TTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC79 (0.139657%)
TTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC77 (0.136121%)
TTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC58 (0.102533%)
TTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC61 (0.107836%)
TTTGGAATTCTCGGGTGCCA1831 (1.618425%)
TTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA15 (0.026517%)
TTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC72 (0.127282%)
TTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC58 (0.102533%)
TTTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC21 (0.037124%)
TTTTTTTTTTTTTTTTTTTT247 (0.218324%)
+
+ +
+
+
+ +
+ +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Darker background means larger counts. The count will be shown on mouse over.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AAATACAGTATTTCTGCACTCCCGGAGTGCGG
AAAAAAAAAAAATAAAACAAAAGAAATAAAATTAAATCAAATGAAACAAAACTAAACCAAACGAAAGAAAAGTAAAGCAAAGG
AATAATAAAATATAATACAATAGAATTAAATTTAATTCAATTGAATCAAATCTAATCCAATCGAATGAAATGTAATGCAATGG
AACAACAAAACATAACACAACAGAACTAAACTTAACTCAACTGAACCAAACCTAACCCAACCGAACGAAACGTAACGCAACGG
AAGAAGAAAAGATAAGACAAGAGAAGTAAAGTTAAGTCAAGTGAAGCAAAGCTAAGCCAAGCGAAGGAAAGGTAAGGCAAGGG
ATAATAAAATAATATAACATAAGATATAATATTATATCATATGATACAATACTATACCATACGATAGAATAGTATAGCATAGG
ATTATTAAATTATATTACATTAGATTTAATTTTATTTCATTTGATTCAATTCTATTCCATTCGATTGAATTGTATTGCATTGG
ATCATCAAATCATATCACATCAGATCTAATCTTATCTCATCTGATCCAATCCTATCCCATCCGATCGAATCGTATCGCATCGG
ATGATGAAATGATATGACATGAGATGTAATGTTATGTCATGTGATGCAATGCTATGCCATGCGATGGAATGGTATGGCATGGG
ACAACAAAACAATACAACACAAGACATAACATTACATCACATGACACAACACTACACCACACGACAGAACAGTACAGCACAGG
ACTACTAAACTATACTACACTAGACTTAACTTTACTTCACTTGACTCAACTCTACTCCACTCGACTGAACTGTACTGCACTGG
ACCACCAAACCATACCACACCAGACCTAACCTTACCTCACCTGACCCAACCCTACCCCACCCGACCGAACCGTACCGCACCGG
ACGACGAAACGATACGACACGAGACGTAACGTTACGTCACGTGACGCAACGCTACGCCACGCGACGGAACGGTACGGCACGGG
AGAAGAAAAGAATAGAACAGAAGAGATAAGATTAGATCAGATGAGACAAGACTAGACCAGACGAGAGAAGAGTAGAGCAGAGG
AGTAGTAAAGTATAGTACAGTAGAGTTAAGTTTAGTTCAGTTGAGTCAAGTCTAGTCCAGTCGAGTGAAGTGTAGTGCAGTGG
AGCAGCAAAGCATAGCACAGCAGAGCTAAGCTTAGCTCAGCTGAGCCAAGCCTAGCCCAGCCGAGCGAAGCGTAGCGCAGCGG
AGGAGGAAAGGATAGGACAGGAGAGGTAAGGTTAGGTCAGGTGAGGCAAGGCTAGGCCAGGCGAGGGAAGGGTAGGGCAGGGG
TAATAAAATAAATTAAACTAAAGTAATATAATTTAATCTAATGTAACATAACTTAACCTAACGTAAGATAAGTTAAGCTAAGG
TATTATAATATATTATACTATAGTATTATATTTTATTCTATTGTATCATATCTTATCCTATCGTATGATATGTTATGCTATGG
TACTACAATACATTACACTACAGTACTATACTTTACTCTACTGTACCATACCTTACCCTACCGTACGATACGTTACGCTACGG
TAGTAGAATAGATTAGACTAGAGTAGTATAGTTTAGTCTAGTGTAGCATAGCTTAGCCTAGCGTAGGATAGGTTAGGCTAGGG
TTATTAAATTAATTTAACTTAAGTTATATTATTTTATCTTATGTTACATTACTTTACCTTACGTTAGATTAGTTTAGCTTAGG
TTTTTTAATTTATTTTACTTTAGTTTTATTTTTTTTTCTTTTGTTTCATTTCTTTTCCTTTCGTTTGATTTGTTTTGCTTTGG
TTCTTCAATTCATTTCACTTCAGTTCTATTCTTTTCTCTTCTGTTCCATTCCTTTCCCTTCCGTTCGATTCGTTTCGCTTCGG
TTGTTGAATTGATTTGACTTGAGTTGTATTGTTTTGTCTTGTGTTGCATTGCTTTGCCTTGCGTTGGATTGGTTTGGCTTGGG
TCATCAAATCAATTCAACTCAAGTCATATCATTTCATCTCATGTCACATCACTTCACCTCACGTCAGATCAGTTCAGCTCAGG
TCTTCTAATCTATTCTACTCTAGTCTTATCTTTTCTTCTCTTGTCTCATCTCTTCTCCTCTCGTCTGATCTGTTCTGCTCTGG
TCCTCCAATCCATTCCACTCCAGTCCTATCCTTTCCTCTCCTGTCCCATCCCTTCCCCTCCCGTCCGATCCGTTCCGCTCCGG
TCGTCGAATCGATTCGACTCGAGTCGTATCGTTTCGTCTCGTGTCGCATCGCTTCGCCTCGCGTCGGATCGGTTCGGCTCGGG
TGATGAAATGAATTGAACTGAAGTGATATGATTTGATCTGATGTGACATGACTTGACCTGACGTGAGATGAGTTGAGCTGAGG
TGTTGTAATGTATTGTACTGTAGTGTTATGTTTTGTTCTGTTGTGTCATGTCTTGTCCTGTCGTGTGATGTGTTGTGCTGTGG
TGCTGCAATGCATTGCACTGCAGTGCTATGCTTTGCTCTGCTGTGCCATGCCTTGCCCTGCCGTGCGATGCGTTGCGCTGCGG
TGGTGGAATGGATTGGACTGGAGTGGTATGGTTTGGTCTGGTGTGGCATGGCTTGGCCTGGCGTGGGATGGGTTGGGCTGGGG
CAACAAAACAAATCAAACCAAAGCAATACAATTCAATCCAATGCAACACAACTCAACCCAACGCAAGACAAGTCAAGCCAAGG
CATCATAACATATCATACCATAGCATTACATTTCATTCCATTGCATCACATCTCATCCCATCGCATGACATGTCATGCCATGG
CACCACAACACATCACACCACAGCACTACACTTCACTCCACTGCACCACACCTCACCCCACCGCACGACACGTCACGCCACGG
CAGCAGAACAGATCAGACCAGAGCAGTACAGTTCAGTCCAGTGCAGCACAGCTCAGCCCAGCGCAGGACAGGTCAGGCCAGGG
CTACTAAACTAATCTAACCTAAGCTATACTATTCTATCCTATGCTACACTACTCTACCCTACGCTAGACTAGTCTAGCCTAGG
CTTCTTAACTTATCTTACCTTAGCTTTACTTTTCTTTCCTTTGCTTCACTTCTCTTCCCTTCGCTTGACTTGTCTTGCCTTGG
CTCCTCAACTCATCTCACCTCAGCTCTACTCTTCTCTCCTCTGCTCCACTCCTCTCCCCTCCGCTCGACTCGTCTCGCCTCGG
CTGCTGAACTGATCTGACCTGAGCTGTACTGTTCTGTCCTGTGCTGCACTGCTCTGCCCTGCGCTGGACTGGTCTGGCCTGGG
CCACCAAACCAATCCAACCCAAGCCATACCATTCCATCCCATGCCACACCACTCCACCCCACGCCAGACCAGTCCAGCCCAGG
CCTCCTAACCTATCCTACCCTAGCCTTACCTTTCCTTCCCTTGCCTCACCTCTCCTCCCCTCGCCTGACCTGTCCTGCCCTGG
CCCCCCAACCCATCCCACCCCAGCCCTACCCTTCCCTCCCCTGCCCCACCCCTCCCCCCCCCGCCCGACCCGTCCCGCCCCGG
CCGCCGAACCGATCCGACCCGAGCCGTACCGTTCCGTCCCGTGCCGCACCGCTCCGCCCCGCGCCGGACCGGTCCGGCCCGGG
CGACGAAACGAATCGAACCGAAGCGATACGATTCGATCCGATGCGACACGACTCGACCCGACGCGAGACGAGTCGAGCCGAGG
CGTCGTAACGTATCGTACCGTAGCGTTACGTTTCGTTCCGTTGCGTCACGTCTCGTCCCGTCGCGTGACGTGTCGTGCCGTGG
CGCCGCAACGCATCGCACCGCAGCGCTACGCTTCGCTCCGCTGCGCCACGCCTCGCCCCGCCGCGCGACGCGTCGCGCCGCGG
CGGCGGAACGGATCGGACCGGAGCGGTACGGTTCGGTCCGGTGCGGCACGGCTCGGCCCGGCGCGGGACGGGTCGGGCCGGGG
GAAGAAAAGAAATGAAACGAAAGGAATAGAATTGAATCGAATGGAACAGAACTGAACCGAACGGAAGAGAAGTGAAGCGAAGG
GATGATAAGATATGATACGATAGGATTAGATTTGATTCGATTGGATCAGATCTGATCCGATCGGATGAGATGTGATGCGATGG
GACGACAAGACATGACACGACAGGACTAGACTTGACTCGACTGGACCAGACCTGACCCGACCGGACGAGACGTGACGCGACGG
GAGGAGAAGAGATGAGACGAGAGGAGTAGAGTTGAGTCGAGTGGAGCAGAGCTGAGCCGAGCGGAGGAGAGGTGAGGCGAGGG
GTAGTAAAGTAATGTAACGTAAGGTATAGTATTGTATCGTATGGTACAGTACTGTACCGTACGGTAGAGTAGTGTAGCGTAGG
GTTGTTAAGTTATGTTACGTTAGGTTTAGTTTTGTTTCGTTTGGTTCAGTTCTGTTCCGTTCGGTTGAGTTGTGTTGCGTTGG
GTCGTCAAGTCATGTCACGTCAGGTCTAGTCTTGTCTCGTCTGGTCCAGTCCTGTCCCGTCCGGTCGAGTCGTGTCGCGTCGG
GTGGTGAAGTGATGTGACGTGAGGTGTAGTGTTGTGTCGTGTGGTGCAGTGCTGTGCCGTGCGGTGGAGTGGTGTGGCGTGGG
GCAGCAAAGCAATGCAACGCAAGGCATAGCATTGCATCGCATGGCACAGCACTGCACCGCACGGCAGAGCAGTGCAGCGCAGG
GCTGCTAAGCTATGCTACGCTAGGCTTAGCTTTGCTTCGCTTGGCTCAGCTCTGCTCCGCTCGGCTGAGCTGTGCTGCGCTGG
GCCGCCAAGCCATGCCACGCCAGGCCTAGCCTTGCCTCGCCTGGCCCAGCCCTGCCCCGCCCGGCCGAGCCGTGCCGCGCCGG
GCGGCGAAGCGATGCGACGCGAGGCGTAGCGTTGCGTCGCGTGGCGCAGCGCTGCGCCGCGCGGCGGAGCGGTGCGGCGCGGG
GGAGGAAAGGAATGGAACGGAAGGGATAGGATTGGATCGGATGGGACAGGACTGGACCGGACGGGAGAGGAGTGGAGCGGAGG
GGTGGTAAGGTATGGTACGGTAGGGTTAGGTTTGGTTCGGTTGGGTCAGGTCTGGTCCGGTCGGGTGAGGTGTGGTGCGGTGG
GGCGGCAAGGCATGGCACGGCAGGGCTAGGCTTGGCTCGGCTGGGCCAGGCCTGGCCCGGCCGGGCGAGGCGTGGCGCGGCGG
GGGGGGAAGGGATGGGACGGGAGGGGTAGGGTTGGGTCGGGTGGGGCAGGGCTGGGCCGGGCGGGGGAGGGGTGGGGCGGGGG
+
+ +
+
Sampling rate: 1 / 20
+ + + + + + + + + + + + + + +
overrepresented sequencecount (% of bases)distribution: cycle 1 ~ cycle 46
ATTCTCGGGTGCCAAGGAAC20 (0.039665%)
CAAGGAACTCCAGTCACGCC45 (0.089246%)
CGCTGCGATCTATTGAAAGTCAGCCCTCGACACAAGGGTT28 (0.111062%)
CGGGTGCCAAGGAACTCCAG16 (0.031732%)
CTCGGGTGCCAAGGAACTCC18 (0.035698%)
GCCAAGGAACTCCAGTCACG12 (0.023799%)
GGGTGCCAAGGAACTCCAGT33 (0.065447%)
GGTGCCAAGGAACTCCAGTC43 (0.085280%)
TCGGGTGCCAAGGAACTCCA19 (0.037682%)
TCTCGGGTGCCAAGGAACTC51 (0.101146%)
TTCTCGGGTGCCAAGGAACT18 (0.035698%)
TTTTTTTTTTTTTTTTTTTT248 (0.491845%)
+
+ +
+
+ +

+ \ No newline at end of file diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.json b/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.json new file mode 100644 index 0000000..40f439f --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.json @@ -0,0 +1,751 @@ +{ + "summary": { + "before_filtering": { + "total_reads":1000000, + "total_bases":45253865, + "q20_bases":43967646, + "q30_bases":43636909, + "q20_rate":0.971578, + "q30_rate":0.964269, + "read1_mean_length":45, + "gc_content":0.544178 + }, + "after_filtering": { + "total_reads":464390, + "total_bases":20168949, + "q20_bases":19734976, + "q30_bases":19583954, + "q20_rate":0.978483, + "q30_rate":0.970995, + "read1_mean_length":43, + "gc_content":0.532957 + } + }, + "filtering_result": { + "passed_filter_reads": 464390, + "low_quality_reads": 13053, + "too_many_N_reads": 8, + "too_short_reads": 522549, + "too_long_reads": 0 + }, + "duplication": { + "rate": 0.251153, + "histogram": [503744,11613,7640,5342,3985,2834,2119,1608,1219,874,732,569,470,397,319,258,236,179,165,125,108,103,83,65,57,48,44,43,34,28,239], + "mean_gc": [0.480399,0.47184,0.467902,0.467371,0.464419,0.464187,0.462171,0.467381,0.465097,0.465648,0.461261,0.46912,0.45627,0.457875,0.47601,0.462183,0.452193,0.47061,0.474819,0.466855,0.445534,0.455968,0.464068,0.484163,0.466323,0.479167,0.455704,0.431555,0.458131,0.465266,0.457216] + }, + "adapter_cutting": { + "adapter_trimmed_reads": 598996, + "adapter_trimmed_bases": 20040284, + "read1_adapter_sequence": "TGGAATTCTCGGGTGCCAAGG", + "read1_adapter_counts": {"TGGAATTCTCGGGTGCC":13226, "TGGAATTCTCGGGTGCCA":9608, "TGGAATTCTCGGGTGCCAAGG":10676, "TGGAATTCTCGGGTGCCAAGGAACTCC":7569, "TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":8351, "TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCA":383032, "TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAAT":18321, "others":148213} + }, + "read1_before_filtering": { 
+ "total_reads": 1000000, + "total_bases": 45253865, + "q20_bases": 43967646, + "q30_bases": 43636909, + "total_cycles": 46, + "quality_curves": { + "A":[30.825,31.2657,31.4639,31.4486,31.4701,35.1037,35.0473,35.0462,29.2943,34.3377,35.4184,35.0449,35.0191,34.7388,34.8436,34.855,34.8743,35.0673,35.4237,35.407,35.4116,35.1336,35.3507,34.991,35.0165,35.1529,35.1368,35.248,35.2393,35.1017,35.0664,34.8135,34.7857,34.7447,34.8149,35.0829,35.1229,34.7796,34.6786,34.8595,34.783,34.7735,34.6327,33.8781,35.1474,34.6511], + "T":[31.5874,31.6519,31.7482,31.7856,31.7864,35.2857,35.2607,35.2695,34.93,35.4955,35.5748,35.463,35.3973,35.2732,35.2216,35.4814,35.3842,35.4626,35.5958,35.6074,35.6002,35.5315,35.5951,35.4782,35.4045,35.3112,35.3535,35.5531,35.538,35.3607,35.347,35.3206,35.392,35.3024,35.2876,35.288,35.469,35.4355,35.2452,35.2433,35.1889,34.8702,35.0138,35.1155,35.1567,34.7483], + "C":[31.6474,31.6396,31.7754,31.8059,31.8197,35.382,35.3296,35.3459,35.7253,35.6156,35.6202,35.6008,35.5569,35.6171,35.4983,35.3351,35.4399,35.4535,35.6175,35.5962,35.5956,35.5052,35.562,35.5001,35.4802,35.4274,35.4352,35.4928,35.4697,35.3313,35.2862,35.3946,35.2492,35.3685,35.362,35.305,35.3884,35.2054,35.3247,35.0867,35.2704,34.7263,35.0728,35.0187,34.9901,34.5319], + "G":[30.994,31.4042,31.6051,31.6401,31.6655,35.1563,35.0983,35.1032,35.1066,34.7424,35.1775,35.3779,35.3458,35.4484,35.3565,35.4238,35.3309,35.41,35.4303,35.4535,35.4857,35.4385,35.4623,35.4419,35.3814,35.2563,35.2569,35.4561,35.3765,35.2343,35.2151,35.2878,35.2521,35.2882,35.2342,35.1407,35.3602,35.1777,35.1811,35.0089,35.0724,34.8611,34.8826,34.8794,34.8105,34.685], + 
"mean":[31.0743,31.2937,31.4543,31.485,31.5004,35.02,34.9692,34.971,34.4948,34.7427,35.1241,35.0241,34.9936,35.0753,35.0282,35.1029,35.1163,35.1861,35.2667,35.274,35.2923,35.242,35.2635,35.199,35.1668,35.0124,35.0116,35.2275,35.1745,34.9748,34.9478,35.071,35.0514,35.0506,35.0396,35.1553,35.3492,35.2677,35.1959,34.9697,35.16,34.8248,34.9741,34.8271,35.0771,34.6511] + }, + "content_curves": { + "A":[0.261857,0.268095,0.211688,0.201831,0.201457,0.188008,0.204672,0.212125,0.065411,0.092826,0.141561,0.567159,0.552973,0.11433,0.113464,0.106365,0.107895,0.120065,0.110839,0.122845,0.125413,0.137053,0.140272,0.145609,0.15958,0.574379,0.559845,0.146432,0.154294,0.568209,0.552768,0.142272,0.133742,0.131647,0.144863,0.542575,0.132827,0.136031,0.149721,0.548434,0.138468,0.135123,0.143501,0.16222,0.566997,0], + "T":[0.241992,0.263236,0.214082,0.21003,0.212406,0.220349,0.234819,0.281611,0.556641,0.160461,0.147852,0.123155,0.164815,0.588928,0.584086,0.150477,0.557249,0.133987,0.133518,0.128561,0.145741,0.550515,0.128334,0.122974,0.126179,0.129226,0.127741,0.130084,0.132402,0.131377,0.134164,0.151636,0.548358,0.14436,0.136034,0.134135,0.149881,0.541099,0.13293,0.124204,0.126291,0.131345,0.130483,0.129414,0.137012,0.380379], + "C":[0.272413,0.215889,0.229589,0.264438,0.257089,0.260867,0.243539,0.190242,0.12897,0.146404,0.152226,0.177508,0.16473,0.158349,0.166135,0.606719,0.179492,0.580986,0.161947,0.15555,0.143524,0.142381,0.160735,0.581664,0.566584,0.144489,0.141001,0.140405,0.14183,0.146175,0.166327,0.565839,0.176678,0.580319,0.570976,0.161966,0.15826,0.173377,0.570445,0.175035,0.571023,0.182177,0.581659,0.575252,0.161032,0.363115], + 
"G":[0.216942,0.246339,0.338201,0.317263,0.32261,0.324327,0.31053,0.309584,0.24254,0.593871,0.551922,0.125738,0.111041,0.13195,0.129873,0.129998,0.148925,0.158524,0.587255,0.586595,0.578856,0.163609,0.564198,0.143307,0.141209,0.1454,0.164931,0.576626,0.565019,0.147773,0.14028,0.133802,0.134769,0.137225,0.141684,0.16131,0.559023,0.149488,0.146892,0.152326,0.164216,0.551324,0.144345,0.133098,0.134935,0.256432], + "N":[0.006796,0.006441,0.00644,0.006438,0.006438,0.006449,0.00644,0.006438,0.006438,0.006438,0.006439,0.00644,0.006441,0.006443,0.006442,0.006441,0.006439,0.006438,0.006441,0.006449,0.006466,0.006442,0.006461,0.006446,0.006448,0.006506,0.006482,0.006453,0.006455,0.006466,0.006461,0.006451,0.006453,0.006449,0.006443,1.41045e-05,9.07335e-06,5.04096e-06,1.3107e-05,1.00829e-06,2.01848e-06,3.23089e-05,1.21308e-05,1.62851e-05,2.42268e-05,7.40712e-05], + "GC":[0.489355,0.462228,0.56779,0.581701,0.579699,0.585194,0.554069,0.499826,0.37151,0.740275,0.704148,0.303246,0.275771,0.290299,0.296008,0.736717,0.328417,0.73951,0.749202,0.742145,0.72238,0.30599,0.724933,0.724971,0.707793,0.289889,0.305932,0.717031,0.706849,0.293948,0.306607,0.699641,0.311447,0.717544,0.71266,0.323276,0.717284,0.322865,0.717337,0.327361,0.735239,0.7335,0.726004,0.70835,0.295967,0.619547] + }, + "kmer_count": { + "AAAAA":55223, "AAAAT":34547, "AAAAC":26093, "AAAAG":24400, "AAATA":26058, "AAATT":25637, "AAATC":18572, "AAATG":29168, "AAACA":26239, "AAACT":21408, "AAACC":21078, "AAACG":6935, "AAAGA":23650, "AAAGT":22271, "AAAGC":19974, "AAAGG":23804, + "AATAA":22554, "AATAT":39601, "AATAC":15859, "AATAG":11613, "AATTA":19326, "AATTT":27811, "AATTC":623965, "AATTG":23892, "AATCA":16603, "AATCT":18208, "AATCC":20428, "AATCG":4840, "AATGA":17077, "AATGT":17382, "AATGC":16201, "AATGG":50661, + "AACAA":24529, "AACAT":18198, "AACAC":19622, "AACAG":18920, "AACTA":14428, "AACTT":17160, "AACTC":514868, "AACTG":21875, "AACCA":19983, "AACCT":20429, "AACCC":23604, "AACCG":7369, "AACGA":6285, "AACGT":6011, 
"AACGC":8988, "AACGG":7871, + "AAGAA":25562, "AAGAT":16706, "AAGAC":16120, "AAGAG":23269, "AAGTA":14183, "AAGTT":19949, "AAGTC":17665, "AAGTG":29348, "AAGCA":23623, "AAGCT":22927, "AAGCC":23918, "AAGCG":14431, "AAGGA":531891, "AAGGT":22558, "AAGGC":26058, "AAGGG":28271, + "ATAAA":23041, "ATAAT":17541, "ATAAC":10379, "ATAAG":11653, "ATATA":14677, "ATATT":19068, "ATATC":25228, "ATATG":19492, "ATACA":16417, "ATACT":14478, "ATACC":11131, "ATACG":4725, "ATAGA":9628, "ATAGT":12225, "ATAGC":11536, "ATAGG":12116, + "ATTAA":16504, "ATTAT":19022, "ATTAC":14205, "ATTAG":13156, "ATTTA":18483, "ATTTT":38693, "ATTTC":20098, "ATTTG":29632, "ATTCA":17817, "ATTCT":624362, "ATTCC":19174, "ATTCG":6470, "ATTGA":11418, "ATTGT":16318, "ATTGC":15796, "ATTGG":53560, + "ATCAA":15276, "ATCAT":15347, "ATCAC":17368, "ATCAG":13777, "ATCTA":12542, "ATCTT":19714, "ATCTC":29642, "ATCTG":21763, "ATCCA":20685, "ATCCT":21763, "ATCCC":24738, "ATCCG":8628, "ATCGA":5135, "ATCGT":5789, "ATCGC":8481, "ATCGG":7533, + "ATGAA":17596, "ATGAT":15870, "ATGAC":11553, "ATGAG":18127, "ATGTA":12373, "ATGTT":21508, "ATGTC":12858, "ATGTG":28088, "ATGCA":18294, "ATGCT":22940, "ATGCC":24502, "ATGCG":10366, "ATGGA":142965, "ATGGT":22893, "ATGGC":26245, "ATGGG":26040, + "ACAAA":28007, "ACAAT":17123, "ACAAC":15826, "ACAAG":19276, "ACATA":13069, "ACATT":19079, "ACATC":12613, "ACATG":25005, "ACACA":27195, "ACACT":19838, "ACACC":24140, "ACACG":10568, "ACAGA":22069, "ACAGT":24702, "ACAGC":26533, "ACAGG":27635, + "ACTAA":13820, "ACTAT":12654, "ACTAC":14681, "ACTAG":11335, "ACTTA":12028, "ACTTT":24353, "ACTTC":17640, "ACTTG":24807, "ACTCA":22583, "ACTCT":18707, "ACTCC":517231, "ACTCG":9284, "ACTGA":15887, "ACTGT":16278, "ACTGC":23006, "ACTGG":48473, + "ACCAA":20122, "ACCAT":21096, "ACCAC":25951, "ACCAG":24166, "ACCTA":13759, "ACCTT":22101, "ACCTC":27126, "ACCTG":32295, "ACCCA":27110, "ACCCT":24968, "ACCCC":28194, "ACCCG":18332, "ACCGA":7521, "ACCGT":8368, "ACCGC":17069, "ACCGG":11654, + "ACGAA":6843, "ACGAT":20578, 
"ACGAC":8975, "ACGAG":9792, "ACGTA":4394, "ACGTT":7984, "ACGTC":6315, "ACGTG":14559, "ACGCA":12090, "ACGCT":13007, "ACGCC":486805, "ACGCG":11420, "ACGGA":8816, "ACGGT":9158, "ACGGC":13456, "ACGGG":14626, + "AGAAA":28481, "AGAAT":20511, "AGAAC":15969, "AGAAG":22725, "AGATA":12273, "AGATT":16743, "AGATC":12925, "AGATG":24966, "AGACA":20304, "AGACT":17163, "AGACC":17802, "AGACG":9072, "AGAGA":25988, "AGAGT":23849, "AGAGC":24036, "AGAGG":32410, + "AGTAA":14291, "AGTAT":13368, "AGTAC":10385, "AGTAG":14769, "AGTTA":12904, "AGTTT":23612, "AGTTC":22459, "AGTTG":26660, "AGTCA":496175, "AGTCT":17780, "AGTCC":32302, "AGTCG":5872, "AGTGA":20525, "AGTGT":19842, "AGTGC":26523, "AGTGG":57726, + "AGCAA":23871, "AGCAT":22486, "AGCAC":24521, "AGCAG":32373, "AGCTA":21541, "AGCTT":26593, "AGCTC":25117, "AGCTG":44345, "AGCCA":34228, "AGCCT":43454, "AGCCC":37988, "AGCCG":21193, "AGCGA":15462, "AGCGT":14175, "AGCGC":24410, "AGCGG":20103, + "AGGAA":535101, "AGGAT":20197, "AGGAC":17593, "AGGAG":38355, "AGGTA":15526, "AGGTT":25980, "AGGTC":17425, "AGGTG":45422, "AGGCA":36601, "AGGCT":42129, "AGGCC":36280, "AGGCG":28626, "AGGGA":28642, "AGGGT":32152, "AGGGC":37922, "AGGGG":41749, + "TAAAA":27778, "TAAAT":20682, "TAAAC":13474, "TAAAG":15264, "TAATA":13511, "TAATT":20492, "TAATC":14672, "TAATG":19704, "TAACA":14478, "TAACT":14082, "TAACC":11400, "TAACG":5651, "TAAGA":13680, "TAAGT":14465, "TAAGC":13743, "TAAGG":16143, + "TATAA":14337, "TATAT":16827, "TATAC":10003, "TATAG":9624, "TATTA":13635, "TATTT":29547, "TATTC":13982, "TATTG":22622, "TATCA":11503, "TATCT":26182, "TATCC":13128, "TATCG":5363, "TATGA":11739, "TATGT":17052, "TATGC":14576, "TATGG":43613, + "TACAA":16263, "TACAT":15489, "TACAC":14312, "TACAG":25831, "TACTA":12355, "TACTT":17867, "TACTC":14971, "TACTG":21174, "TACCA":17699, "TACCT":19845, "TACCC":15186, "TACCG":7405, "TACGA":4960, "TACGT":6625, "TACGC":7389, "TACGG":7970, + "TAGAA":14700, "TAGAT":11129, "TAGAC":8577, "TAGAG":15821, "TAGTA":10838, "TAGTT":15873, "TAGTC":12728, 
"TAGTG":20570, "TAGCA":16185, "TAGCT":21497, "TAGCC":18677, "TAGCG":10544, "TAGGA":16220, "TAGGT":15912, "TAGGC":18717, "TAGGG":22573, + "TTAAA":23479, "TTAAT":17769, "TTAAC":12487, "TTAAG":14941, "TTATA":13954, "TTATT":23785, "TTATC":13594, "TTATG":21307, "TTACA":18116, "TTACT":16411, "TTACC":16110, "TTACG":5529, "TTAGA":12568, "TTAGT":15507, "TTAGC":16581, "TTAGG":17875, + "TTTAA":23308, "TTTAT":23585, "TTTAC":15439, "TTTAG":16446, "TTTTA":26959, "TTTTT":231675, "TTTTC":27328, "TTTTG":39731, "TTTCA":24188, "TTTCT":31504, "TTTCC":28384, "TTTCG":7712, "TTTGA":18362, "TTTGT":26430, "TTTGC":21832, "TTTGG":74998, + "TTCAA":21767, "TTCAT":19897, "TTCAC":21650, "TTCAG":23750, "TTCTA":23129, "TTCTT":29817, "TTCTC":621418, "TTCTG":31953, "TTCCA":27216, "TTCCT":32195, "TTCCC":32474, "TTCCG":11498, "TTCGA":5785, "TTCGT":6808, "TTCGC":9169, "TTCGG":10691, + "TTGAA":19443, "TTGAT":14643, "TTGAC":10795, "TTGAG":20864, "TTGTA":14447, "TTGTT":25042, "TTGTC":14986, "TTGTG":31359, "TTGCA":20299, "TTGCT":27827, "TTGCC":27693, "TTGCG":12518, "TTGGA":190740, "TTGGT":23636, "TTGGC":27637, "TTGGG":34514, + "TCAAA":21635, "TCAAT":15900, "TCAAC":13325, "TCAAG":21810, "TCATA":10935, "TCATT":18985, "TCATC":14051, "TCATG":21195, "TCACA":22013, "TCACT":25706, "TCACC":29214, "TCACG":486803, "TCAGA":20852, "TCAGT":18958, "TCAGC":26909, "TCAGG":25283, + "TCTAA":13321, "TCTAT":14094, "TCTAC":18502, "TCTAG":16086, "TCTTA":13142, "TCTTT":25929, "TCTTC":23016, "TCTTG":27552, "TCTCA":24185, "TCTCT":30903, "TCTCC":35162, "TCTCG":607957, "TCTGA":18462, "TCTGT":21662, "TCTGC":24477, "TCTGG":49989, + "TCCAA":20838, "TCCAT":22773, "TCCAC":25455, "TCCAG":513119, "TCCTA":14944, "TCCTT":27608, "TCCTC":31884, "TCCTG":39675, "TCCCA":42000, "TCCCT":34791, "TCCCC":38837, "TCCCG":24627, "TCCGA":22098, "TCCGT":9428, "TCCGC":20715, "TCCGG":16127, + "TCGAA":5968, "TCGAT":5653, "TCGAC":5195, "TCGAG":9898, "TCGTA":3647, "TCGTT":6922, "TCGTC":5945, "TCGTG":12590, "TCGCA":7880, "TCGCT":12952, "TCGCC":16597, 
"TCGCG":10913, "TCGGA":7559, "TCGGT":9723, "TCGGC":16094, "TCGGG":594588, + "TGAAA":21250, "TGAAT":20718, "TGAAC":15120, "TGAAG":16829, "TGATA":10203, "TGATT":16868, "TGATC":15183, "TGATG":21972, "TGACA":16981, "TGACT":17134, "TGACC":16566, "TGACG":6636, "TGAGA":21180, "TGAGT":19474, "TGAGC":25151, "TGAGG":31037, + "TGTAA":17462, "TGTAT":15760, "TGTAC":10890, "TGTAG":13266, "TGTTA":14013, "TGTTT":27543, "TGTTC":17401, "TGTTG":30744, "TGTCA":15963, "TGTCT":22150, "TGTCC":19858, "TGTCG":7704, "TGTGA":18777, "TGTGT":29039, "TGTGC":25742, "TGTGG":60798, + "TGCAA":22414, "TGCAT":20466, "TGCAC":23073, "TGCAG":31715, "TGCTA":17184, "TGCTT":29990, "TGCTC":23124, "TGCTG":48445, "TGCCA":569786, "TGCCT":46856, "TGCCC":38195, "TGCCG":17740, "TGCGA":9969, "TGCGT":14682, "TGCGC":20502, "TGCGG":18676, + "TGGAA":630261, "TGGAT":18801, "TGGAC":14118, "TGGAG":29964, "TGGTA":15609, "TGGTT":25359, "TGGTC":19268, "TGGTG":44823, "TGGCA":30392, "TGGCT":38145, "TGGCC":38538, "TGGCG":25253, "TGGGA":35090, "TGGGT":34602, "TGGGC":39605, "TGGGG":44512, + "CAAAA":30274, "CAAAT":22830, "CAAAC":19560, "CAAAG":25664, "CAATA":35724, "CAATT":20014, "CAATC":13655, "CAATG":28557, "CAACA":22947, "CAACT":19283, "CAACC":20281, "CAACG":8767, "CAAGA":22122, "CAAGT":25093, "CAAGC":25237, "CAAGG":550432, + "CATAA":12822, "CATAT":14999, "CATAC":11189, "CATAG":11331, "CATTA":13818, "CATTT":29453, "CATTC":18526, "CATTG":28533, "CATCA":15777, "CATCT":23093, "CATCC":20968, "CATCG":6711, "CATGA":16582, "CATGT":22666, "CATGC":23507, "CATGG":71747, + "CACAA":22440, "CACAT":22299, "CACAC":29770, "CACAG":30120, "CACTA":14297, "CACTT":27185, "CACTC":26548, "CACTG":40540, "CACCA":35285, "CACCT":36196, "CACCC":36291, "CACCG":18778, "CACGA":11889, "CACGT":12885, "CACGC":492868, "CACGG":17097, + "CAGAA":23939, "CAGAT":17269, "CAGAC":19700, "CAGAG":33691, "CAGTA":14132, "CAGTT":22652, "CAGTC":506526, "CAGTG":38787, "CAGCA":35685, "CAGCT":42816, "CAGCC":57090, "CAGCG":26376, "CAGGA":33839, "CAGGT":31441, "CAGGC":46277, 
"CAGGG":40292, + "CTAAA":16827, "CTAAT":15245, "CTAAC":11706, "CTAAG":14620, "CTATA":10862, "CTATT":18412, "CTATC":10905, "CTATG":23234, "CTACA":23601, "CTACT":21892, "CTACC":18373, "CTACG":8850, "CTAGA":13645, "CTAGT":15658, "CTAGC":18000, "CTAGG":22301, + "CTTAA":14475, "CTTAT":14705, "CTTAC":13391, "CTTAG":14010, "CTTTA":16602, "CTTTT":31572, "CTTTC":24367, "CTTTG":37161, "CTTCA":22202, "CTTCT":29743, "CTTCC":35151, "CTTCG":9888, "CTTGA":18180, "CTTGT":21766, "CTTGC":22475, "CTTGG":79049, + "CTCAA":23273, "CTCAT":17524, "CTCAC":28900, "CTCAG":35229, "CTCTA":15415, "CTCTT":23526, "CTCTC":32425, "CTCTG":39660, "CTCCA":517020, "CTCCT":40820, "CTCCC":54346, "CTCCG":23761, "CTCGA":9143, "CTCGT":13678, "CTCGC":18205, "CTCGG":605004, + "CTGAA":20060, "CTGAT":14406, "CTGAC":19167, "CTGAG":29895, "CTGTA":18034, "CTGTT":20631, "CTGTC":20668, "CTGTG":30926, "CTGCA":30993, "CTGCT":33104, "CTGCC":44179, "CTGCG":19441, "CTGGA":203162, "CTGGT":23048, "CTGGC":35364, "CTGGG":48625, + "CCAAA":26595, "CCAAT":76192, "CCAAC":21988, "CCAAG":551449, "CCATA":12671, "CCATT":26993, "CCATC":23226, "CCATG":45459, "CCACA":29577, "CCACT":35715, "CCACC":43528, "CCACG":20792, "CCAGA":26927, "CCAGT":507843, "CCAGC":56072, "CCAGG":51305, + "CCTAA":13607, "CCTAT":15887, "CCTAC":16811, "CCTAG":18100, "CCTTA":14433, "CCTTT":29102, "CCTTC":29351, "CCTTG":41094, "CCTCA":31312, "CCTCT":36045, "CCTCC":53848, "CCTCG":20018, "CCTGA":23658, "CCTGT":27633, "CCTGC":37128, "CCTGG":104012, + "CCCAA":30597, "CCCAT":29310, "CCCAC":39158, "CCCAG":59554, "CCCTA":15899, "CCCTT":30632, "CCCTC":35685, "CCCTG":54282, "CCCCA":45826, "CCCCT":37133, "CCCCC":41599, "CCCCG":36870, "CCCGA":16688, "CCCGT":18103, "CCCGC":45255, "CCCGG":38317, + "CCGAA":7795, "CCGAT":7933, "CCGAC":27348, "CCGAG":21329, "CCGTA":5029, "CCGTT":11320, "CCGTC":12724, "CCGTG":24105, "CCGCA":19706, "CCGCT":29088, "CCGCC":54750, "CCGCG":32697, "CCGGA":15047, "CCGGT":13067, "CCGGC":31844, "CCGGG":34316, + "CGAAA":6689, "CGAAT":6613, "CGAAC":7514, 
"CGAAG":9458, "CGATA":4113, "CGATT":8487, "CGATC":21759, "CGATG":15070, "CGACA":11568, "CGACT":9424, "CGACC":12210, "CGACG":22922, "CGAGA":12362, "CGAGT":13367, "CGAGC":17015, "CGAGG":24021, + "CGTAA":4222, "CGTAT":5837, "CGTAC":4800, "CGTAG":6245, "CGTTA":6142, "CGTTT":11239, "CGTTC":8845, "CGTTG":17667, "CGTCA":6863, "CGTCT":11002, "CGTCC":13963, "CGTCG":6103, "CGTGA":9959, "CGTGT":12995, "CGTGC":16397, "CGTGG":50964, + "CGCAA":11871, "CGCAT":13332, "CGCAC":14783, "CGCAG":24019, "CGCTA":10566, "CGCTT":20057, "CGCTC":21746, "CGCTG":36683, "CGCCA":483325, "CGCCT":32660, "CGCCC":41085, "CGCCG":34813, "CGCGA":11953, "CGCGT":15911, "CGCGC":35171, "CGCGG":28951, + "CGGAA":11198, "CGGAT":8250, "CGGAC":11312, "CGGAG":18027, "CGGTA":6818, "CGGTT":10985, "CGGTC":10507, "CGGTG":24345, "CGGCA":16149, "CGGCT":22832, "CGGCC":32538, "CGGCG":29175, "CGGGA":20815, "CGGGT":593174, "CGGGC":29840, "CGGGG":33211, + "GAAAA":24817, "GAAAT":20739, "GAAAC":17062, "GAAAG":18556, "GAATA":11875, "GAATT":632098, "GAATC":12606, "GAATG":20121, "GAACA":14577, "GAACT":517081, "GAACC":17164, "GAACG":6109, "GAAGA":18206, "GAAGT":16535, "GAAGC":20610, "GAAGG":22629, + "GATAA":10292, "GATAT":10852, "GATAC":7996, "GATAG":9691, "GATTA":13160, "GATTT":18560, "GATTC":14862, "GATTG":17938, "GATCA":16046, "GATCT":19389, "GATCC":20140, "GATCG":8837, "GATGA":14331, "GATGT":14664, "GATGC":15659, "GATGG":47696, + "GACAA":14671, "GACAT":12196, "GACAC":16969, "GACAG":20999, "GACTA":9495, "GACTT":15660, "GACTC":16682, "GACTG":19882, "GACCA":15374, "GACCT":17618, "GACCC":22910, "GACCG":9949, "GACGA":20673, "GACGT":6288, "GACGC":12396, "GACGG":11805, + "GAGAA":22236, "GAGAT":18674, "GAGAC":20606, "GAGAG":28092, "GAGTA":11847, "GAGTT":24827, "GAGTC":18574, "GAGTG":30765, "GAGCA":21670, "GAGCT":26523, "GAGCC":33293, "GAGCG":18176, "GAGGA":28457, "GAGGT":29950, "GAGGC":46973, "GAGGG":38666, + "GTAAA":12736, "GTAAT":16401, "GTAAC":9856, "GTAAG":11491, "GTATA":9543, "GTATT":16492, "GTATC":8714, "GTATG":17992, 
"GTACA":11164, "GTACT":12562, "GTACC":10782, "GTACG":5520, "GTAGA":11338, "GTAGT":13090, "GTAGC":14100, "GTAGG":13180, + "GTTAA":11897, "GTTAT":13401, "GTTAC":11243, "GTTAG":12784, "GTTTA":13873, "GTTTT":27741, "GTTTC":18673, "GTTTG":29490, "GTTCA":20459, "GTTCT":24846, "GTTCC":17722, "GTTCG":6865, "GTTGA":13305, "GTTGT":18113, "GTTGC":20166, "GTTGG":62287, + "GTCAA":11429, "GTCAT":12405, "GTCAC":496745, "GTCAG":16697, "GTCTA":10352, "GTCTT":17742, "GTCTC":23908, "GTCTG":22926, "GTCCA":16680, "GTCCT":20258, "GTCCC":26742, "GTCCG":24885, "GTCGA":5492, "GTCGT":5262, "GTCGC":10196, "GTCGG":9079, + "GTGAA":16405, "GTGAT":18835, "GTGAC":15708, "GTGAG":25447, "GTGTA":11527, "GTGTT":21718, "GTGTC":16644, "GTGTG":41502, "GTGCA":25764, "GTGCT":32606, "GTGCC":588167, "GTGCG":18233, "GTGGA":156486, "GTGGT":32497, "GTGGC":36802, "GTGGG":37668, + "GCAAA":20015, "GCAAT":20263, "GCAAC":18741, "GCAAG":23343, "GCATA":11407, "GCATT":22219, "GCATC":15223, "GCATG":37750, "GCACA":21639, "GCACT":26638, "GCACC":26735, "GCACG":15202, "GCAGA":22284, "GCAGT":28911, "GCAGC":45041, "GCAGG":40406, + "GCTAA":15136, "GCTAT":18284, "GCTAC":19990, "GCTAG":18093, "GCTTA":13983, "GCTTT":27592, "GCTTC":25286, "GCTTG":43193, "GCTCA":25633, "GCTCT":26900, "GCTCC":38171, "GCTCG":14899, "GCTGA":24042, "GCTGT":23618, "GCTGC":38663, "GCTGG":105129, + "GCCAA":601690, "GCCAT":33211, "GCCAC":36026, "GCCAG":36611, "GCCTA":17279, "GCCTT":32597, "GCCTC":48379, "GCCTG":67680, "GCCCA":39958, "GCCCT":39002, "GCCCC":50423, "GCCCG":37502, "GCCGA":15650, "GCCGT":15089, "GCCGC":47554, "GCCGG":27214, + "GCGAA":8846, "GCGAT":13749, "GCGAC":14582, "GCGAG":21402, "GCGTA":6966, "GCGTT":15587, "GCGTC":12333, "GCGTG":35552, "GCGCA":20418, "GCGCT":30547, "GCGCC":38792, "GCGCG":32830, "GCGGA":16573, "GCGGT":19194, "GCGGC":38456, "GCGGG":35615, + "GGAAA":23836, "GGAAT":633027, "GGAAC":521260, "GGAAG":25491, "GGATA":9722, "GGATT":20796, "GGATC":14193, "GGATG":26264, "GGACA":16457, "GGACT":18120, "GGACC":19201, "GGACG":11445, 
"GGAGA":26690, "GGAGT":26400, "GGAGC":30601, "GGAGG":53083, + "GGTAA":12273, "GGTAT":15668, "GGTAC":12166, "GGTAG":15281, "GGTTA":13345, "GGTTT":24730, "GGTTC":20147, "GGTTG":34735, "GGTCA":16415, "GGTCT":23372, "GGTCC":21522, "GGTCG":9147, "GGTGA":23533, "GGTGT":26195, "GGTGC":595350, "GGTGG":92124, + "GGCAA":22919, "GGCAT":26591, "GGCAC":26709, "GGCAG":43958, "GGCTA":18293, "GGCTT":29848, "GGCTC":35992, "GGCTG":60231, "GGCCA":34285, "GGCCT":41493, "GGCCC":46107, "GGCCG":31623, "GGCGA":18270, "GGCGT":23260, "GGCGC":37773, "GGCGG":41648, + "GGGAA":26512, "GGGAT":23710, "GGGAC":23254, "GGGAG":47972, "GGGTA":15757, "GGGTT":29311, "GGGTC":23396, "GGGTG":624958, "GGGCA":34457, "GGGCT":41166, "GGGCC":46250, "GGGCG":36118, "GGGGA":33765, "GGGGT":36877, "GGGGC":49693, "GGGGG":61798 + }, + "overrepresented_sequences": { + "AAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":62, + "AAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":44, + "AACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":34, + "AACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "AACGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":44, + "AACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":36, + "AAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":53, + "AAGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":32, + "AAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":55, + "AAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":61, + "AAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":40, + "AAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":35, + "AAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":43, + "AAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":59, + "AAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":64, + "AATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "AATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":25, + "AATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":51, + "AATTCTCGGGTGCCAAGGAA":46, + "AATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":64, + "ACAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "ACACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":29, + "ACAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "ACATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":65, + 
"ACCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":36, + "ACCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":19, + "ACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":24, + "ACCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":44, + "ACCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "ACGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "ACGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":45, + "ACGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "ACGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":41, + "ACTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":29, + "ACTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":20, + "ACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":24, + "ACTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":20, + "ACTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":44, + "AGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":33, + "AGACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":34, + "AGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":30, + "AGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":55, + "AGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":47, + "AGCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":30, + "AGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":24, + "AGCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "AGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "AGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":57, + "AGCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":26, + "AGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":52, + "AGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "AGCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "AGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "AGCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":29, + "AGCTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":15, + "AGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":37, + "AGGAATTCTCGGGTGCCAAG":118, + "AGGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":29, + "AGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":86, + "AGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":38, + "AGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + "AGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":28, + "AGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":22, + "AGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":38, + "AGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":68, + 
"AGGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":31, + "AGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":37, + "AGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":26, + "AGGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":40, + "AGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":29, + "AGGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":38, + "AGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":59, + "AGTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":48, + "AGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "AGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":32, + "AGTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":43, + "AGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":45, + "AGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":57, + "ATAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":45, + "ATACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "ATAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":43, + "ATATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":57, + "ATCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "ATCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":56, + "ATCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "ATGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":56, + "ATGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":69, + "ATGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":135, + "ATGGAATTCTCGGGTGCCAA":5179, + "ATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT":58, + "ATGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "ATGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":37, + "ATGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":79, + "ATGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":59, + "ATTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "ATTCTCGGGTGCCAAGGAAC":43, + "ATTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "ATTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":32, + "ATTGGAATTCTCGGGTGCCA":1375, + "ATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":30, + "ATTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":61, + "ATTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":85, + "CAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":75, + "CAAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":10, + "CAACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":62, + "CAAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":31, + "CAAGGAACTCCAGTCACGCC":86, + 
"CAAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":78, + "CAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":18, + "CACACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":23, + "CACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "CACCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":48, + "CACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":31, + "CACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":48, + "CACCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":38, + "CACGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "CACGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":85, + "CACTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":22, + "CACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":40, + "CAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":46, + "CAGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":49, + "CAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":40, + "CAGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":49, + "CAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":39, + "CAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":60, + "CAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":50, + "CAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":49, + "CAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":30, + "CAGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + "CAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":20, + "CATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":74, + "CATCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":28, + "CATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":18, + "CATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":40, + "CATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":20, + "CCAAGGAACTCCAGTCACGC":38, + "CCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":31, + "CCAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":28, + "CCACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "CCACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":23, + "CCACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":53, + "CCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":19, + "CCACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":53, + "CCAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":39, + "CCAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":35, + "CCAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":35, + "CCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":31, + "CCAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + 
"CCATCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":31, + "CCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":37, + "CCATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":64, + "CCCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":49, + "CCCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":69, + "CCCAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":32, + "CCCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":46, + "CCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "CCCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":35, + "CCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":54, + "CCCCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":27, + "CCCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":38, + "CCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":44, + "CCCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":64, + "CCCCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":14, + "CCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":19, + "CCCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":57, + "CCCGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":34, + "CCCGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":27, + "CCCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":39, + "CCCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":47, + "CCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":17, + "CCCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":44, + "CCCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":23, + "CCCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":39, + "CCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":47, + "CCCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":58, + "CCGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":50, + "CCGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":41, + "CCGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":36, + "CCGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":23, + "CCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "CCGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":53, + "CCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":25, + "CCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":43, + "CCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":33, + "CCTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":27, + "CCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":68, + "CCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":7, + "CCTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":70, + 
"CCTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "CCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":31, + "CCTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":45, + "CGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":47, + "CGACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":31, + "CGAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":19, + "CGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":18, + "CGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":53, + "CGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "CGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "CGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":49, + "CGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":38, + "CGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":48, + "CGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "CGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":18, + "CGCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":36, + "CGCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + "CGCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":26, + "CGCTGCGATCTATTGAAAGTCAGCCCTCGACACAAGGGTT":24, + "CGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":39, + "CGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":39, + "CGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":29, + "CGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":61, + "CGGGTGCCAAGGAACTCCAG":57, + "CGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":29, + "CGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":31, + "CGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "CGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":47, + "CGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":16, + "CGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":36, + "CGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":41, + "CTAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":60, + "CTACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":57, + "CTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":51, + "CTAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":62, + "CTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":19, + "CTCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":43, + "CTCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "CTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":47, + "CTCGGGTGCCAAGGAACTCC":39, + "CTCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":30, + 
"CTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":26, + "CTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":35, + "CTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":54, + "CTGGAATTCTCGGGTGCCAA":6923, + "CTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT":35, + "CTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":36, + "CTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":32, + "CTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "CTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":47, + "CTTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":22, + "CTTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "CTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":15, + "CTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":16, + "CTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":17, + "GAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "GAATTCTCGGGTGCCAAGGA":39, + "GAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":44, + "GACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":25, + "GAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":66, + "GAGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":30, + "GAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":50, + "GAGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":27, + "GAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":33, + "GAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":27, + "GAGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":30, + "GAGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":42, + "GAGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":42, + "GAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":52, + "GAGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":34, + "GAGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":35, + "GAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":32, + "GAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":16, + "GAGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":29, + "GAGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":23, + "GAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":99, + "GATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":39, + "GATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":13, + "GATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":53, + "GATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":62, + "GCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":117, + "GCAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":32, + 
"GCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":75, + "GCAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":73, + "GCAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":44, + "GCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":61, + "GCAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":31, + "GCATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":23, + "GCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":24, + "GCATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":32, + "GCCAAGGAACTCCAGTCACG":39, + "GCCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":44, + "GCCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":88, + "GCCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":41, + "GCCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":50, + "GCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":86, + "GCCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":29, + "GCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":57, + "GCCCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":29, + "GCCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":64, + "GCCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "GCCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":38, + "GCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":16, + "GCCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "GCCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":62, + "GCGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":25, + "GCGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "GCGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":69, + "GCGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":63, + "GCGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":37, + "GCGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":60, + "GCGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":56, + "GCGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":31, + "GCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":44, + "GCGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":29, + "GCGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "GCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":65, + "GCGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + "GCTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":45, + "GCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":18, + "GCTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":27, + "GCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":64, + "GCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATC":20, + 
"GCTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":29, + "GCTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":62, + "GCTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":31, + "GCTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "GCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCA":6, + "GCTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "GCTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "GGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "GGAATTCTCGGGTGCCAAGG":162, + "GGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCG":5, + "GGAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":37, + "GGAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":29, + "GGAGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":39, + "GGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":28, + "GGAGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":24, + "GGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":21, + "GGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":58, + "GGCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":30, + "GGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":17, + "GGCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":43, + "GGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":48, + "GGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":55, + "GGCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":24, + "GGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":27, + "GGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":45, + "GGCGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":50, + "GGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":39, + "GGCGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":19, + "GGCGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":55, + "GGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":18, + "GGCGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":47, + "GGCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":37, + "GGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":24, + "GGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":60, + "GGGAGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC":27, + "GGGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":39, + "GGGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":43, + "GGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":65, + "GGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "GGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":36, + "GGGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":28, + 
"GGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":52, + "GGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":23, + "GGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":28, + "GGGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":47, + "GGGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":21, + "GGGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":35, + "GGGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":40, + "GGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":36, + "GGGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":24, + "GGGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":42, + "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG":20, + "GGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":85, + "GGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":17, + "GGGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":61, + "GGGTGCCAAGGAACTCCAGT":120, + "GGGTGCCTGGAATTCTCGGG":14, + "GGGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":46, + "GGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":13, + "GGGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + "GGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":69, + "GGTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":43, + "GGTGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":29, + "GGTGCCAAGGAACTCCAGTC":76, + "GGTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":34, + "GGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":63, + "GGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":6, + "GGTGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":28, + "GGTGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":38, + "GGTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":52, + "GGTGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":30, + "GGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":76, + "GGTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":38, + "GGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":16, + "GTAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "GTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":48, + "GTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":63, + "GTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":31, + "GTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":44, + "GTGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "GTGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":39, + "GTGCCAAGGAACTCCAGTCA":22, + "GTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":67, + 
"GTGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":46, + "GTGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":70, + "GTGGAATTCTCGGGTGCCAA":4479, + "GTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT":47, + "GTGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":28, + "GTGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "GTGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":24, + "GTGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":29, + "GTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "GTGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":28, + "GTGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "GTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":57, + "GTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":67, + "GTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":45, + "GTTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":43, + "GTTGGAATTCTCGGGTGCCA":1640, + "GTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":15, + "GTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":64, + "GTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":102, + "NNNNNNNNNNNNNNNNNNNN":317, + "TAAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":45, + "TAACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":38, + "TAAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":40, + "TAATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":63, + "TACATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":53, + "TACCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":28, + "TACCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":43, + "TACCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":53, + "TACGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":40, + "TACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":53, + "TAGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":29, + "TAGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":54, + "TAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":94, + "TAGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":39, + "TAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":55, + "TAGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":56, + "TATATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":53, + "TATCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":39, + "TATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":20, + "TATGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":65, + "TATTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":78, + 
"TCACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":35, + "TCAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":25, + "TCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "TCCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":39, + "TCCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":40, + "TCCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":23, + "TCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "TCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":55, + "TCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":33, + "TCGGGTGCCAAGGAACTCCA":111, + "TCTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":39, + "TCTCGGGTGCCAAGGAACTC":75, + "TCTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":16, + "TCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":22, + "TGAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":39, + "TGAATTCTCGGGTGCCAAGG":200, + "TGACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":36, + "TGAGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":30, + "TGAGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":35, + "TGAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":71, + "TGATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "TGCAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":23, + "TGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":76, + "TGCATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":16, + "TGCCAAGGAACTCCAGTCAC":31, + "TGCCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "TGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":67, + "TGCCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":30, + "TGCCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":60, + "TGCGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":67, + "TGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":63, + "TGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "TGCTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":33, + "TGGAATTCTCGGGTGCCAAG":145, + "TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTC":131, + "TGGAATTCTCGGGTGCCTGG":135, + "TGGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":72, + "TGGCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":34, + "TGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":77, + "TGGCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":38, + "TGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":29, + "TGGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":51, + "TGGGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":47, + 
"TGGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + "TGGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":55, + "TGGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":48, + "TGGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":70, + "TGGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":32, + "TGTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":45, + "TGTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":11, + "TGTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "TGTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":62, + "TGTGGAATTCTCGGGTGCCA":1254, + "TGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":13, + "TGTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":42, + "TGTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":89, + "TTAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":52, + "TTACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":48, + "TTAGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":37, + "TTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":63, + "TTCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":49, + "TTCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":42, + "TTCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":61, + "TTCGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":43, + "TTCTCGGGTGCCAAGGAACT":27, + "TTCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":36, + "TTGATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":45, + "TTGCCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACG":32, + "TTGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":58, + "TTGCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":36, + "TTGCTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":49, + "TTGGAATTCTCGGGTGCCAA":1870, + "TTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCT":79, + "TTGGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":79, + "TTGTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":77, + "TTTATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":58, + "TTTCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":61, + "TTTGGAATTCTCGGGTGCCA":1831, + "TTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAA":15, + "TTTGTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":72, + "TTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCC":58, + "TTTTTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC":21, + "TTTTTTTTTTTTTTTTTTTT":247 } + }, + "read1_after_filtering": { + "total_reads": 464390, + "total_bases": 20168949, + "q20_bases": 19734976, + "q30_bases": 
19583954, + "total_cycles": 46, + "quality_curves": { + "A":[31.2401,31.5022,31.636,31.6266,31.6418,35.3722,35.3044,35.163,29.4968,34.3459,35.4818,34.0262,33.623,34.9507,35.0877,35.1445,35.1238,35.2717,35.5328,35.5116,35.5107,35.2808,35.4546,35.1221,35.1075,33.931,33.9263,35.4028,35.393,34.2387,34.2948,35.0348,35.0266,34.9663,34.9694,33.8772,35.2847,35.0588,34.9437,34.0013,35.0035,34.9558,35.0212,34.3869,34.4151,34.9608], + "T":[31.7057,31.7317,31.8156,31.8333,31.8356,35.421,35.3998,35.4321,34.8953,35.5527,35.629,35.5163,35.4649,35.3396,35.1332,35.5055,35.18,35.4995,35.6472,35.663,35.6578,35.4918,35.6359,35.5049,35.4427,35.351,35.3955,35.6073,35.6008,35.4272,35.391,35.3695,35.2359,35.3385,35.3526,35.3373,35.5239,35.2352,35.2943,35.2832,35.2925,35.1967,35.1907,35.2366,35.2946,34.995], + "C":[31.7543,31.7341,31.8508,31.8611,31.8692,35.4989,35.4932,35.4685,35.7496,35.6681,35.6658,35.6331,35.5948,35.6448,35.5296,35.2787,35.4779,35.1619,35.6657,35.6526,35.6522,35.5465,35.6182,35.3801,35.2972,35.478,35.487,35.5555,35.5347,35.3807,35.3303,35.0579,35.301,35.2075,35.1599,35.3702,35.4638,35.2103,34.9253,35.1284,34.9168,35.1121,35.0812,34.9454,35.2082,35.079], + "G":[31.0929,31.5382,31.7071,31.7314,31.7502,35.3147,35.2504,35.2554,35.1647,34.9292,35.3012,35.4334,35.4166,35.4993,35.4229,35.4899,35.4124,35.4862,35.4727,35.5204,35.5427,35.4896,35.4576,35.4835,35.431,35.3145,35.3219,35.4564,35.4278,35.3371,35.3271,35.3734,35.3208,35.3572,35.3076,35.2383,35.3131,35.28,35.2836,35.1727,35.1823,35.0648,35.0097,35.0101,34.9348,34.7628], + "mean":[31.4558,31.6196,31.7461,31.7669,31.7776,35.3954,35.3472,35.3116,34.684,35.2227,35.5365,35.2027,35.0916,35.3874,35.3032,35.3529,35.3159,35.3461,35.5855,35.5899,35.5918,35.4528,35.5412,35.3698,35.3162,35.0235,35.0473,35.5045,35.4878,35.0899,35.091,35.2035,35.2252,35.2164,35.1948,34.9685,35.3989,35.1962,35.1021,34.9027,35.0913,35.0845,35.0762,34.9036,34.9953,34.9608] + }, + "content_curves": { + 
"A":[0.254181,0.281201,0.220347,0.193923,0.193531,0.181238,0.218304,0.215575,0.100622,0.156956,0.228741,0.224889,0.219301,0.2068,0.219699,0.204776,0.210054,0.226344,0.206144,0.233093,0.236616,0.250225,0.246739,0.24869,0.251351,0.24693,0.239311,0.241437,0.244322,0.254153,0.24345,0.237582,0.235991,0.23746,0.240977,0.241306,0.232234,0.241307,0.246552,0.242179,0.23706,0.232102,0.230151,0.230668,0.206967,0], + "T":[0.272721,0.293081,0.227787,0.219326,0.224266,0.243289,0.254695,0.298768,0.211518,0.283574,0.265848,0.211439,0.266345,0.255916,0.263931,0.238575,0.238397,0.237027,0.245843,0.229081,0.24257,0.23314,0.229204,0.232281,0.230001,0.234725,0.232644,0.232192,0.232769,0.230492,0.231413,0.244757,0.239029,0.23599,0.234994,0.233683,0.240108,0.23518,0.235476,0.23652,0.234816,0.247934,0.245417,0.242449,0.255816,0.353318], + "C":[0.256069,0.178262,0.208454,0.257818,0.24555,0.243065,0.193656,0.109692,0.250738,0.281783,0.292935,0.341932,0.3125,0.293178,0.285975,0.331596,0.298094,0.291979,0.298925,0.296802,0.27057,0.262282,0.273925,0.27391,0.268725,0.26832,0.268933,0.266767,0.26895,0.270127,0.281413,0.279585,0.283248,0.288293,0.282408,0.272267,0.278011,0.281316,0.279326,0.281201,0.280404,0.280907,0.279059,0.271034,0.279558,0.37276], + "G":[0.21665,0.247451,0.343412,0.328933,0.336652,0.332397,0.333343,0.375964,0.437122,0.277687,0.212477,0.221738,0.201848,0.244099,0.230393,0.225052,0.253455,0.24465,0.249084,0.241013,0.250219,0.254353,0.250117,0.245106,0.249915,0.249971,0.259048,0.259591,0.253943,0.245212,0.24371,0.238069,0.241722,0.238245,0.241617,0.252732,0.249644,0.24219,0.238641,0.2401,0.247717,0.239026,0.245357,0.255844,0.257645,0.273864], + 
"N":[0.000378992,4.30672e-06,0,0,0,1.07668e-05,2.15336e-06,0,0,0,0,2.15336e-06,6.46009e-06,6.46009e-06,2.15336e-06,0,0,0,4.30672e-06,1.07668e-05,2.58403e-05,0,1.50735e-05,1.29202e-05,8.61345e-06,5.38341e-05,6.30625e-05,1.32141e-05,1.57277e-05,1.59085e-05,1.37857e-05,6.96578e-06,9.38427e-06,1.1852e-05,4.79222e-06,1.21396e-05,2.45901e-06,7.46026e-06,5.03127e-06,0,2.58286e-06,3.10263e-05,1.55618e-05,5.27796e-06,1.41118e-05,5.7578e-05], + "GC":[0.472719,0.425713,0.551866,0.58675,0.582202,0.575462,0.526999,0.485656,0.687859,0.559469,0.505411,0.563671,0.514348,0.537277,0.516368,0.556649,0.551549,0.536629,0.548009,0.537815,0.520789,0.516635,0.524042,0.519016,0.51864,0.518291,0.527981,0.526358,0.522893,0.515339,0.525123,0.517655,0.52497,0.526538,0.524025,0.524999,0.527655,0.523506,0.517967,0.521301,0.528121,0.519933,0.524416,0.526878,0.537203,0.646624] + }, + "kmer_count": { + "AAAAA":46163, "AAAAT":28333, "AAAAC":21998, "AAAAG":21412, "AAATA":21873, "AAATT":19922, "AAATC":14853, "AAATG":19360, "AAACA":22163, "AAACT":18052, "AAACC":18971, "AAACG":5637, "AAAGA":20704, "AAAGT":18541, "AAAGC":16828, "AAAGG":19493, + "AATAA":18551, "AATAT":15644, "AATAC":12562, "AATAG":9861, "AATTA":15139, "AATTT":22023, "AATTC":22197, "AATTG":11227, "AATCA":13861, "AATCT":15143, "AATCC":17507, "AATCG":4218, "AATGA":14955, "AATGT":13898, "AATGC":13064, "AATGG":17482, + "AACAA":19988, "AACAT":15021, "AACAC":16486, "AACAG":16520, "AACTA":11040, "AACTT":14172, "AACTC":23899, "AACTG":15297, "AACCA":16493, "AACCT":17278, "AACCC":20854, "AACCG":6320, "AACGA":4833, "AACGT":4278, "AACGC":6871, "AACGG":6559, + "AAGAA":21865, "AAGAT":13547, "AAGAC":14755, "AAGAG":19645, "AAGTA":12101, "AAGTT":15154, "AAGTC":14140, "AAGTG":19735, "AAGCA":19542, "AAGCT":16794, "AAGCC":19319, "AAGCG":11802, "AAGGA":25402, "AAGGT":15328, "AAGGC":21035, "AAGGG":21961, + "ATAAA":19391, "ATAAT":14020, "ATAAC":8634, "ATAAG":9623, "ATATA":12781, "ATATT":15158, "ATATC":9190, "ATATG":11584, "ATACA":13741, "ATACT":10980, 
"ATACC":8745, "ATACG":3650, "ATAGA":8539, "ATAGT":9730, "ATAGC":9210, "ATAGG":9982, + "ATTAA":14132, "ATTAT":13970, "ATTAC":12424, "ATTAG":11143, "ATTTA":16578, "ATTTT":33163, "ATTTC":18327, "ATTTG":16557, "ATTCA":15187, "ATTCT":26675, "ATTCC":16914, "ATTCG":5265, "ATTGA":9366, "ATTGT":12200, "ATTGC":12419, "ATTGG":14591, + "ATCAA":12817, "ATCAT":13049, "ATCAC":15112, "ATCAG":12582, "ATCTA":11054, "ATCTT":17120, "ATCTC":20195, "ATCTG":16572, "ATCCA":17217, "ATCCT":18501, "ATCCC":21373, "ATCCG":7439, "ATCGA":4314, "ATCGT":4686, "ATCGC":7571, "ATCGG":5418, + "ATGAA":14643, "ATGAT":13001, "ATGAC":10271, "ATGAG":15146, "ATGTA":10896, "ATGTT":17137, "ATGTC":11487, "ATGTG":17982, "ATGCA":15255, "ATGCT":16559, "ATGCC":18223, "ATGCG":8035, "ATGGA":16826, "ATGGT":17461, "ATGGC":21147, "ATGGG":20627, + "ACAAA":23767, "ACAAT":12938, "ACAAC":12918, "ACAAG":15444, "ACATA":11234, "ACATT":15172, "ACATC":11437, "ACATG":16237, "ACACA":23199, "ACACT":15929, "ACACC":19878, "ACACG":8601, "ACAGA":20408, "ACAGT":19596, "ACAGC":22284, "ACAGG":24177, + "ACTAA":11757, "ACTAT":9982, "ACTAC":11271, "ACTAG":9522, "ACTTA":10692, "ACTTT":20723, "ACTTC":15895, "ACTTG":15496, "ACTCA":17505, "ACTCT":16761, "ACTCC":31951, "ACTCG":7774, "ACTGA":14832, "ACTGT":14390, "ACTGC":21205, "ACTGG":18239, + "ACCAA":16022, "ACCAT":15809, "ACCAC":21683, "ACCAG":20125, "ACCTA":11354, "ACCTT":17054, "ACCTC":24854, "ACCTG":23160, "ACCCA":22243, "ACCCT":20248, "ACCCC":23339, "ACCCG":15662, "ACCGA":6381, "ACCGT":6078, "ACCGC":14224, "ACCGG":10385, + "ACGAA":5295, "ACGAT":17571, "ACGAC":5683, "ACGAG":7731, "ACGTA":3573, "ACGTT":5564, "ACGTC":5644, "ACGTG":8114, "ACGCA":7851, "ACGCT":8740, "ACGCC":21613, "ACGCG":9027, "ACGGA":7799, "ACGGT":6806, "ACGGC":11067, "ACGGG":12330, + "AGAAA":25550, "AGAAT":17119, "AGAAC":14399, "AGAAG":20653, "AGATA":10404, "AGATT":13356, "AGATC":12108, "AGATG":16644, "AGACA":18371, "AGACT":15204, "AGACC":16907, "AGACG":8220, "AGAGA":23436, "AGAGT":18958, "AGAGC":20876, "AGAGG":25629, + 
"AGTAA":11580, "AGTAT":10190, "AGTAC":8891, "AGTAG":13467, "AGTTA":10943, "AGTTT":18747, "AGTTC":19686, "AGTTG":14186, "AGTCA":20589, "AGTCT":15184, "AGTCC":29006, "AGTCG":5232, "AGTGA":17853, "AGTGT":14853, "AGTGC":20653, "AGTGG":25650, + "AGCAA":19474, "AGCAT":15475, "AGCAC":21384, "AGCAG":27428, "AGCTA":17615, "AGCTT":18422, "AGCTC":23017, "AGCTG":27380, "AGCCA":27271, "AGCCT":32749, "AGCCC":31657, "AGCCG":18497, "AGCGA":12715, "AGCGT":8815, "AGCGC":19413, "AGCGG":16982, + "AGGAA":29613, "AGGAT":15945, "AGGAC":16073, "AGGAG":31702, "AGGTA":12891, "AGGTT":18927, "AGGTC":15680, "AGGTG":26485, "AGGCA":29942, "AGGCT":31644, "AGGCC":29663, "AGGCG":22554, "AGGGA":23698, "AGGGT":20788, "AGGGC":29652, "AGGGG":30877, + "TAAAA":24485, "TAAAT":16788, "TAAAC":11785, "TAAAG":13212, "TAATA":11618, "TAATT":16383, "TAATC":13535, "TAATG":11902, "TAACA":12510, "TAACT":11377, "TAACC":9676, "TAACG":3902, "TAAGA":12309, "TAAGT":11425, "TAAGC":10923, "TAAGG":12844, + "TATAA":12717, "TATAT":13779, "TATAC":8509, "TATAG":8348, "TATTA":12027, "TATTT":25099, "TATTC":12549, "TATTG":11958, "TATCA":10144, "TATCT":13378, "TATCC":10948, "TATCG":3066, "TATGA":10104, "TATGT":13410, "TATGC":11319, "TATGG":14508, + "TACAA":13900, "TACAT":12364, "TACAC":12081, "TACAG":21915, "TACTA":10663, "TACTT":14451, "TACTC":13454, "TACTG":13177, "TACCA":13833, "TACCT":14818, "TACCC":11867, "TACCG":5800, "TACGA":3929, "TACGT":4569, "TACGC":5566, "TACGG":6718, + "TAGAA":13223, "TAGAT":8980, "TAGAC":7792, "TAGAG":13540, "TAGTA":9530, "TAGTT":12294, "TAGTC":11480, "TAGTG":13123, "TAGCA":13282, "TAGCT":15959, "TAGCC":14546, "TAGCG":8019, "TAGGA":13526, "TAGGT":12159, "TAGGC":15135, "TAGGG":16210, + "TTAAA":21192, "TTAAT":14602, "TTAAC":10936, "TTAAG":12887, "TTATA":12266, "TTATT":19592, "TTATC":10682, "TTATG":12703, "TTACA":16267, "TTACT":13482, "TTACC":13495, "TTACG":4388, "TTAGA":11196, "TTAGT":12430, "TTAGC":13443, "TTAGG":14992, + "TTTAA":21456, "TTTAT":19840, "TTTAC":13674, "TTTAG":14415, "TTTTA":24989, 
"TTTTT":224474, "TTTTC":24927, "TTTTG":24575, "TTTCA":22600, "TTTCT":28038, "TTTCC":25192, "TTTCG":6225, "TTTGA":16096, "TTTGT":21909, "TTTGC":17737, "TTTGG":25898, + "TTCAA":19782, "TTCAT":17108, "TTCAC":19046, "TTCAG":21437, "TTCTA":19057, "TTCTT":25228, "TTCTC":32344, "TTCTG":22870, "TTCCA":23864, "TTCCT":27624, "TTCCC":27134, "TTCCG":9953, "TTCGA":4866, "TTCGT":5134, "TTCGC":7416, "TTCGG":9234, + "TTGAA":16094, "TTGAT":11562, "TTGAC":9399, "TTGAG":17721, "TTGTA":12851, "TTGTT":19908, "TTGTC":12925, "TTGTG":20109, "TTGCA":17199, "TTGCT":20622, "TTGCC":20775, "TTGCG":9656, "TTGGA":19320, "TTGGT":18316, "TTGGC":22980, "TTGGG":28479, + "TCAAA":19394, "TCAAT":13050, "TCAAC":11720, "TCAAG":17450, "TCATA":9879, "TCATT":16251, "TCATC":13187, "TCATG":15113, "TCACA":18947, "TCACT":21459, "TCACC":24751, "TCACG":15163, "TCAGA":19531, "TCAGT":15427, "TCAGC":24575, "TCAGG":22670, + "TCTAA":12292, "TCTAT":11948, "TCTAC":16611, "TCTAG":12668, "TCTTA":12255, "TCTTT":22915, "TCTTC":21828, "TCTTG":19193, "TCTCA":22317, "TCTCT":27703, "TCTCC":32224, "TCTCG":19433, "TCTGA":17633, "TCTGT":19957, "TCTGC":23150, "TCTGG":23126, + "TCCAA":18041, "TCCAT":18587, "TCCAC":22407, "TCCAG":35472, "TCCTA":13530, "TCCTT":23522, "TCCTC":29863, "TCCTG":30896, "TCCCA":36697, "TCCCT":27782, "TCCCC":31853, "TCCCG":20538, "TCCGA":20297, "TCCGT":7327, "TCCGC":18592, "TCCGG":14555, + "TCGAA":5055, "TCGAT":4330, "TCGAC":4659, "TCGAG":7956, "TCGTA":3054, "TCGTT":5350, "TCGTC":5587, "TCGTG":7104, "TCGCA":6669, "TCGCT":10527, "TCGCC":14921, "TCGCG":8993, "TCGGA":6337, "TCGGT":7254, "TCGGC":15112, "TCGGG":20728, + "TGAAA":19091, "TGAAT":13721, "TGAAC":14310, "TGAAG":15365, "TGATA":8960, "TGATT":13770, "TGATC":14371, "TGATG":14424, "TGACA":14639, "TGACT":14877, "TGACC":15240, "TGACG":5586, "TGAGA":19546, "TGAGT":15662, "TGAGC":21759, "TGAGG":26732, + "TGTAA":15903, "TGTAT":13240, "TGTAC":9644, "TGTAG":12312, "TGTTA":12372, "TGTTT":22704, "TGTTC":16032, "TGTTG":19119, "TGTCA":14420, "TGTCT":19437, 
"TGTCC":17629, "TGTCG":6593, "TGTGA":16628, "TGTGT":23808, "TGTGC":20517, "TGTGG":26887, + "TGCAA":18735, "TGCAT":15354, "TGCAC":20340, "TGCAG":28906, "TGCTA":14028, "TGCTT":22113, "TGCTC":21237, "TGCTG":29841, "TGCCA":30243, "TGCCT":31868, "TGCCC":29048, "TGCCG":14653, "TGCGA":8064, "TGCGT":9955, "TGCGC":16321, "TGCGG":16316, + "TGGAA":26044, "TGGAT":14412, "TGGAC":13226, "TGGAG":26291, "TGGTA":13506, "TGGTT":19286, "TGGTC":17676, "TGGTG":29402, "TGGCA":25260, "TGGCT":30028, "TGGCC":31916, "TGGCG":19312, "TGGGA":30756, "TGGGT":26008, "TGGGC":33262, "TGGGG":35311, + "CAAAA":25728, "CAAAT":17508, "CAAAC":17445, "CAAAG":21449, "CAATA":12816, "CAATT":14388, "CAATC":11393, "CAATG":14787, "CAACA":19425, "CAACT":15060, "CAACC":17202, "CAACG":6929, "CAAGA":18914, "CAAGT":17620, "CAAGC":19837, "CAAGG":29668, + "CATAA":10911, "CATAT":11391, "CATAC":9204, "CATAG":9755, "CATTA":11738, "CATTT":23575, "CATTC":16355, "CATTG":14635, "CATCA":14104, "CATCT":20037, "CATCC":17898, "CATCG":5979, "CATGA":14474, "CATGT":18005, "CATGC":18939, "CATGG":24996, + "CACAA":18710, "CACAT":16969, "CACAC":24907, "CACAG":26605, "CACTA":11851, "CACTT":21579, "CACTC":23382, "CACTG":26640, "CACCA":28653, "CACCT":29636, "CACCC":29613, "CACCG":15934, "CACGA":7793, "CACGT":9011, "CACGC":23214, "CACGG":13705, + "CAGAA":22397, "CAGAT":14406, "CAGAC":18220, "CAGAG":30455, "CAGTA":11839, "CAGTT":17251, "CAGTC":28691, "CAGTG":26556, "CAGCA":30451, "CAGCT":33266, "CAGCC":48284, "CAGCG":21733, "CAGGA":29425, "CAGGT":24133, "CAGGC":39502, "CAGGG":33676, + "CTAAA":14736, "CTAAT":11855, "CTAAC":10113, "CTAAG":12558, "CTATA":9492, "CTATT":14012, "CTATC":9257, "CTATG":12737, "CTACA":19389, "CTACT":17600, "CTACC":14348, "CTACG":7217, "CTAGA":12095, "CTAGT":11884, "CTAGC":13665, "CTAGG":16983, + "CTTAA":12931, "CTTAT":11195, "CTTAC":11569, "CTTAG":12172, "CTTTA":14760, "CTTTT":25600, "CTTTC":22259, "CTTTG":22099, "CTTCA":20193, "CTTCT":25091, "CTTCC":30693, "CTTCG":8641, "CTTGA":16094, "CTTGT":17469, "CTTGC":18103, 
"CTTGG":28243, + "CTCAA":19256, "CTCAT":15046, "CTCAC":26438, "CTCAG":31530, "CTCTA":14263, "CTCTT":20921, "CTCTC":30183, "CTCTG":30029, "CTCCA":38723, "CTCCT":36932, "CTCCC":46839, "CTCCG":22075, "CTCGA":7458, "CTCGT":7246, "CTCGC":16222, "CTCGG":27232, + "CTGAA":18346, "CTGAT":12561, "CTGAC":18092, "CTGAG":28133, "CTGTA":17081, "CTGTT":17738, "CTGTC":19265, "CTGTG":22727, "CTGCA":28661, "CTGCT":28373, "CTGCC":38305, "CTGCG":17138, "CTGGA":28402, "CTGGT":19003, "CTGGC":30767, "CTGGG":43295, + "CCAAA":22568, "CCAAT":18722, "CCAAC":18401, "CCAAG":30604, "CCATA":10321, "CCATT":18940, "CCATC":20316, "CCATG":23590, "CCACA":24772, "CCACT":25641, "CCACC":36398, "CCACG":16735, "CCAGA":24400, "CCAGT":26069, "CCAGC":47319, "CCAGG":43762, + "CCTAA":11907, "CCTAT":10828, "CCTAC":13764, "CCTAG":15063, "CCTTA":12483, "CCTTT":21580, "CCTTC":25916, "CCTTG":23147, "CCTCA":29044, "CCTCT":30384, "CCTCC":49129, "CCTCG":18079, "CCTGA":22282, "CCTGT":24047, "CCTGC":33652, "CCTGG":42223, + "CCCAA":25048, "CCCAT":18471, "CCCAC":31008, "CCCAG":51711, "CCCTA":12725, "CCCTT":21647, "CCCTC":31466, "CCCTG":32835, "CCCCA":36015, "CCCCT":25858, "CCCCC":31847, "CCCCG":30890, "CCCGA":14848, "CCCGT":10797, "CCCGC":35932, "CCCGG":34259, + "CCGAA":6801, "CCGAT":5232, "CCGAC":25046, "CCGAG":18683, "CCGTA":4067, "CCGTT":7087, "CCGTC":11500, "CCGTG":12163, "CCGCA":16243, "CCGCT":19884, "CCGCC":46992, "CCGCG":27624, "CCGGA":13590, "CCGGT":9474, "CCGGC":29259, "CCGGG":31322, + "CGAAA":5366, "CGAAT":3970, "CGAAC":6863, "CGAAG":7929, "CGATA":3044, "CGATT":5711, "CGATC":20305, "CGATG":7496, "CGACA":8196, "CGACT":7536, "CGACC":11100, "CGACG":20873, "CGAGA":10750, "CGAGT":9011, "CGAGC":13942, "CGAGG":19574, + "CGTAA":3343, "CGTAT":3806, "CGTAC":3986, "CGTAG":5475, "CGTTA":4869, "CGTTT":7702, "CGTTC":7749, "CGTTG":8050, "CGTCA":5927, "CGTCT":9441, "CGTCC":12580, "CGTCG":5746, "CGTGA":8371, "CGTGT":9007, "CGTGC":12459, "CGTGG":15840, + "CGCAA":7257, "CGCAT":7442, "CGCAC":12561, "CGCAG":21020, "CGCTA":7486, 
"CGCTT":12409, "CGCTC":19636, "CGCTG":19687, "CGCCA":26103, "CGCCT":24276, "CGCCC":32667, "CGCCG":32103, "CGCGA":9520, "CGCGT":9858, "CGCGC":27876, "CGCGG":26128, + "CGGAA":8627, "CGGAT":6323, "CGGAC":10893, "CGGAG":16748, "CGGTA":5866, "CGGTT":7700, "CGGTC":9517, "CGGTG":14790, "CGGCA":13463, "CGGCT":19030, "CGGCC":30281, "CGGCG":26684, "CGGGA":18308, "CGGGT":20288, "CGGGC":27461, "CGGGG":28730, + "GAAAA":22275, "GAAAT":15390, "GAAAC":15525, "GAAAG":16889, "GAATA":9763, "GAATT":21062, "GAATC":11594, "GAATG":12631, "GAACA":13163, "GAACT":21168, "GAACC":15913, "GAACG":5546, "GAAGA":16763, "GAAGT":13352, "GAAGC":18351, "GAAGG":19642, + "GATAA":8872, "GATAT":8203, "GATAC":6745, "GATAG":7862, "GATTA":11688, "GATTT":14988, "GATTC":13355, "GATTG":9469, "GATCA":14920, "GATCT":17695, "GATCC":18797, "GATCG":8191, "GATGA":12354, "GATGT":11359, "GATGC":12253, "GATGG":19031, + "GACAA":12219, "GACAT":10220, "GACAC":15359, "GACAG":19601, "GACTA":8716, "GACTT":13686, "GACTC":15282, "GACTG":14945, "GACCA":14062, "GACCT":15975, "GACCC":21120, "GACCG":9263, "GACGA":18910, "GACGT":4683, "GACGC":11086, "GACGG":10862, + "GAGAA":20672, "GAGAT":15051, "GAGAC":19406, "GAGAG":23726, "GAGTA":10180, "GAGTT":19014, "GAGTC":16623, "GAGTG":18658, "GAGCA":18313, "GAGCT":20580, "GAGCC":28623, "GAGCG":14889, "GAGGA":24219, "GAGGT":21763, "GAGGC":37903, "GAGGG":30945, + "GTAAA":11111, "GTAAT":13403, "GTAAC":7994, "GTAAG":9683, "GTATA":8301, "GTATT":12936, "GTATC":7875, "GTATG":10181, "GTACA":9823, "GTACT":10148, "GTACC":8957, "GTACG":4404, "GTAGA":10550, "GTAGT":11105, "GTAGC":12547, "GTAGG":11736, + "GTTAA":10349, "GTTAT":10134, "GTTAC":9700, "GTTAG":10821, "GTTTA":11968, "GTTTT":22981, "GTTTC":17144, "GTTTG":16352, "GTTCA":18643, "GTTCT":21146, "GTTCC":15610, "GTTCG":5902, "GTTGA":11178, "GTTGT":13501, "GTTGC":15862, "GTTGG":19633, + "GTCAA":9789, "GTCAT":10263, "GTCAC":20892, "GTCAG":15661, "GTCTA":9287, "GTCTT":15151, "GTCTC":22680, "GTCTG":16683, "GTCCA":14820, "GTCCT":17153, "GTCCC":23533, 
"GTCCG":22387, "GTCGA":4905, "GTCGT":4198, "GTCGC":9247, "GTCGG":8057, + "GTGAA":13661, "GTGAT":14989, "GTGAC":13330, "GTGAG":21599, "GTGTA":9975, "GTGTT":16130, "GTGTC":14762, "GTGTG":26527, "GTGCA":21184, "GTGCT":21686, "GTGCC":28588, "GTGCG":14228, "GTGGA":19922, "GTGGT":24587, "GTGGC":29811, "GTGGG":31312, + "GCAAA":16253, "GCAAT":13816, "GCAAC":16102, "GCAAG":18923, "GCATA":8993, "GCATT":15583, "GCATC":13450, "GCATG":19742, "GCACA":18819, "GCACT":21852, "GCACC":23909, "GCACG":12399, "GCAGA":20604, "GCAGT":22679, "GCAGC":38532, "GCAGG":34810, + "GCTAA":12429, "GCTAT":12274, "GCTAC":16479, "GCTAG":14299, "GCTTA":11281, "GCTTT":19844, "GCTTC":21906, "GCTTG":20629, "GCTCA":23611, "GCTCT":23270, "GCTCC":34942, "GCTCG":13409, "GCTGA":22253, "GCTGT":19081, "GCTGC":34454, "GCTGG":40283, + "GCCAA":30426, "GCCAT":20900, "GCCAC":29674, "GCCAG":31535, "GCCTA":13548, "GCCTT":22447, "GCCTC":44629, "GCCTG":38500, "GCCCA":31576, "GCCCT":27109, "GCCCC":41246, "GCCCG":31032, "GCCGA":13650, "GCCGT":10375, "GCCGC":42482, "GCCGG":25708, + "GCGAA":6835, "GCGAT":9123, "GCGAC":13088, "GCGAG":17274, "GCGTA":5485, "GCGTT":9790, "GCGTC":11200, "GCGTG":17112, "GCGCA":15816, "GCGCT":19500, "GCGCC":32624, "GCGCG":26827, "GCGGA":14813, "GCGGT":14115, "GCGGC":35299, "GCGGG":31090, + "GGAAA":20548, "GGAAT":21323, "GGAAC":21563, "GGAAG":22767, "GGATA":8416, "GGATT":16984, "GGATC":13390, "GGATG":15476, "GGACA":15699, "GGACT":15985, "GGACC":18288, "GGACG":10730, "GGAGA":24524, "GGAGT":20819, "GGAGC":26201, "GGAGG":43799, + "GGTAA":10515, "GGTAT":11631, "GGTAC":10168, "GGTAG":13543, "GGTTA":11492, "GGTTT":19013, "GGTTC":18148, "GGTTG":17453, "GGTCA":14870, "GGTCT":20492, "GGTCC":19281, "GGTCG":8410, "GGTGA":19357, "GGTGT":19003, "GGTGC":30432, "GGTGG":38473, + "GGCAA":18830, "GGCAT":18336, "GGCAC":23597, "GGCAG":37755, "GGCTA":14943, "GGCTT":20433, "GGCTC":33558, "GGCTG":40777, "GGCCA":29115, "GGCCT":31686, "GGCCC":40083, "GGCCG":28685, "GGCGA":14816, "GGCGT":14368, "GGCGC":30850, "GGCGG":36865, 
+ "GGGAA":22073, "GGGAT":18421, "GGGAC":21923, "GGGAG":39709, "GGGTA":12936, "GGGTT":20459, "GGGTC":21113, "GGGTG":36912, "GGGCA":28700, "GGGCT":30790, "GGGCC":39935, "GGGCG":28673, "GGGGA":28966, "GGGGT":25149, "GGGGC":39229, "GGGGG":38089 + }, + "overrepresented_sequences": { + "ATTCTCGGGTGCCAAGGAAC":20, + "CAAGGAACTCCAGTCACGCC":45, + "CGCTGCGATCTATTGAAAGTCAGCCCTCGACACAAGGGTT":28, + "CGGGTGCCAAGGAACTCCAG":16, + "CTCGGGTGCCAAGGAACTCC":18, + "GCCAAGGAACTCCAGTCACG":12, + "GGGTGCCAAGGAACTCCAGT":33, + "GGTGCCAAGGAACTCCAGTC":43, + "TCGGGTGCCAAGGAACTCCA":19, + "TCTCGGGTGCCAAGGAACTC":51, + "TTCTCGGGTGCCAAGGAACT":18, + "TTTTTTTTTTTTTTTTTTTT":248 } + }, + "command": "fastp --overrepresentation_analysis --thread 1 --in1 /sfs/lustre/allocations/shefflab/processed/peppro_tutorial/pe/07-31-19/results_pipeline/tutorial/fastq/tutorial_R1.fastq --adapter_sequence TGGAATTCTCGGGTGCCAAGG --length_required 26 --html /sfs/lustre/allocations/shefflab/processed/peppro_tutorial/pe/07-31-19/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.html --json /sfs/lustre/allocations/shefflab/processed/peppro_tutorial/pe/07-31-19/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.json --report_title tutorial -o /sfs/lustre/allocations/shefflab/processed/peppro_tutorial/pe/07-31-19/results_pipeline/tutorial/fastq/tutorial_R1_noadap.fastq " +} \ No newline at end of file diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.txt b/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.txt new file mode 100644 index 0000000..da43c3b --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R1_rmAdapter.txt @@ -0,0 +1,29 @@ +Streaming uncompressed output to STDOUT... 
+ +Read1 before filtering: +total reads: 1000000 +total bases: 37678766 +Q20 bases: 34709953(92.1207%) +Q30 bases: 34018346(90.2852%) + +Read1 after filtering: +total reads: 439519 +total bases: 16134044 +Q20 bases: 15454671(95.7892%) +Q30 bases: 15178418(94.077%) + +Filtering result: +reads passed filter: 439519 +reads failed due to low quality: 66799 +reads failed due to too many N: 105 +reads failed due to too short: 493577 +reads with adapter trimmed: 540203 +bases trimmed due to adapters: 14696777 + +Duplication rate (may be overestimated since this is SE data): 28.6979% + +JSON report: /sfs/lustre/allocations/shefflab/processed/peppro_tutorial/pe/07-31-19/results_pipeline/tutorial/fastqc/tutorial_R2_rmAdapter.json +HTML report: /sfs/lustre/allocations/shefflab/processed/peppro_tutorial/pe/07-31-19/results_pipeline/tutorial/fastqc/tutorial_R2_rmAdapter.html + +fastp --overrepresentation_analysis --thread 1 --in1 /sfs/lustre/allocations/shefflab/processed/peppro_tutorial/pe/07-31-19/results_pipeline/tutorial/fastq/tutorial_R2.fastq --adapter_sequence GATCGTCGGACTGTAGAACTCTGAAC --length_required 26 --html /sfs/lustre/allocations/shefflab/processed/peppro_tutorial/pe/07-31-19/results_pipeline/tutorial/fastqc/tutorial_R2_rmAdapter.html --json /sfs/lustre/allocations/shefflab/processed/peppro_tutorial/pe/07-31-19/results_pipeline/tutorial/fastqc/tutorial_R2_rmAdapter.json --report_title tutorial --stdout +fastp v0.19.4, time used: 7 seconds diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R2_rmAdapter.html b/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R2_rmAdapter.html new file mode 100644 index 0000000..0883100 --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R2_rmAdapter.html @@ -0,0 +1,2725 @@ +fastp report at 2019-07-31 20:43:39 + + + +
+

tutorial +
+ +
+
General
+
+ + + + + + +
fastp version:0.19.4 (https://github.com/OpenGene/fastp)
sequencing:single end (38 cycles)
mean length before filtering:37bp
mean length after filtering:36bp
duplication rate:28.697855% (may be overestimated since this is SE data)
+
+
Before filtering
+
+ + + + + + +
total reads:1000.000000 K
total bases:37.678766 M
Q20 bases:34.709953 M (92.120727%)
Q30 bases:34.018346 M (90.285191%)
GC content:51.863917%
+
+
After filtering
+
+ + + + + + +
total reads:439.519000 K
total bases:16.134044 M
Q20 bases:15.454671 M (95.789196%)
Q30 bases:15.178418 M (94.076959%)
GC content:52.695090%
+
+
Filtering result
+
+ + + + + +
reads passed filters:439.519000 K (43.951900%)
reads with low quality:66.799000 K (6.679900%)
reads with too many N:105 (0.010500%)
reads too short:493.577000 K (49.357700%)
+
+
+
+
+ +
+
Adapter or bad ligation of read1
+
+ + + + + + + + + + +
SequenceOccurrences
GATCGTCGG14299
GATCGTCGGAC6547
GATCGTCGGACT5538
GATCGTCGGACTG5881
GATCGTCGGACTGT17243
GATCGTCGGACTGTAGAACTCTGAACGTG7306
GATCGTCGGACTGTAGAACTCTGAACGTGT404556
other adapter sequences78833
+
+
+
+
+ +
+
+
+
+ + +
+
+
+ +
+ +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Darker background means larger counts. The count will be shown on mouse over.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AAATACAGTATTTCTGCACTCCCGGAGTGCGG
AAAAAAAAAAAATAAAACAAAAGAAATAAAATTAAATCAAATGAAACAAAACTAAACCAAACGAAAGAAAAGTAAAGCAAAGG
AATAATAAAATATAATACAATAGAATTAAATTTAATTCAATTGAATCAAATCTAATCCAATCGAATGAAATGTAATGCAATGG
AACAACAAAACATAACACAACAGAACTAAACTTAACTCAACTGAACCAAACCTAACCCAACCGAACGAAACGTAACGCAACGG
AAGAAGAAAAGATAAGACAAGAGAAGTAAAGTTAAGTCAAGTGAAGCAAAGCTAAGCCAAGCGAAGGAAAGGTAAGGCAAGGG
ATAATAAAATAATATAACATAAGATATAATATTATATCATATGATACAATACTATACCATACGATAGAATAGTATAGCATAGG
ATTATTAAATTATATTACATTAGATTTAATTTTATTTCATTTGATTCAATTCTATTCCATTCGATTGAATTGTATTGCATTGG
ATCATCAAATCATATCACATCAGATCTAATCTTATCTCATCTGATCCAATCCTATCCCATCCGATCGAATCGTATCGCATCGG
ATGATGAAATGATATGACATGAGATGTAATGTTATGTCATGTGATGCAATGCTATGCCATGCGATGGAATGGTATGGCATGGG
ACAACAAAACAATACAACACAAGACATAACATTACATCACATGACACAACACTACACCACACGACAGAACAGTACAGCACAGG
ACTACTAAACTATACTACACTAGACTTAACTTTACTTCACTTGACTCAACTCTACTCCACTCGACTGAACTGTACTGCACTGG
ACCACCAAACCATACCACACCAGACCTAACCTTACCTCACCTGACCCAACCCTACCCCACCCGACCGAACCGTACCGCACCGG
ACGACGAAACGATACGACACGAGACGTAACGTTACGTCACGTGACGCAACGCTACGCCACGCGACGGAACGGTACGGCACGGG
AGAAGAAAAGAATAGAACAGAAGAGATAAGATTAGATCAGATGAGACAAGACTAGACCAGACGAGAGAAGAGTAGAGCAGAGG
AGTAGTAAAGTATAGTACAGTAGAGTTAAGTTTAGTTCAGTTGAGTCAAGTCTAGTCCAGTCGAGTGAAGTGTAGTGCAGTGG
AGCAGCAAAGCATAGCACAGCAGAGCTAAGCTTAGCTCAGCTGAGCCAAGCCTAGCCCAGCCGAGCGAAGCGTAGCGCAGCGG
AGGAGGAAAGGATAGGACAGGAGAGGTAAGGTTAGGTCAGGTGAGGCAAGGCTAGGCCAGGCGAGGGAAGGGTAGGGCAGGGG
TAATAAAATAAATTAAACTAAAGTAATATAATTTAATCTAATGTAACATAACTTAACCTAACGTAAGATAAGTTAAGCTAAGG
TATTATAATATATTATACTATAGTATTATATTTTATTCTATTGTATCATATCTTATCCTATCGTATGATATGTTATGCTATGG
TACTACAATACATTACACTACAGTACTATACTTTACTCTACTGTACCATACCTTACCCTACCGTACGATACGTTACGCTACGG
TAGTAGAATAGATTAGACTAGAGTAGTATAGTTTAGTCTAGTGTAGCATAGCTTAGCCTAGCGTAGGATAGGTTAGGCTAGGG
TTATTAAATTAATTTAACTTAAGTTATATTATTTTATCTTATGTTACATTACTTTACCTTACGTTAGATTAGTTTAGCTTAGG
TTTTTTAATTTATTTTACTTTAGTTTTATTTTTTTTTCTTTTGTTTCATTTCTTTTCCTTTCGTTTGATTTGTTTTGCTTTGG
TTCTTCAATTCATTTCACTTCAGTTCTATTCTTTTCTCTTCTGTTCCATTCCTTTCCCTTCCGTTCGATTCGTTTCGCTTCGG
TTGTTGAATTGATTTGACTTGAGTTGTATTGTTTTGTCTTGTGTTGCATTGCTTTGCCTTGCGTTGGATTGGTTTGGCTTGGG
TCATCAAATCAATTCAACTCAAGTCATATCATTTCATCTCATGTCACATCACTTCACCTCACGTCAGATCAGTTCAGCTCAGG
TCTTCTAATCTATTCTACTCTAGTCTTATCTTTTCTTCTCTTGTCTCATCTCTTCTCCTCTCGTCTGATCTGTTCTGCTCTGG
TCCTCCAATCCATTCCACTCCAGTCCTATCCTTTCCTCTCCTGTCCCATCCCTTCCCCTCCCGTCCGATCCGTTCCGCTCCGG
TCGTCGAATCGATTCGACTCGAGTCGTATCGTTTCGTCTCGTGTCGCATCGCTTCGCCTCGCGTCGGATCGGTTCGGCTCGGG
TGATGAAATGAATTGAACTGAAGTGATATGATTTGATCTGATGTGACATGACTTGACCTGACGTGAGATGAGTTGAGCTGAGG
TGTTGTAATGTATTGTACTGTAGTGTTATGTTTTGTTCTGTTGTGTCATGTCTTGTCCTGTCGTGTGATGTGTTGTGCTGTGG
TGCTGCAATGCATTGCACTGCAGTGCTATGCTTTGCTCTGCTGTGCCATGCCTTGCCCTGCCGTGCGATGCGTTGCGCTGCGG
TGGTGGAATGGATTGGACTGGAGTGGTATGGTTTGGTCTGGTGTGGCATGGCTTGGCCTGGCGTGGGATGGGTTGGGCTGGGG
CAACAAAACAAATCAAACCAAAGCAATACAATTCAATCCAATGCAACACAACTCAACCCAACGCAAGACAAGTCAAGCCAAGG
CATCATAACATATCATACCATAGCATTACATTTCATTCCATTGCATCACATCTCATCCCATCGCATGACATGTCATGCCATGG
CACCACAACACATCACACCACAGCACTACACTTCACTCCACTGCACCACACCTCACCCCACCGCACGACACGTCACGCCACGG
CAGCAGAACAGATCAGACCAGAGCAGTACAGTTCAGTCCAGTGCAGCACAGCTCAGCCCAGCGCAGGACAGGTCAGGCCAGGG
CTACTAAACTAATCTAACCTAAGCTATACTATTCTATCCTATGCTACACTACTCTACCCTACGCTAGACTAGTCTAGCCTAGG
CTTCTTAACTTATCTTACCTTAGCTTTACTTTTCTTTCCTTTGCTTCACTTCTCTTCCCTTCGCTTGACTTGTCTTGCCTTGG
CTCCTCAACTCATCTCACCTCAGCTCTACTCTTCTCTCCTCTGCTCCACTCCTCTCCCCTCCGCTCGACTCGTCTCGCCTCGG
CTGCTGAACTGATCTGACCTGAGCTGTACTGTTCTGTCCTGTGCTGCACTGCTCTGCCCTGCGCTGGACTGGTCTGGCCTGGG
CCACCAAACCAATCCAACCCAAGCCATACCATTCCATCCCATGCCACACCACTCCACCCCACGCCAGACCAGTCCAGCCCAGG
CCTCCTAACCTATCCTACCCTAGCCTTACCTTTCCTTCCCTTGCCTCACCTCTCCTCCCCTCGCCTGACCTGTCCTGCCCTGG
CCCCCCAACCCATCCCACCCCAGCCCTACCCTTCCCTCCCCTGCCCCACCCCTCCCCCCCCCGCCCGACCCGTCCCGCCCCGG
CCGCCGAACCGATCCGACCCGAGCCGTACCGTTCCGTCCCGTGCCGCACCGCTCCGCCCCGCGCCGGACCGGTCCGGCCCGGG
CGACGAAACGAATCGAACCGAAGCGATACGATTCGATCCGATGCGACACGACTCGACCCGACGCGAGACGAGTCGAGCCGAGG
CGTCGTAACGTATCGTACCGTAGCGTTACGTTTCGTTCCGTTGCGTCACGTCTCGTCCCGTCGCGTGACGTGTCGTGCCGTGG
CGCCGCAACGCATCGCACCGCAGCGCTACGCTTCGCTCCGCTGCGCCACGCCTCGCCCCGCCGCGCGACGCGTCGCGCCGCGG
CGGCGGAACGGATCGGACCGGAGCGGTACGGTTCGGTCCGGTGCGGCACGGCTCGGCCCGGCGCGGGACGGGTCGGGCCGGGG
GAAGAAAAGAAATGAAACGAAAGGAATAGAATTGAATCGAATGGAACAGAACTGAACCGAACGGAAGAGAAGTGAAGCGAAGG
GATGATAAGATATGATACGATAGGATTAGATTTGATTCGATTGGATCAGATCTGATCCGATCGGATGAGATGTGATGCGATGG
GACGACAAGACATGACACGACAGGACTAGACTTGACTCGACTGGACCAGACCTGACCCGACCGGACGAGACGTGACGCGACGG
GAGGAGAAGAGATGAGACGAGAGGAGTAGAGTTGAGTCGAGTGGAGCAGAGCTGAGCCGAGCGGAGGAGAGGTGAGGCGAGGG
GTAGTAAAGTAATGTAACGTAAGGTATAGTATTGTATCGTATGGTACAGTACTGTACCGTACGGTAGAGTAGTGTAGCGTAGG
GTTGTTAAGTTATGTTACGTTAGGTTTAGTTTTGTTTCGTTTGGTTCAGTTCTGTTCCGTTCGGTTGAGTTGTGTTGCGTTGG
GTCGTCAAGTCATGTCACGTCAGGTCTAGTCTTGTCTCGTCTGGTCCAGTCCTGTCCCGTCCGGTCGAGTCGTGTCGCGTCGG
GTGGTGAAGTGATGTGACGTGAGGTGTAGTGTTGTGTCGTGTGGTGCAGTGCTGTGCCGTGCGGTGGAGTGGTGTGGCGTGGG
GCAGCAAAGCAATGCAACGCAAGGCATAGCATTGCATCGCATGGCACAGCACTGCACCGCACGGCAGAGCAGTGCAGCGCAGG
GCTGCTAAGCTATGCTACGCTAGGCTTAGCTTTGCTTCGCTTGGCTCAGCTCTGCTCCGCTCGGCTGAGCTGTGCTGCGCTGG
GCCGCCAAGCCATGCCACGCCAGGCCTAGCCTTGCCTCGCCTGGCCCAGCCCTGCCCCGCCCGGCCGAGCCGTGCCGCGCCGG
GCGGCGAAGCGATGCGACGCGAGGCGTAGCGTTGCGTCGCGTGGCGCAGCGCTGCGCCGCGCGGCGGAGCGGTGCGGCGCGGG
GGAGGAAAGGAATGGAACGGAAGGGATAGGATTGGATCGGATGGGACAGGACTGGACCGGACGGGAGAGGAGTGGAGCGGAGG
GGTGGTAAGGTATGGTACGGTAGGGTTAGGTTTGGTTCGGTTGGGTCAGGTCTGGTCCGGTCGGGTGAGGTGTGGTGCGGTGG
GGCGGCAAGGCATGGCACGGCAGGGCTAGGCTTGGCTCGGCTGGGCCAGGCCTGGCCCGGCCGGGCGAGGCGTGGCGCGGCGG
GGGGGGAAGGGATGGGACGGGAGGGGTAGGGTTGGGTCGGGTGGGGCAGGGCTGGGCCGGGCGGGGGAGGGGTGGGGCGGGGG
+
+ +
+
Sampling rate: 1 / 20
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
overrepresented sequencecount (% of bases)distribution: cycle 1 ~ cycle 38
AAAAAAAAAAAAAAAAAAAA78 (0.082805%)
AAAGATCGTCGGACTGTAGA233 (0.247354%)
AACGATCGTCGGACTGTAGA304 (0.322728%)
AAGGATCGTCGGACTGTAGA292 (0.309989%)
AATGATCGTCGGACTGTAGA322 (0.341837%)
ACAGATCGTCGGACTGTAGA183 (0.194274%)
ACCGATCGTCGGACTGTAGA203 (0.215506%)
ACCTGATCGTCGGACTGTAG144 (0.152871%)
ACGGATCGTCGGACTGTAGA198 (0.210198%)
ACTGATCGTCGGACTGTAGA162 (0.171980%)
ACTGTAGAACTCTGAACGTG18 (0.019109%)
AGATCGTCGGACTGTAGAAC38 (0.040341%)
AGCGATCGTCGGACTGTAGA427 (0.453306%)
AGCTGATCGTCGGACTGTAG152 (0.161364%)
AGGGATCGTCGGACTGTAGA291 (0.308927%)
AGTGATCGTCGGACTGTAGA232 (0.246293%)
ATAGATCGTCGGACTGTAGA294 (0.312112%)
ATCGATCGTCGGACTGTAGA188 (0.199582%)
ATCGTCGGACTGTAGAACTC36 (0.038218%)
ATGATCGTCGGACTGTAGAA17 (0.018047%)
ATGGATCGTCGGACTGTAGA204 (0.216568%)
ATTGATCGTCGGACTGTAGA306 (0.324851%)
CAAAGATCGTCGGACTGTAG136 (0.144378%)
CAAGATCGTCGGACTGTAGA191 (0.202767%)
CAAGGATCGTCGGACTGTAG138 (0.146502%)
CACGATCGTCGGACTGTAGA248 (0.263278%)
CACTGATCGTCGGACTGTAG182 (0.193212%)
CAGGATCGTCGGACTGTAGA105 (0.111469%)
CATCGATCGTCGGACTGTAG120 (0.127393%)
CATGATCGTCGGACTGTAGA191 (0.202767%)
CATGGATCGTCGGACTGTAG132 (0.140132%)
CCAAGATCGTCGGACTGTAG225 (0.238861%)
CCAGATCGTCGGACTGTAGA137 (0.145440%)
CCAGGATCGTCGGACTGTAG183 (0.194274%)
CCATGATCGTCGGACTGTAG187 (0.198520%)
CCCAGATCGTCGGACTGTAG212 (0.225060%)
CCCCTGATCGTCGGACTGTA138 (0.146502%)
CCCGATCGTCGGACTGTAGA334 (0.354576%)
CCCGGATCGTCGGACTGTAG115 (0.122085%)
CCCTGATCGTCGGACTGTAG251 (0.266463%)
CCGATCGTCGGACTGTAGAA92 (0.097668%)
CCGGATCGTCGGACTGTAGA108 (0.114653%)
CCTAGATCGTCGGACTGTAG178 (0.188966%)
CCTCGATCGTCGGACTGTAG171 (0.181535%)
CCTGGATCGTCGGACTGTAG256 (0.271771%)
CCTTGATCGTCGGACTGTAG195 (0.207013%)
CGAGATCGTCGGACTGTAGA136 (0.144378%)
CGATCGTCGGACTGTAGAAC26 (0.027602%)
CGCGATCGTCGGACTGTAGA212 (0.225060%)
CGCGGATCGTCGGACTGTAG115 (0.122085%)
CGCTGATCGTCGGACTGTAG134 (0.142255%)
CGGACTGTAGAACTCTGAAC77 (0.081744%)
CGGGATCGTCGGACTGTAGA243 (0.257970%)
CGGGGATCGTCGGACTGTAG125 (0.132701%)
CGTCGGACTGTAGAACTCTG68 (0.072189%)
CGTGATCGTCGGACTGTAGA250 (0.265401%)
CTAGATCGTCGGACTGTAGA168 (0.178350%)
CTAGGATCGTCGGACTGTAG113 (0.119961%)
CTCGATCGTCGGACTGTAGA322 (0.341837%)
CTCTGATCGTCGGACTGTAG144 (0.152871%)
CTGCGATCGTCGGACTGTAG124 (0.131639%)
CTGGATCGTCGGACTGTAGA162 (0.171980%)
CTGGGATCGTCGGACTGTAG227 (0.240985%)
CTGTGATCGTCGGACTGTAG134 (0.142255%)
CTTGATCGTCGGACTGTAGA205 (0.217629%)
CTTGGCACCCGAGAATTCCA146 (0.154994%)
CTTTGATCGTCGGACTGTAG123 (0.130578%)
GAAGATCGTCGGACTGTAGA264 (0.280264%)
GACGATCGTCGGACTGTAGA186 (0.197459%)
GACTGTAGAACTCTGAACGT20 (0.021232%)
GAGATCGTCGGACTGTAGAA95 (0.100853%)
GAGGATCGTCGGACTGTAGA116 (0.123146%)
GATCGTCGGACTGTAGAACT140 (0.148625%)
GATCGTCGGACTGTAGAACTCTGAACGTGTAGATCT247 (0.471990%)
GATGATCGTCGGACTGTAGA189 (0.200644%)
GCAAGATCGTCGGACTGTAG157 (0.166672%)
GCACGATCGTCGGACTGTAG149 (0.158179%)
GCAGATCGTCGGACTGTAGA194 (0.205952%)
GCAGGATCGTCGGACTGTAG165 (0.175165%)
GCATGATCGTCGGACTGTAG149 (0.158179%)
GCCAGATCGTCGGACTGTAG195 (0.207013%)
GCCGATCGTCGGACTGTAGA255 (0.270710%)
GCCTGATCGTCGGACTGTAG198 (0.210198%)
GCGCGATCGTCGGACTGTAG167 (0.177288%)
GCGGATCGTCGGACTGTAGA214 (0.227184%)
GCGGGATCGTCGGACTGTAG231 (0.245231%)
GCTAGATCGTCGGACTGTAG148 (0.157118%)
GCTGGATCGTCGGACTGTAG227 (0.240985%)
GCTTGATCGTCGGACTGTAG142 (0.150748%)
GGACTGTAGAACTCTGAACG77 (0.081744%)
GGAGATCGTCGGACTGTAGA161 (0.170919%)
GGAGGATCGTCGGACTGTAG136 (0.144378%)
GGATCGTCGGACTGTAGAAC19 (0.020171%)
GGCAGATCGTCGGACTGTAG150 (0.159241%)
GGCGATCGTCGGACTGTAGA267 (0.283449%)
GGCGGATCGTCGGACTGTAG210 (0.222937%)
GGCTGATCGTCGGACTGTAG193 (0.204890%)
GGGAGATCGTCGGACTGTAG164 (0.174103%)
GGGCGATCGTCGGACTGTAG241 (0.255847%)
GGGGATCGTCGGACTGTAGA111 (0.117838%)
GGGGGATCGTCGGACTGTAG273 (0.289818%)
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG607 (1.159911%)
GGTGATCGTCGGACTGTAGA358 (0.380055%)
GGTGGATCGTCGGACTGTAG163 (0.173042%)
GTAGATCGTCGGACTGTAGA294 (0.312112%)
GTCGATCGTCGGACTGTAGA151 (0.160302%)
GTCGGACTGTAGAACTCTGA90 (0.095545%)
GTGGATCGTCGGACTGTAGA224 (0.237800%)
GTGGGATCGTCGGACTGTAG135 (0.143317%)
GTTGATCGTCGGACTGTAGA232 (0.246293%)
NNNNNNNNNNNNNNNNNNNN829 (0.880071%)
TAAGATCGTCGGACTGTAGA318 (0.337591%)
TACGATCGTCGGACTGTAGA217 (0.230368%)
TAGATCGTCGGACTGTAGAA12 (0.012739%)
TAGGATCGTCGGACTGTAGA222 (0.235677%)
TATGATCGTCGGACTGTAGA234 (0.248416%)
TCAGATCGTCGGACTGTAGA147 (0.156056%)
TCCTGATCGTCGGACTGTAG146 (0.154994%)
TCGGACTGTAGAACTCTGAA85 (0.090237%)
TCGGATCGTCGGACTGTAGA211 (0.223999%)
TCGTCGGACTGTAGAACTCT51 (0.054142%)
TCTGATCGTCGGACTGTAGA171 (0.181535%)
TGAGATCGTCGGACTGTAGA201 (0.213383%)
TGATCGTCGGACTGTAGAAC30 (0.031848%)
TGCGATCGTCGGACTGTAGA264 (0.280264%)
TGCTGATCGTCGGACTGTAG149 (0.158179%)
TGGGATCGTCGGACTGTAGA195 (0.207013%)
TGGGGATCGTCGGACTGTAG181 (0.192151%)
TGTGATCGTCGGACTGTAGA216 (0.229307%)
TTAGATCGTCGGACTGTAGA258 (0.273894%)
TTCGATCGTCGGACTGTAGA189 (0.200644%)
TTGGATCGTCGGACTGTAGA318 (0.337591%)
TTTGATCGTCGGACTGTAGA146 (0.154994%)
+
+ +
+
+
+ +
+ +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Value of each position will be shown on mouse over.
+
+
+ + + +
+
Darker background means larger counts. The count will be shown on mouse over.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AAATACAGTATTTCTGCACTCCCGGAGTGCGG
AAAAAAAAAAAATAAAACAAAAGAAATAAAATTAAATCAAATGAAACAAAACTAAACCAAACGAAAGAAAAGTAAAGCAAAGG
AATAATAAAATATAATACAATAGAATTAAATTTAATTCAATTGAATCAAATCTAATCCAATCGAATGAAATGTAATGCAATGG
AACAACAAAACATAACACAACAGAACTAAACTTAACTCAACTGAACCAAACCTAACCCAACCGAACGAAACGTAACGCAACGG
AAGAAGAAAAGATAAGACAAGAGAAGTAAAGTTAAGTCAAGTGAAGCAAAGCTAAGCCAAGCGAAGGAAAGGTAAGGCAAGGG
ATAATAAAATAATATAACATAAGATATAATATTATATCATATGATACAATACTATACCATACGATAGAATAGTATAGCATAGG
ATTATTAAATTATATTACATTAGATTTAATTTTATTTCATTTGATTCAATTCTATTCCATTCGATTGAATTGTATTGCATTGG
ATCATCAAATCATATCACATCAGATCTAATCTTATCTCATCTGATCCAATCCTATCCCATCCGATCGAATCGTATCGCATCGG
ATGATGAAATGATATGACATGAGATGTAATGTTATGTCATGTGATGCAATGCTATGCCATGCGATGGAATGGTATGGCATGGG
ACAACAAAACAATACAACACAAGACATAACATTACATCACATGACACAACACTACACCACACGACAGAACAGTACAGCACAGG
ACTACTAAACTATACTACACTAGACTTAACTTTACTTCACTTGACTCAACTCTACTCCACTCGACTGAACTGTACTGCACTGG
ACCACCAAACCATACCACACCAGACCTAACCTTACCTCACCTGACCCAACCCTACCCCACCCGACCGAACCGTACCGCACCGG
ACGACGAAACGATACGACACGAGACGTAACGTTACGTCACGTGACGCAACGCTACGCCACGCGACGGAACGGTACGGCACGGG
AGAAGAAAAGAATAGAACAGAAGAGATAAGATTAGATCAGATGAGACAAGACTAGACCAGACGAGAGAAGAGTAGAGCAGAGG
AGTAGTAAAGTATAGTACAGTAGAGTTAAGTTTAGTTCAGTTGAGTCAAGTCTAGTCCAGTCGAGTGAAGTGTAGTGCAGTGG
AGCAGCAAAGCATAGCACAGCAGAGCTAAGCTTAGCTCAGCTGAGCCAAGCCTAGCCCAGCCGAGCGAAGCGTAGCGCAGCGG
AGGAGGAAAGGATAGGACAGGAGAGGTAAGGTTAGGTCAGGTGAGGCAAGGCTAGGCCAGGCGAGGGAAGGGTAGGGCAGGGG
TAATAAAATAAATTAAACTAAAGTAATATAATTTAATCTAATGTAACATAACTTAACCTAACGTAAGATAAGTTAAGCTAAGG
TATTATAATATATTATACTATAGTATTATATTTTATTCTATTGTATCATATCTTATCCTATCGTATGATATGTTATGCTATGG
TACTACAATACATTACACTACAGTACTATACTTTACTCTACTGTACCATACCTTACCCTACCGTACGATACGTTACGCTACGG
TAGTAGAATAGATTAGACTAGAGTAGTATAGTTTAGTCTAGTGTAGCATAGCTTAGCCTAGCGTAGGATAGGTTAGGCTAGGG
TTATTAAATTAATTTAACTTAAGTTATATTATTTTATCTTATGTTACATTACTTTACCTTACGTTAGATTAGTTTAGCTTAGG
TTTTTTAATTTATTTTACTTTAGTTTTATTTTTTTTTCTTTTGTTTCATTTCTTTTCCTTTCGTTTGATTTGTTTTGCTTTGG
TTCTTCAATTCATTTCACTTCAGTTCTATTCTTTTCTCTTCTGTTCCATTCCTTTCCCTTCCGTTCGATTCGTTTCGCTTCGG
TTGTTGAATTGATTTGACTTGAGTTGTATTGTTTTGTCTTGTGTTGCATTGCTTTGCCTTGCGTTGGATTGGTTTGGCTTGGG
TCATCAAATCAATTCAACTCAAGTCATATCATTTCATCTCATGTCACATCACTTCACCTCACGTCAGATCAGTTCAGCTCAGG
TCTTCTAATCTATTCTACTCTAGTCTTATCTTTTCTTCTCTTGTCTCATCTCTTCTCCTCTCGTCTGATCTGTTCTGCTCTGG
TCCTCCAATCCATTCCACTCCAGTCCTATCCTTTCCTCTCCTGTCCCATCCCTTCCCCTCCCGTCCGATCCGTTCCGCTCCGG
TCGTCGAATCGATTCGACTCGAGTCGTATCGTTTCGTCTCGTGTCGCATCGCTTCGCCTCGCGTCGGATCGGTTCGGCTCGGG
TGATGAAATGAATTGAACTGAAGTGATATGATTTGATCTGATGTGACATGACTTGACCTGACGTGAGATGAGTTGAGCTGAGG
TGTTGTAATGTATTGTACTGTAGTGTTATGTTTTGTTCTGTTGTGTCATGTCTTGTCCTGTCGTGTGATGTGTTGTGCTGTGG
TGCTGCAATGCATTGCACTGCAGTGCTATGCTTTGCTCTGCTGTGCCATGCCTTGCCCTGCCGTGCGATGCGTTGCGCTGCGG
TGGTGGAATGGATTGGACTGGAGTGGTATGGTTTGGTCTGGTGTGGCATGGCTTGGCCTGGCGTGGGATGGGTTGGGCTGGGG
CAACAAAACAAATCAAACCAAAGCAATACAATTCAATCCAATGCAACACAACTCAACCCAACGCAAGACAAGTCAAGCCAAGG
CATCATAACATATCATACCATAGCATTACATTTCATTCCATTGCATCACATCTCATCCCATCGCATGACATGTCATGCCATGG
CACCACAACACATCACACCACAGCACTACACTTCACTCCACTGCACCACACCTCACCCCACCGCACGACACGTCACGCCACGG
CAGCAGAACAGATCAGACCAGAGCAGTACAGTTCAGTCCAGTGCAGCACAGCTCAGCCCAGCGCAGGACAGGTCAGGCCAGGG
CTACTAAACTAATCTAACCTAAGCTATACTATTCTATCCTATGCTACACTACTCTACCCTACGCTAGACTAGTCTAGCCTAGG
CTTCTTAACTTATCTTACCTTAGCTTTACTTTTCTTTCCTTTGCTTCACTTCTCTTCCCTTCGCTTGACTTGTCTTGCCTTGG
CTCCTCAACTCATCTCACCTCAGCTCTACTCTTCTCTCCTCTGCTCCACTCCTCTCCCCTCCGCTCGACTCGTCTCGCCTCGG
CTGCTGAACTGATCTGACCTGAGCTGTACTGTTCTGTCCTGTGCTGCACTGCTCTGCCCTGCGCTGGACTGGTCTGGCCTGGG
CCACCAAACCAATCCAACCCAAGCCATACCATTCCATCCCATGCCACACCACTCCACCCCACGCCAGACCAGTCCAGCCCAGG
CCTCCTAACCTATCCTACCCTAGCCTTACCTTTCCTTCCCTTGCCTCACCTCTCCTCCCCTCGCCTGACCTGTCCTGCCCTGG
CCCCCCAACCCATCCCACCCCAGCCCTACCCTTCCCTCCCCTGCCCCACCCCTCCCCCCCCCGCCCGACCCGTCCCGCCCCGG
CCGCCGAACCGATCCGACCCGAGCCGTACCGTTCCGTCCCGTGCCGCACCGCTCCGCCCCGCGCCGGACCGGTCCGGCCCGGG
CGACGAAACGAATCGAACCGAAGCGATACGATTCGATCCGATGCGACACGACTCGACCCGACGCGAGACGAGTCGAGCCGAGG
CGTCGTAACGTATCGTACCGTAGCGTTACGTTTCGTTCCGTTGCGTCACGTCTCGTCCCGTCGCGTGACGTGTCGTGCCGTGG
CGCCGCAACGCATCGCACCGCAGCGCTACGCTTCGCTCCGCTGCGCCACGCCTCGCCCCGCCGCGCGACGCGTCGCGCCGCGG
CGGCGGAACGGATCGGACCGGAGCGGTACGGTTCGGTCCGGTGCGGCACGGCTCGGCCCGGCGCGGGACGGGTCGGGCCGGGG
GAAGAAAAGAAATGAAACGAAAGGAATAGAATTGAATCGAATGGAACAGAACTGAACCGAACGGAAGAGAAGTGAAGCGAAGG
GATGATAAGATATGATACGATAGGATTAGATTTGATTCGATTGGATCAGATCTGATCCGATCGGATGAGATGTGATGCGATGG
GACGACAAGACATGACACGACAGGACTAGACTTGACTCGACTGGACCAGACCTGACCCGACCGGACGAGACGTGACGCGACGG
GAGGAGAAGAGATGAGACGAGAGGAGTAGAGTTGAGTCGAGTGGAGCAGAGCTGAGCCGAGCGGAGGAGAGGTGAGGCGAGGG
GTAGTAAAGTAATGTAACGTAAGGTATAGTATTGTATCGTATGGTACAGTACTGTACCGTACGGTAGAGTAGTGTAGCGTAGG
GTTGTTAAGTTATGTTACGTTAGGTTTAGTTTTGTTTCGTTTGGTTCAGTTCTGTTCCGTTCGGTTGAGTTGTGTTGCGTTGG
GTCGTCAAGTCATGTCACGTCAGGTCTAGTCTTGTCTCGTCTGGTCCAGTCCTGTCCCGTCCGGTCGAGTCGTGTCGCGTCGG
GTGGTGAAGTGATGTGACGTGAGGTGTAGTGTTGTGTCGTGTGGTGCAGTGCTGTGCCGTGCGGTGGAGTGGTGTGGCGTGGG
GCAGCAAAGCAATGCAACGCAAGGCATAGCATTGCATCGCATGGCACAGCACTGCACCGCACGGCAGAGCAGTGCAGCGCAGG
GCTGCTAAGCTATGCTACGCTAGGCTTAGCTTTGCTTCGCTTGGCTCAGCTCTGCTCCGCTCGGCTGAGCTGTGCTGCGCTGG
GCCGCCAAGCCATGCCACGCCAGGCCTAGCCTTGCCTCGCCTGGCCCAGCCCTGCCCCGCCCGGCCGAGCCGTGCCGCGCCGG
GCGGCGAAGCGATGCGACGCGAGGCGTAGCGTTGCGTCGCGTGGCGCAGCGCTGCGCCGCGCGGCGGAGCGGTGCGGCGCGGG
GGAGGAAAGGAATGGAACGGAAGGGATAGGATTGGATCGGATGGGACAGGACTGGACCGGACGGGAGAGGAGTGGAGCGGAGG
GGTGGTAAGGTATGGTACGGTAGGGTTAGGTTTGGTTCGGTTGGGTCAGGTCTGGTCCGGTCGGGTGAGGTGTGGTGCGGTGG
GGCGGCAAGGCATGGCACGGCAGGGCTAGGCTTGGCTCGGCTGGGCCAGGCCTGGCCCGGCCGGGCGAGGCGTGGCGCGGCGG
GGGGGGAAGGGATGGGACGGGAGGGGTAGGGTTGGGTCGGGTGGGGCAGGGCTGGGCCGGGCGGGGGAGGGGTGGGGCGGGGG
+
+ +
+
Sampling rate: 1 / 20
+ + + + + + +
overrepresented sequencecount (% of bases)distribution: cycle 1 ~ cycle 38
AAAAAAAAAAAAAAAAAAAA34 (0.084294%)
CTTGGCACCCGAGAATTCCA128 (0.317341%)
GTCGGACTGTAGAACTCTGA31 (0.076856%)
TCGGACTGTAGAACTCTGAA54 (0.133878%)
+
+ +
+
+ +

+ \ No newline at end of file diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R2_rmAdapter.json b/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R2_rmAdapter.json new file mode 100644 index 0000000..f99137e --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/fastqc/tutorial_R2_rmAdapter.json @@ -0,0 +1,358 @@ +{ + "summary": { + "before_filtering": { + "total_reads":1000000, + "total_bases":37678766, + "q20_bases":34709953, + "q30_bases":34018346, + "q20_rate":0.921207, + "q30_rate":0.902852, + "read1_mean_length":37, + "gc_content":0.518639 + }, + "after_filtering": { + "total_reads":439519, + "total_bases":16134044, + "q20_bases":15454671, + "q30_bases":15178418, + "q20_rate":0.957892, + "q30_rate":0.94077, + "read1_mean_length":36, + "gc_content":0.526951 + } + }, + "filtering_result": { + "passed_filter_reads": 439519, + "low_quality_reads": 66799, + "too_many_N_reads": 105, + "too_short_reads": 493577, + "too_long_reads": 0 + }, + "duplication": { + "rate": 0.286979, + "histogram": [437812,16546,8852,5892,3938,2938,2123,1537,1168,918,718,552,445,368,315,274,220,184,152,137,120,87,94,90,79,61,49,39,34,34,295], + "mean_gc": [0.449334,0.458295,0.463191,0.461576,0.462608,0.464017,0.468745,0.467297,0.462957,0.463121,0.46369,0.469295,0.4685,0.457651,0.458724,0.475097,0.462531,0.457651,0.462126,0.454845,0.463007,0.469732,0.473467,0.462789,0.461703,0.471681,0.455542,0.464957,0.471742,0.467013,0.471545] + }, + "adapter_cutting": { + "adapter_trimmed_reads": 540203, + "adapter_trimmed_bases": 14696777, + "read1_adapter_sequence": "GATCGTCGGACTGTAGAACTCTGAAC", + "read1_adapter_counts": {"GATCGTCGG":14299, "GATCGTCGGAC":6547, "GATCGTCGGACT":5538, "GATCGTCGGACTG":5881, "GATCGTCGGACTGT":17243, "GATCGTCGGACTGTAGAACTCTGAACGTG":7306, "GATCGTCGGACTGTAGAACTCTGAACGTGT":404556, "others":78833} + }, + "read1_before_filtering": { + "total_reads": 1000000, + "total_bases": 37678766, + "q20_bases": 
34709953, + "q30_bases": 34018346, + "total_cycles": 38, + "quality_curves": { + "A":[30.1023,29.9909,30.3424,30.3731,30.3951,33.7563,33.8171,33.7964,33.6769,34.2461,32.0416,32.2345,33.3671,31.4002,32.1319,33.9531,33.8651,34.3952,32.5273,31.4524,33.4457,31.626,34.0713,33.6001,34.1499,34.3216,32.0106,31.1379,31.4294,31.0495,33.149,34.0174,34.3201,32.2479,33.1344,30.9166,32.6897,34.06], + "T":[30.9601,30.7757,30.8734,30.8686,30.8783,34.3984,34.4052,34.4866,34.7508,34.6364,34.4516,34.4478,34.6493,34.0627,34.3648,34.7592,34.7462,34.717,34.5812,34.1128,34.6857,34.1296,34.6354,34.7421,34.6366,34.6568,34.4945,34.254,34.1913,33.9201,34.6827,34.6854,34.6539,34.5645,34.5903,33.921,34.6256,34.0429], + "C":[31.0626,30.9065,30.9966,31.0003,31.0258,34.5989,34.5798,34.6138,34.8344,34.713,34.6296,34.3718,34.7863,34.5195,34.3702,34.7335,34.8514,34.793,34.5575,34.5447,34.835,34.5791,34.7124,34.8038,34.7131,34.6841,34.6826,34.4275,34.1128,34.3422,34.7444,34.718,34.6531,34.6465,34.7181,34.5319,34.7889,34.3973], + "G":[30.5229,30.3815,30.5511,30.5992,30.5904,34.0087,34.0169,34.0197,33.9322,33.8345,34.3164,34.2746,33.6745,34.1624,34.1708,33.8907,34.0021,33.9368,34.3762,34.1284,33.6978,34.1745,33.7549,33.9866,33.8954,33.9569,34.3227,34.1765,34.0206,33.835,33.573,33.7451,33.8577,34.2241,33.6588,33.9831,33.671,34.0425], + "mean":[30.1568,30.0866,30.2723,30.2905,30.2976,33.7199,33.7054,33.7193,33.6292,33.7876,33.6127,33.5872,33.4341,33.3069,33.5387,33.6331,33.6975,33.9122,33.7498,33.3164,33.4659,33.324,33.6714,33.629,33.7399,33.8571,33.7149,33.356,33.2514,33.0513,33.3197,33.645,33.8238,33.7259,33.3852,33.6214,33.8441,34.06] + }, + "content_curves": { + "A":[0.314654,0.237083,0.219982,0.220298,0.215974,0.21456,0.233767,0.22907,0.137049,0.552003,0.141955,0.130698,0.12543,0.128143,0.132591,0.129813,0.141034,0.548306,0.138182,0.136471,0.133899,0.150379,0.552425,0.159106,0.562527,0.555956,0.14243,0.132202,0.133747,0.14431,0.14802,0.555425,0.5493,0.13679,0.12267,0.120523,0.10077,0], + 
"T":[0.194407,0.21312,0.219779,0.222531,0.216976,0.212131,0.226769,0.238401,0.123604,0.128531,0.541415,0.130619,0.137744,0.545369,0.124522,0.118915,0.120976,0.120502,0.130283,0.54418,0.13973,0.534456,0.124401,0.118367,0.121606,0.121829,0.132758,0.539629,0.136758,0.534435,0.132143,0.119065,0.121794,0.131477,0.14243,0.561881,0.156361,0.675335], + "C":[0.185744,0.249803,0.268543,0.269994,0.273495,0.28595,0.241631,0.219478,0.135682,0.129244,0.140163,0.554351,0.137444,0.137638,0.552921,0.145261,0.138754,0.143537,0.556207,0.136811,0.133065,0.129864,0.127386,0.132757,0.127862,0.147126,0.550559,0.148018,0.540736,0.131348,0.131496,0.134551,0.139989,0.54064,0.141824,0.132245,0.137366,0.157434], + "G":[0.290505,0.285286,0.276869,0.272243,0.27844,0.272174,0.282565,0.297718,0.58827,0.174806,0.16106,0.168924,0.583945,0.173419,0.174542,0.590603,0.583827,0.172221,0.159932,0.167123,0.577926,0.169892,0.180362,0.574324,0.172539,0.159606,0.1588,0.164682,0.173275,0.174395,0.572784,0.17539,0.173341,0.175547,0.57753,0.184419,0.604527,0.166027], + "N":[0.01469,0.014708,0.014827,0.014934,0.015115,0.015185,0.015268,0.015333,0.015395,0.015416,0.015407,0.015408,0.015437,0.015431,0.015424,0.015408,0.015409,0.015434,0.015396,0.015415,0.01538,0.015409,0.015426,0.015446,0.015466,0.015483,0.015453,0.015469,0.015484,0.015512,0.015557,0.015569,0.015576,0.015546,0.015546,0.000932911,0.000975171,0.00120431], + "GC":[0.476249,0.535089,0.545412,0.542237,0.551935,0.558124,0.524196,0.517196,0.723952,0.30405,0.301223,0.723275,0.721389,0.311057,0.727463,0.735864,0.722581,0.315758,0.716139,0.303934,0.710991,0.299756,0.307748,0.707081,0.300401,0.306732,0.709359,0.3127,0.714011,0.305743,0.70428,0.309941,0.31333,0.716187,0.719354,0.316664,0.741893,0.323461] + }, + "kmer_count": { + "AAAAA":144484, "AAAAT":41966, "AAAAC":29408, "AAAAG":30579, "AAATA":34195, "AAATT":26346, "AAATC":20102, "AAATG":25950, "AAACA":28214, "AAACT":23063, "AAACC":18676, "AAACG":11668, "AAAGA":36542, "AAAGT":23736, "AAAGC":19180, 
"AAAGG":26112, + "AATAA":29042, "AATAT":19418, "AATAC":13874, "AATAG":15272, "AATTA":19776, "AATTT":22841, "AATTC":43828, "AATTG":15119, "AATCA":15966, "AATCT":16819, "AATCC":16341, "AATCG":8257, "AATGA":25097, "AATGT":16807, "AATGC":14720, "AATGG":21075, + "AACAA":28818, "AACAT":19558, "AACAC":21331, "AACAG":18072, "AACTA":17082, "AACTT":17610, "AACTC":478972, "AACTG":19626, "AACCA":20754, "AACCT":20108, "AACCC":17788, "AACCG":7868, "AACGA":19520, "AACGT":455928, "AACGC":8679, "AACGG":10109, + "AAGAA":34505, "AAGAT":56306, "AAGAC":14603, "AAGAG":20627, "AAGTA":14735, "AAGTT":14634, "AAGTC":12450, "AAGTG":24529, "AAGCA":20377, "AAGCT":18340, "AAGCC":18960, "AAGCG":15372, "AAGGA":32982, "AAGGT":16410, "AAGGC":21495, "AAGGG":25562, + "ATAAA":29887, "ATAAT":16564, "ATAAC":13585, "ATAAG":13298, "ATATA":16168, "ATATT":16758, "ATATC":9312, "ATATG":13262, "ATACA":13990, "ATACT":11327, "ATACC":9943, "ATACG":4845, "ATAGA":19789, "ATAGT":12867, "ATAGC":10801, "ATAGG":13052, + "ATTAA":17682, "ATTAT":15122, "ATTAC":12235, "ATTAG":13620, "ATTTA":18097, "ATTTT":31976, "ATTTC":15787, "ATTTG":18503, "ATTCA":14109, "ATTCT":19452, "ATTCC":42087, "ATTCG":5973, "ATTGA":18343, "ATTGT":12522, "ATTGC":12604, "ATTGG":14767, + "ATCAA":13574, "ATCAT":13972, "ATCAC":13852, "ATCAG":13163, "ATCTA":10128, "ATCTT":15752, "ATCTC":20782, "ATCTG":18000, "ATCCA":13674, "ATCCT":16667, "ATCCC":15830, "ATCCG":6392, "ATCGA":15635, "ATCGT":542380, "ATCGC":7831, "ATCGG":8802, + "ATGAA":20171, "ATGAT":43413, "ATGAC":8922, "ATGAG":14408, "ATGTA":12627, "ATGTT":15273, "ATGTC":9768, "ATGTG":17801, "ATGCA":12654, "ATGCT":14772, "ATGCC":16021, "ATGCG":10654, "ATGGA":25691, "ATGGT":16030, "ATGGC":20975, "ATGGG":22150, + "ACAAA":29661, "ACAAT":14955, "ACAAC":17578, "ACAAG":17948, "ACATA":13881, "ACATT":15026, "ACATC":12294, "ACATG":18400, "ACACA":23204, "ACACT":18952, "ACACC":15764, "ACACG":10563, "ACAGA":26210, "ACAGT":18081, "ACAGC":17191, "ACAGG":22974, + "ACTAA":14398, "ACTAT":13566, "ACTAC":11620, 
"ACTAG":13307, "ACTTA":12049, "ACTTT":19765, "ACTTC":13895, "ACTTG":19480, "ACTCA":20479, "ACTCT":474997, "ACTCC":18489, "ACTCG":12108, "ACTGA":31461, "ACTGT":501830, "ACTGC":19241, "ACTGG":21126, + "ACCAA":17631, "ACCAT":16658, "ACCAC":24372, "ACCAG":20007, "ACCTA":11027, "ACCTT":14985, "ACCTC":22339, "ACCTG":26926, "ACCCA":20332, "ACCCT":18523, "ACCCC":21183, "ACCCG":33538, "ACCGA":11391, "ACCGT":5875, "ACCGC":11227, "ACCGG":10457, + "ACGAA":6533, "ACGAT":32428, "ACGAC":6008, "ACGAG":11228, "ACGTA":5034, "ACGTT":5024, "ACGTC":8464, "ACGTG":454222, "ACGCA":8560, "ACGCT":7954, "ACGCC":12704, "ACGCG":11123, "ACGGA":17532, "ACGGT":6356, "ACGGC":9578, "ACGGG":15203, + "AGAAA":34811, "AGAAT":47405, "AGAAC":496183, "AGAAG":26114, "AGATA":13075, "AGATT":13456, "AGATC":142833, "AGATG":20030, "AGACA":17801, "AGACT":14556, "AGACC":16518, "AGACG":10848, "AGAGA":28114, "AGAGT":15215, "AGAGC":21363, "AGAGG":31025, + "AGTAA":13148, "AGTAT":10798, "AGTAC":9212, "AGTAG":21211, "AGTTA":10454, "AGTTT":17371, "AGTTC":12673, "AGTTG":15031, "AGTCA":12538, "AGTCT":15214, "AGTCC":13646, "AGTCG":10460, "AGTGA":23819, "AGTGT":17235, "AGTGC":19604, "AGTGG":27459, + "AGCAA":18677, "AGCAT":15375, "AGCAC":18872, "AGCAG":28416, "AGCTA":15518, "AGCTT":15962, "AGCTC":19558, "AGCTG":35001, "AGCCA":27263, "AGCCT":28649, "AGCCC":25098, "AGCCG":19834, "AGCGA":19442, "AGCGT":8863, "AGCGC":18993, "AGCGG":25644, + "AGGAA":27049, "AGGAT":53354, "AGGAC":17886, "AGGAG":35210, "AGGTA":13563, "AGGTT":15994, "AGGTC":14255, "AGGTG":31386, "AGGCA":27981, "AGGCT":32224, "AGGCC":27173, "AGGCG":27600, "AGGGA":34585, "AGGGT":19220, "AGGGC":27068, "AGGGG":33235, + "TAAAA":35765, "TAAAT":19978, "TAAAC":14741, "TAAAG":17503, "TAATA":14160, "TAATT":16385, "TAATC":12247, "TAATG":13524, "TAACA":13962, "TAACT":12178, "TAACC":11011, "TAACG":6945, "TAAGA":21010, "TAAGT":10696, "TAAGC":11047, "TAAGG":14546, + "TATAA":15923, "TATAT":14329, "TATAC":8229, "TATAG":11206, "TATTA":13082, "TATTT":23598, "TATTC":10683, 
"TATTG":12148, "TATCA":9550, "TATCT":11282, "TATCC":8123, "TATCG":5489, "TATGA":17899, "TATGT":11193, "TATGC":8471, "TATGG":11673, + "TACAA":15459, "TACAT":10976, "TACAC":9495, "TACAG":14922, "TACTA":10237, "TACTT":13192, "TACTC":10826, "TACTG":12531, "TACCA":13386, "TACCT":12619, "TACCC":10580, "TACCG":5625, "TACGA":8716, "TACGT":3521, "TACGC":4513, "TACGG":5276, + "TAGAA":486704, "TAGAT":50743, "TAGAC":9056, "TAGAG":15013, "TAGTA":10696, "TAGTT":10449, "TAGTC":10523, "TAGTG":13167, "TAGCA":12903, "TAGCT":16527, "TAGCC":13996, "TAGCG":10166, "TAGGA":24174, "TAGGT":11336, "TAGGC":12723, "TAGGG":16871, + "TTAAA":24791, "TTAAT":14839, "TTAAC":10950, "TTAAG":14884, "TTATA":12889, "TTATT":20219, "TTATC":9461, "TTATG":12478, "TTACA":14145, "TTACT":12139, "TTACC":9976, "TTACG":4560, "TTAGA":18793, "TTAGT":11845, "TTAGC":12834, "TTAGG":14132, + "TTTAA":22090, "TTTAT":20901, "TTTAC":11268, "TTTAG":16069, "TTTTA":25993, "TTTTT":60972, "TTTTC":24680, "TTTTG":29302, "TTTCA":18908, "TTTCT":28255, "TTTCC":19687, "TTTCG":7174, "TTTGA":25543, "TTTGT":23016, "TTTGC":16130, "TTTGG":24354, + "TTCAA":15176, "TTCAT":14683, "TTCAC":13714, "TTCAG":17777, "TTCTA":14080, "TTCTT":23680, "TTCTC":21945, "TTCTG":23295, "TTCCA":45751, "TTCCT":22879, "TTCCC":20434, "TTCCG":9444, "TTCGA":8778, "TTCGT":6114, "TTCGC":6978, "TTCGG":9940, + "TTGAA":18355, "TTGAT":45462, "TTGAC":8681, "TTGAG":19065, "TTGTA":12727, "TTGTT":19727, "TTGTC":11709, "TTGTG":18936, "TTGCA":14867, "TTGCT":19533, "TTGCC":18106, "TTGCG":9890, "TTGGA":25599, "TTGGT":16045, "TTGGC":26079, "TTGGG":29382, + "TCAAA":18353, "TCAAT":9679, "TCAAC":12756, "TCAAG":17624, "TCATA":10834, "TCATT":14871, "TCATC":13295, "TCATG":15829, "TCACA":15955, "TCACT":18164, "TCACC":17458, "TCACG":9679, "TCAGA":23524, "TCAGT":14863, "TCAGC":18975, "TCAGG":22473, + "TCTAA":12056, "TCTAT":9173, "TCTAC":12029, "TCTAG":13691, "TCTTA":13304, "TCTTT":22567, "TCTTC":17909, "TCTTG":21402, "TCTCA":20348, "TCTCT":24309, "TCTCC":23147, "TCTCG":16985, 
"TCTGA":475138, "TCTGT":20211, "TCTGC":19048, "TCTGG":26603, + "TCCAA":22689, "TCCAT":19579, "TCCAC":22221, "TCCAG":30271, "TCCTA":12565, "TCCTT":17927, "TCCTC":23859, "TCCTG":31538, "TCCCA":25760, "TCCCT":22421, "TCCCC":24363, "TCCCG":18320, "TCCGA":9469, "TCCGT":6532, "TCCGC":12919, "TCCGG":14376, + "TCGAA":5924, "TCGAT":31835, "TCGAC":7635, "TCGAG":6851, "TCGTA":6275, "TCGTT":6663, "TCGTC":537241, "TCGTG":9662, "TCGCA":7552, "TCGCT":12966, "TCGCC":14597, "TCGCG":11833, "TCGGA":525350, "TCGGT":7960, "TCGGC":14622, "TCGGG":19774, + "TGAAA":24321, "TGAAT":14532, "TGAAC":469207, "TGAAG":24172, "TGATA":10338, "TGATT":13043, "TGATC":156719, "TGATG":14406, "TGACA":12265, "TGACT":12871, "TGACC":12689, "TGACG":6558, "TGAGA":25142, "TGAGT":15122, "TGAGC":21379, "TGAGG":29725, + "TGTAA":15457, "TGTAT":13675, "TGTAC":9233, "TGTAG":492200, "TGTTA":11708, "TGTTT":22400, "TGTTC":12383, "TGTTG":21040, "TGTCA":13672, "TGTCT":17210, "TGTCC":13547, "TGTCG":9491, "TGTGA":25390, "TGTGT":22935, "TGTGC":17483, "TGTGG":29891, + "TGCAA":15014, "TGCAT":13369, "TGCAC":16269, "TGCAG":26314, "TGCTA":11770, "TGCTT":19227, "TGCTC":17846, "TGCTG":33514, "TGCCA":21421, "TGCCT":29200, "TGCCC":25386, "TGCCG":14064, "TGCGA":16927, "TGCGT":7591, "TGCGC":16293, "TGCGG":19505, + "TGGAA":21938, "TGGAT":58712, "TGGAC":13925, "TGGAG":32296, "TGGTA":12989, "TGGTT":16961, "TGGTC":14221, "TGGTG":33546, "TGGCA":32232, "TGGCT":28035, "TGGCC":27238, "TGGCG":25986, "TGGGA":47654, "TGGGT":23316, "TGGGC":34607, "TGGGG":51519, + "CAAAA":30778, "CAAAT":18999, "CAAAC":16397, "CAAAG":24929, "CAATA":12939, "CAATT":11902, "CAATC":9848, "CAATG":17016, "CAACA":19853, "CAACT":15200, "CAACC":15639, "CAACG":11713, "CAAGA":32613, "CAAGT":14317, "CAAGC":17737, "CAAGG":22974, + "CATAA":13804, "CATAT":11895, "CATAC":9191, "CATAG":13990, "CATTA":11124, "CATTT":20116, "CATTC":12744, "CATTG":16368, "CATCA":13796, "CATCT":16488, "CATCC":13844, "CATCG":11363, "CATGA":25256, "CATGT":15346, "CATGC":16680, "CATGG":25409, + 
"CACAA":19972, "CACAT":16497, "CACAC":22796, "CACAG":24247, "CACTA":13222, "CACTT":18160, "CACTC":18355, "CACTG":32387, "CACCA":26053, "CACCT":24879, "CACCC":43667, "CACCG":14124, "CACGA":17513, "CACGT":7582, "CACGC":14114, "CACGG":14190, + "CAGAA":27364, "CAGAT":44774, "CAGAC":14725, "CAGAG":26086, "CAGTA":15799, "CAGTT":14343, "CAGTC":12840, "CAGTG":25398, "CAGCA":24473, "CAGCT":25164, "CAGCC":32868, "CAGCG":20972, "CAGGA":38895, "CAGGT":21356, "CAGGC":33821, "CAGGG":32654, + "CTAAA":16867, "CTAAT":11421, "CTAAC":10215, "CTAAG":14513, "CTATA":9905, "CTATT":10352, "CTATC":8389, "CTATG":12157, "CTACA":11399, "CTACT":13513, "CTACC":13186, "CTACG":6518, "CTAGA":25801, "CTAGT":8946, "CTAGC":12681, "CTAGG":17817, + "CTTAA":13732, "CTTAT":9928, "CTTAC":9538, "CTTAG":14233, "CTTTA":13262, "CTTTT":23679, "CTTTC":17284, "CTTTG":23566, "CTTCA":14519, "CTTCT":21017, "CTTCC":22333, "CTTCG":10417, "CTTGA":28851, "CTTGT":14784, "CTTGC":16771, "CTTGG":31941, + "CTCAA":19159, "CTCAT":15460, "CTCAC":21267, "CTCAG":30998, "CTCTA":14618, "CTCTT":20935, "CTCTC":24583, "CTCTG":485513, "CTCCA":21979, "CTCCT":31252, "CTCCC":36017, "CTCCG":16441, "CTCGA":18862, "CTCGT":8477, "CTCGC":17156, "CTCGG":23080, + "CTGAA":474969, "CTGAT":63544, "CTGAC":14219, "CTGAG":30506, "CTGTA":485196, "CTGTT":16720, "CTGTC":18520, "CTGTG":30020, "CTGCA":23205, "CTGCT":26450, "CTGCC":33331, "CTGCG":22352, "CTGGA":42266, "CTGGT":20479, "CTGGC":28242, "CTGGG":56393, + "CCAAA":23564, "CCAAT":13976, "CCAAC":16729, "CCAAG":28028, "CCATA":12688, "CCATT":16964, "CCATC":17603, "CCATG":26951, "CCACA":22590, "CCACT":24119, "CCACC":35529, "CCACG":16219, "CCAGA":33512, "CCAGT":16617, "CCAGC":35182, "CCAGG":44487, + "CCTAA":12604, "CCTAT":9065, "CCTAC":10059, "CCTAG":18930, "CCTTA":10949, "CCTTT":18851, "CCTTC":18265, "CCTTG":28086, "CCTCA":23627, "CCTCT":25981, "CCTCC":39791, "CCTCG":22434, "CCTGA":40004, "CCTGT":23234, "CCTGC":29170, "CCTGG":45845, + "CCCAA":21475, "CCCAT":17419, "CCCAC":24631, "CCCAG":43081, 
"CCCTA":13021, "CCCTT":20057, "CCCTC":27609, "CCCTG":37908, "CCCCA":29578, "CCCCT":27947, "CCCCC":33446, "CCCCG":26916, "CCCGA":40594, "CCCGT":9701, "CCCGC":25536, "CCCGG":30954, + "CCGAA":8112, "CCGAT":27287, "CCGAC":8455, "CCGAG":39957, "CCGTA":4284, "CCGTT":5544, "CCGTC":9547, "CCGTG":12875, "CCGCA":11481, "CCGCT":14521, "CCGCC":31892, "CCGCG":25505, "CCGGA":19515, "CCGGT":9665, "CCGGC":23388, "CCGGG":36482, + "CGAAA":6599, "CGAAT":5102, "CGAAC":7480, "CGAAG":9217, "CGATA":3224, "CGATT":3987, "CGATC":125904, "CGATG":6083, "CGACA":5475, "CGACT":6628, "CGACC":7864, "CGACG":10368, "CGAGA":39391, "CGAGT":10070, "CGAGC":12549, "CGAGG":17717, + "CGTAA":3963, "CGTAT":3720, "CGTAC":4630, "CGTAG":8584, "CGTTA":3478, "CGTTT":5739, "CGTTC":6542, "CGTTG":8596, "CGTCA":5439, "CGTCT":8598, "CGTCC":10725, "CGTCG":536775, "CGTGA":13893, "CGTGT":441413, "CGTGC":11462, "CGTGG":19487, + "CGCAA":7677, "CGCAT":6306, "CGCAC":13616, "CGCAG":17264, "CGCTA":6760, "CGCTT":10521, "CGCTC":14783, "CGCTG":24804, "CGCCA":16927, "CGCCT":21105, "CGCCC":27555, "CGCCG":28845, "CGCGA":17133, "CGCGT":8505, "CGCGC":26433, "CGCGG":34746, + "CGGAA":11680, "CGGAT":36972, "CGGAC":524782, "CGGAG":22495, "CGGTA":5664, "CGGTT":6823, "CGGTC":9780, "CGGTG":18619, "CGGCA":12751, "CGGCT":18403, "CGGCC":27257, "CGGCG":35972, "CGGGA":32334, "CGGGT":16928, "CGGGC":34426, "CGGGG":42861, + "GAAAA":28339, "GAAAT":22224, "GAAAC":17802, "GAAAG":26091, "GAATA":12092, "GAATT":42685, "GAATC":12536, "GAATG":18029, "GAACA":20902, "GAACT":482442, "GAACC":17777, "GAACG":464115, "GAAGA":31870, "GAAGT":15608, "GAAGC":20720, "GAAGG":28769, + "GATAA":11243, "GATAT":8100, "GATAC":6900, "GATAG":12579, "GATTA":11030, "GATTT":13539, "GATTC":10523, "GATTG":11687, "GATCA":12332, "GATCT":20365, "GATCC":12448, "GATCG":551311, "GATGA":16077, "GATGT":10792, "GATGC":11066, "GATGG":22922, + "GACAA":11935, "GACAT":10538, "GACAC":11824, "GACAG":23459, "GACTA":9618, "GACTT":12878, "GACTC":16779, "GACTG":512459, "GACCA":13961, "GACCT":14418, 
"GACCC":16237, "GACCG":9650, "GACGA":9157, "GACGT":6081, "GACGC":10169, "GACGG":17273, + "GAGAA":51564, "GAGAT":34097, "GAGAC":19581, "GAGAG":28918, "GAGTA":10263, "GAGTT":14000, "GAGTC":14195, "GAGTG":21729, "GAGCA":17611, "GAGCT":21426, "GAGCC":29376, "GAGCG":21901, "GAGGA":33883, "GAGGT":23792, "GAGGC":41912, "GAGGG":35618, + "GTAAA":13870, "GTAAT":12639, "GTAAC":8367, "GTAAG":12070, "GTATA":9086, "GTATT":11476, "GTATC":6684, "GTATG":10288, "GTACA":9535, "GTACT":8800, "GTACC":7805, "GTACG":5421, "GTAGA":499569, "GTAGT":10927, "GTAGC":14928, "GTAGG":18502, + "GTTAA":9963, "GTTAT":8381, "GTTAC":7267, "GTTAG":11241, "GTTTA":10542, "GTTTT":21786, "GTTTC":15355, "GTTTG":16413, "GTTCA":11950, "GTTCT":13584, "GTTCC":12937, "GTTCG":7459, "GTTGA":16891, "GTTGT":12445, "GTTGC":14921, "GTTGG":21693, + "GTCAA":8590, "GTCAT":10032, "GTCAC":11620, "GTCAG":15974, "GTCTA":6879, "GTCTT":14125, "GTCTC":19044, "GTCTG":17605, "GTCCA":10073, "GTCCT":14016, "GTCCC":17661, "GTCCG":10873, "GTCGA":8143, "GTCGT":7433, "GTCGC":13387, "GTCGG":540040, + "GTGAA":16778, "GTGAT":42240, "GTGAC":12400, "GTGAG":24846, "GTGTA":18142, "GTGTT":15422, "GTGTC":13707, "GTGTG":27617, "GTGCA":16964, "GTGCT":19439, "GTGCC":19810, "GTGCG":15220, "GTGGA":31188, "GTGGT":23955, "GTGGC":31938, "GTGGG":45251, + "GCAAA":16852, "GCAAT":12398, "GCAAC":14332, "GCAAG":21205, "GCATA":9940, "GCATT":12753, "GCATC":11716, "GCATG":21074, "GCACA":18789, "GCACT":19802, "GCACC":37336, "GCACG":15962, "GCAGA":28589, "GCAGT":19226, "GCAGC":31695, "GCAGG":37819, + "GCTAA":12663, "GCTAT":8957, "GCTAC":11088, "GCTAG":17326, "GCTTA":10006, "GCTTT":16609, "GCTTC":17957, "GCTTG":22702, "GCTCA":20075, "GCTCT":21409, "GCTCC":23657, "GCTCG":18296, "GCTGA":36860, "GCTGT":24422, "GCTGC":37634, "GCTGG":53769, + "GCCAA":18324, "GCCAT":19780, "GCCAC":25653, "GCCAG":34002, "GCCTA":12605, "GCCTT":19693, "GCCTC":37066, "GCCTG":41948, "GCCCA":26858, "GCCCT":27215, "GCCCC":35640, "GCCCG":26600, "GCCGA":20034, "GCCGT":9793, "GCCGC":32741, 
"GCCGG":33450, + "GCGAA":7034, "GCGAT":48678, "GCGAC":8299, "GCGAG":18556, "GCGTA":4535, "GCGTT":6984, "GCGTC":10868, "GCGTG":20100, "GCGCA":14680, "GCGCT":19659, "GCGCC":31402, "GCGCG":36411, "GCGGA":32288, "GCGGT":16489, "GCGGC":45598, "GCGGG":53076, + "GGAAA":25299, "GGAAT":17488, "GGAAC":15925, "GGAAG":32817, "GGATA":10515, "GGATT":15431, "GGATC":171012, "GGATG":19170, "GGACA":19682, "GGACT":522209, "GGACC":16086, "GGACG":14159, "GGAGA":37217, "GGAGT":22386, "GGAGC":33141, "GGAGG":53899, + "GGTAA":12198, "GGTAT":9070, "GGTAC":7691, "GGTAG":18427, "GGTTA":9300, "GGTTT":17854, "GGTTC":14222, "GGTTG":19222, "GGTCA":12380, "GGTCT":15849, "GGTCC":14689, "GGTCG":15766, "GGTGA":30707, "GGTGT":20157, "GGTGC":21795, "GGTGG":49925, + "GGCAA":19231, "GGCAT":17746, "GGCAC":36519, "GGCAG":40897, "GGCTA":12960, "GGCTT":18733, "GGCTC":28693, "GGCTG":56521, "GGCCA":28442, "GGCCT":27986, "GGCCC":34082, "GGCCG":32329, "GGCGA":27323, "GGCGT":15620, "GGCGC":35073, "GGCGG":64595, + "GGGAA":28548, "GGGAT":67168, "GGGAC":21828, "GGGAG":53607, "GGGTA":12539, "GGGTT":20153, "GGGTC":20951, "GGGTG":38420, "GGGCA":29542, "GGGCT":33928, "GGGCC":37599, "GGGCG":49766, "GGGGA":52154, "GGGGT":31489, "GGGGC":49488, "GGGGG":530960 + }, + "overrepresented_sequences": { + "AAAAAAAAAAAAAAAAAAAA":78, + "AAAGATCGTCGGACTGTAGA":233, + "AACGATCGTCGGACTGTAGA":304, + "AAGGATCGTCGGACTGTAGA":292, + "AATGATCGTCGGACTGTAGA":322, + "ACAGATCGTCGGACTGTAGA":183, + "ACCGATCGTCGGACTGTAGA":203, + "ACCTGATCGTCGGACTGTAG":144, + "ACGGATCGTCGGACTGTAGA":198, + "ACTGATCGTCGGACTGTAGA":162, + "ACTGTAGAACTCTGAACGTG":18, + "AGATCGTCGGACTGTAGAAC":38, + "AGCGATCGTCGGACTGTAGA":427, + "AGCTGATCGTCGGACTGTAG":152, + "AGGGATCGTCGGACTGTAGA":291, + "AGTGATCGTCGGACTGTAGA":232, + "ATAGATCGTCGGACTGTAGA":294, + "ATCGATCGTCGGACTGTAGA":188, + "ATCGTCGGACTGTAGAACTC":36, + "ATGATCGTCGGACTGTAGAA":17, + "ATGGATCGTCGGACTGTAGA":204, + "ATTGATCGTCGGACTGTAGA":306, + "CAAAGATCGTCGGACTGTAG":136, + "CAAGATCGTCGGACTGTAGA":191, + 
"CAAGGATCGTCGGACTGTAG":138, + "CACGATCGTCGGACTGTAGA":248, + "CACTGATCGTCGGACTGTAG":182, + "CAGGATCGTCGGACTGTAGA":105, + "CATCGATCGTCGGACTGTAG":120, + "CATGATCGTCGGACTGTAGA":191, + "CATGGATCGTCGGACTGTAG":132, + "CCAAGATCGTCGGACTGTAG":225, + "CCAGATCGTCGGACTGTAGA":137, + "CCAGGATCGTCGGACTGTAG":183, + "CCATGATCGTCGGACTGTAG":187, + "CCCAGATCGTCGGACTGTAG":212, + "CCCCTGATCGTCGGACTGTA":138, + "CCCGATCGTCGGACTGTAGA":334, + "CCCGGATCGTCGGACTGTAG":115, + "CCCTGATCGTCGGACTGTAG":251, + "CCGATCGTCGGACTGTAGAA":92, + "CCGGATCGTCGGACTGTAGA":108, + "CCTAGATCGTCGGACTGTAG":178, + "CCTCGATCGTCGGACTGTAG":171, + "CCTGGATCGTCGGACTGTAG":256, + "CCTTGATCGTCGGACTGTAG":195, + "CGAGATCGTCGGACTGTAGA":136, + "CGATCGTCGGACTGTAGAAC":26, + "CGCGATCGTCGGACTGTAGA":212, + "CGCGGATCGTCGGACTGTAG":115, + "CGCTGATCGTCGGACTGTAG":134, + "CGGACTGTAGAACTCTGAAC":77, + "CGGGATCGTCGGACTGTAGA":243, + "CGGGGATCGTCGGACTGTAG":125, + "CGTCGGACTGTAGAACTCTG":68, + "CGTGATCGTCGGACTGTAGA":250, + "CTAGATCGTCGGACTGTAGA":168, + "CTAGGATCGTCGGACTGTAG":113, + "CTCGATCGTCGGACTGTAGA":322, + "CTCTGATCGTCGGACTGTAG":144, + "CTGCGATCGTCGGACTGTAG":124, + "CTGGATCGTCGGACTGTAGA":162, + "CTGGGATCGTCGGACTGTAG":227, + "CTGTGATCGTCGGACTGTAG":134, + "CTTGATCGTCGGACTGTAGA":205, + "CTTGGCACCCGAGAATTCCA":146, + "CTTTGATCGTCGGACTGTAG":123, + "GAAGATCGTCGGACTGTAGA":264, + "GACGATCGTCGGACTGTAGA":186, + "GACTGTAGAACTCTGAACGT":20, + "GAGATCGTCGGACTGTAGAA":95, + "GAGGATCGTCGGACTGTAGA":116, + "GATCGTCGGACTGTAGAACT":140, + "GATCGTCGGACTGTAGAACTCTGAACGTGTAGATCT":247, + "GATGATCGTCGGACTGTAGA":189, + "GCAAGATCGTCGGACTGTAG":157, + "GCACGATCGTCGGACTGTAG":149, + "GCAGATCGTCGGACTGTAGA":194, + "GCAGGATCGTCGGACTGTAG":165, + "GCATGATCGTCGGACTGTAG":149, + "GCCAGATCGTCGGACTGTAG":195, + "GCCGATCGTCGGACTGTAGA":255, + "GCCTGATCGTCGGACTGTAG":198, + "GCGCGATCGTCGGACTGTAG":167, + "GCGGATCGTCGGACTGTAGA":214, + "GCGGGATCGTCGGACTGTAG":231, + "GCTAGATCGTCGGACTGTAG":148, + "GCTGGATCGTCGGACTGTAG":227, + "GCTTGATCGTCGGACTGTAG":142, + "GGACTGTAGAACTCTGAACG":77, + 
"GGAGATCGTCGGACTGTAGA":161, + "GGAGGATCGTCGGACTGTAG":136, + "GGATCGTCGGACTGTAGAAC":19, + "GGCAGATCGTCGGACTGTAG":150, + "GGCGATCGTCGGACTGTAGA":267, + "GGCGGATCGTCGGACTGTAG":210, + "GGCTGATCGTCGGACTGTAG":193, + "GGGAGATCGTCGGACTGTAG":164, + "GGGCGATCGTCGGACTGTAG":241, + "GGGGATCGTCGGACTGTAGA":111, + "GGGGGATCGTCGGACTGTAG":273, + "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG":607, + "GGTGATCGTCGGACTGTAGA":358, + "GGTGGATCGTCGGACTGTAG":163, + "GTAGATCGTCGGACTGTAGA":294, + "GTCGATCGTCGGACTGTAGA":151, + "GTCGGACTGTAGAACTCTGA":90, + "GTGGATCGTCGGACTGTAGA":224, + "GTGGGATCGTCGGACTGTAG":135, + "GTTGATCGTCGGACTGTAGA":232, + "NNNNNNNNNNNNNNNNNNNN":829, + "TAAGATCGTCGGACTGTAGA":318, + "TACGATCGTCGGACTGTAGA":217, + "TAGATCGTCGGACTGTAGAA":12, + "TAGGATCGTCGGACTGTAGA":222, + "TATGATCGTCGGACTGTAGA":234, + "TCAGATCGTCGGACTGTAGA":147, + "TCCTGATCGTCGGACTGTAG":146, + "TCGGACTGTAGAACTCTGAA":85, + "TCGGATCGTCGGACTGTAGA":211, + "TCGTCGGACTGTAGAACTCT":51, + "TCTGATCGTCGGACTGTAGA":171, + "TGAGATCGTCGGACTGTAGA":201, + "TGATCGTCGGACTGTAGAAC":30, + "TGCGATCGTCGGACTGTAGA":264, + "TGCTGATCGTCGGACTGTAG":149, + "TGGGATCGTCGGACTGTAGA":195, + "TGGGGATCGTCGGACTGTAG":181, + "TGTGATCGTCGGACTGTAGA":216, + "TTAGATCGTCGGACTGTAGA":258, + "TTCGATCGTCGGACTGTAGA":189, + "TTGGATCGTCGGACTGTAGA":318, + "TTTGATCGTCGGACTGTAGA":146 } + }, + "read1_after_filtering": { + "total_reads": 439519, + "total_bases": 16134044, + "q20_bases": 15454671, + "q30_bases": 15178418, + "total_cycles": 38, + "quality_curves": { + "A":[30.6532,30.5453,30.8913,30.8849,30.9311,34.3902,34.3326,34.2759,34.6565,33.5438,33.4735,33.5068,34.5057,33.0241,33.5308,34.9244,34.8535,33.9026,33.836,33.0547,34.616,33.1939,32.4045,34.6388,33.0761,33.7631,33.4267,33.0139,32.9276,32.7339,34.3503,32.8285,33.5863,33.5355,34.4883,32.7721,34.1664,34.5141], + 
"T":[31.3078,31.1954,31.2971,31.318,31.3362,35.0028,35.0142,35.0109,35.2502,35.2586,34.5057,35.0355,35.1825,34.3324,34.9186,35.2775,35.2669,35.2803,35.1731,34.3213,35.1984,34.2713,35.1441,35.2433,35.1893,35.183,35.0639,34.2394,34.7854,33.9934,35.1507,35.2173,35.1709,35.0959,35.1152,34.2133,35.1085,33.9155], + "C":[31.3065,31.2402,31.3489,31.3552,31.3797,35.0545,35.0538,35.0951,35.3558,35.2595,35.2501,34.3734,35.2926,35.1488,34.1112,35.2777,35.3457,35.338,34.4882,35.0701,35.3257,35.1122,35.2467,35.3024,35.1783,35.2334,34.4872,34.9965,33.8475,34.8599,35.2382,35.157,35.2116,34.5276,35.1893,35.0442,35.2739,34.8959], + "G":[31.0271,30.9235,31.1206,31.1744,31.1499,34.778,34.7776,34.767,34.2991,34.8659,35.0226,35.0349,34.1258,34.9288,34.9755,34.3316,34.3558,34.9292,35.1251,34.9157,34.2145,34.8996,34.7812,34.3093,34.8709,34.882,35.012,34.8599,34.7999,34.6789,33.9708,34.788,34.82,34.9695,34.027,34.7688,34.0838,34.7812], + "mean":[30.9883,30.9649,31.1654,31.189,31.2038,34.8159,34.7988,34.7871,34.8567,34.7021,34.5837,34.5052,34.732,34.3846,34.3951,34.9203,34.9251,34.8506,34.6568,34.3723,34.8104,34.4034,34.393,34.8491,34.5659,34.7538,34.4988,34.3057,34.105,34.1008,34.6506,34.4881,34.6932,34.5402,34.6847,34.2513,34.6906,34.5141] + }, + "content_curves": { + "A":[0.369968,0.266667,0.243432,0.231772,0.231583,0.228707,0.232379,0.240816,0.222382,0.261274,0.233696,0.231985,0.233023,0.231692,0.229876,0.223333,0.223679,0.251853,0.234267,0.233034,0.236363,0.235403,0.246242,0.236588,0.250974,0.253052,0.243299,0.239327,0.236056,0.236394,0.233299,0.248975,0.246062,0.235566,0.22024,0.223857,0.179453,0], + "T":[0.207816,0.261224,0.266491,0.263752,0.249741,0.241496,0.229412,0.235335,0.22748,0.226129,0.254091,0.224052,0.233546,0.259008,0.229471,0.231046,0.228532,0.223046,0.219397,0.248556,0.228104,0.235906,0.225647,0.224771,0.22549,0.225194,0.223146,0.235225,0.225901,0.236901,0.233142,0.227017,0.22854,0.230398,0.234023,0.248826,0.265889,0.349438], + 
"C":[0.148023,0.225119,0.234859,0.242479,0.252608,0.261572,0.254742,0.246699,0.247864,0.237278,0.239373,0.26491,0.232438,0.235885,0.272118,0.251607,0.252492,0.249466,0.277665,0.249682,0.248986,0.252754,0.247628,0.253903,0.242044,0.249389,0.263877,0.251336,0.260841,0.252284,0.250995,0.250514,0.253489,0.267365,0.260156,0.25354,0.269998,0.362529], + "G":[0.274186,0.24699,0.255218,0.261998,0.266068,0.268225,0.283467,0.277151,0.302269,0.275301,0.272839,0.279051,0.300981,0.273406,0.268534,0.294005,0.295293,0.275626,0.268671,0.268712,0.286545,0.275931,0.280479,0.284732,0.281469,0.272352,0.269659,0.274086,0.277164,0.274372,0.282503,0.273437,0.271825,0.266595,0.285551,0.273712,0.284603,0.287846], + "N":[6.82564e-06,0,0,0,0,0,0,0,4.55043e-06,1.82017e-05,0,2.27521e-06,1.13761e-05,9.10086e-06,0,9.10086e-06,4.55043e-06,9.10086e-06,0,1.59265e-05,2.27521e-06,6.82564e-06,4.55043e-06,6.82564e-06,2.27521e-05,1.36513e-05,1.84418e-05,2.57565e-05,3.82753e-05,4.85115e-05,6.13363e-05,5.7072e-05,8.2911e-05,7.59955e-05,3.03982e-05,6.47944e-05,5.57543e-05,0.000187018], + "GC":[0.422209,0.472109,0.490077,0.504476,0.518676,0.529797,0.538209,0.52385,0.550133,0.512579,0.512212,0.543961,0.533419,0.509291,0.540652,0.545612,0.547785,0.525092,0.546336,0.518394,0.535531,0.528685,0.528107,0.538634,0.523513,0.521741,0.533537,0.525423,0.538005,0.526656,0.533498,0.52395,0.525314,0.53396,0.545706,0.527253,0.554602,0.650375] + }, + "kmer_count": { + "AAAAA":66178, "AAAAT":31082, "AAAAC":19402, "AAAAG":21480, "AAATA":23999, "AAATT":21294, "AAATC":13821, "AAATG":19489, "AAACA":19196, "AAACT":16973, "AAACC":14126, "AAACG":6086, "AAAGA":20768, "AAAGT":17811, "AAAGC":14513, "AAAGG":18422, + "AATAA":18974, "AATAT":14556, "AATAC":10573, "AATAG":10198, "AATTA":15204, "AATTT":19652, "AATTC":30943, "AATTG":10658, "AATCA":11846, "AATCT":12302, "AATCC":13743, "AATCG":4148, "AATGA":13965, "AATGT":12859, "AATGC":11141, "AATGG":14646, + "AACAA":17877, "AACAT":14107, "AACAC":12605, "AACAG":13790, "AACTA":10695, 
"AACTT":13395, "AACTC":17790, "AACTG":13290, "AACCA":15248, "AACCT":14395, "AACCC":13180, "AACCG":4700, "AACGA":5965, "AACGT":9237, "AACGC":5360, "AACGG":5503, + "AAGAA":23262, "AAGAT":14829, "AAGAC":12114, "AAGAG":17498, "AAGTA":11333, "AAGTT":12680, "AAGTC":10647, "AAGTG":18066, "AAGCA":15329, "AAGCT":13513, "AAGCC":14527, "AAGCG":8619, "AAGGA":19226, "AAGGT":13096, "AAGGC":16269, "AAGGG":17595, + "ATAAA":19526, "ATAAT":12482, "ATAAC":9130, "ATAAG":8598, "ATATA":12206, "ATATT":13982, "ATATC":6852, "ATATG":9178, "ATACA":10785, "ATACT":8813, "ATACC":7325, "ATACG":2112, "ATAGA":9411, "ATAGT":8887, "ATAGC":7494, "ATAGG":8305, + "ATTAA":13355, "ATTAT":12464, "ATTAC":9589, "ATTAG":9794, "ATTTA":15478, "ATTTT":29205, "ATTTC":14134, "ATTTG":14693, "ATTCA":11866, "ATTCT":17584, "ATTCC":29174, "ATTCG":3388, "ATTGA":9727, "ATTGT":10406, "ATTGC":9603, "ATTGG":9953, + "ATCAA":9445, "ATCAT":10679, "ATCAC":10846, "ATCAG":10175, "ATCTA":7590, "ATCTT":12950, "ATCTC":13227, "ATCTG":12549, "ATCCA":10671, "ATCCT":13260, "ATCCC":13214, "ATCCG":4775, "ATCGA":3847, "ATCGT":12862, "ATCGC":5155, "ATCGG":5211, + "ATGAA":14748, "ATGAT":11357, "ATGAC":7302, "ATGAG":11966, "ATGTA":9339, "ATGTT":13478, "ATGTC":8564, "ATGTG":13453, "ATGCA":9580, "ATGCT":10963, "ATGCC":12176, "ATGCG":4633, "ATGGA":14658, "ATGGT":12975, "ATGGC":16000, "ATGGG":14677, + "ACAAA":19315, "ACAAT":10158, "ACAAC":10776, "ACAAG":11559, "ACATA":9702, "ACATT":12145, "ACATC":8685, "ACATG":12781, "ACACA":16408, "ACACT":11989, "ACACC":11427, "ACACG":5522, "ACAGA":17382, "ACAGT":12982, "ACAGC":14182, "ACAGG":18830, + "ACTAA":10024, "ACTAT":8920, "ACTAC":8822, "ACTAG":8314, "ACTTA":9154, "ACTTT":16562, "ACTTC":11729, "ACTTG":13822, "ACTCA":13457, "ACTCT":18581, "ACTCC":15119, "ACTCG":6976, "ACTGA":15203, "ACTGT":17301, "ACTGC":15511, "ACTGG":14699, + "ACCAA":12113, "ACCAT":12393, "ACCAC":18234, "ACCAG":14361, "ACCTA":7715, "ACCTT":11513, "ACCTC":16266, "ACCTG":18347, "ACCCA":14624, "ACCCT":12751, "ACCCC":14774, "ACCCG":23831, 
"ACCGA":4832, "ACCGT":4250, "ACCGC":8520, "ACCGG":7277, + "ACGAA":4187, "ACGAT":4042, "ACGAC":3849, "ACGAG":6050, "ACGTA":2957, "ACGTT":3512, "ACGTC":4186, "ACGTG":11000, "ACGCA":5486, "ACGCT":4976, "ACGCC":9092, "ACGCG":6577, "ACGGA":7032, "ACGGT":4574, "ACGGC":7477, "ACGGG":8911, + "AGAAA":25438, "AGAAT":34885, "AGAAC":21303, "AGAAG":20450, "AGATA":10026, "AGATT":11866, "AGATC":13078, "AGATG":16953, "AGACA":15036, "AGACT":12529, "AGACC":14386, "AGACG":8328, "AGAGA":23034, "AGAGT":13757, "AGAGC":18857, "AGAGG":26889, + "AGTAA":10560, "AGTAT":8723, "AGTAC":7342, "AGTAG":13897, "AGTTA":8926, "AGTTT":15994, "AGTTC":11696, "AGTTG":12273, "AGTCA":10919, "AGTCT":13587, "AGTCC":12551, "AGTCG":6853, "AGTGA":16941, "AGTGT":13803, "AGTGC":16858, "AGTGG":21464, + "AGCAA":13504, "AGCAT":11500, "AGCAC":13510, "AGCAG":22948, "AGCTA":11784, "AGCTT":12752, "AGCTC":16033, "AGCTG":26248, "AGCCA":21384, "AGCCT":22960, "AGCCC":20145, "AGCCG":16397, "AGCGA":7827, "AGCGT":6106, "AGCGC":13808, "AGCGG":17719, + "AGGAA":22341, "AGGAT":15281, "AGGAC":13557, "AGGAG":30615, "AGGTA":10552, "AGGTT":13927, "AGGTC":12442, "AGGTG":24629, "AGGCA":21995, "AGGCT":27065, "AGGCC":22647, "AGGCG":20392, "AGGGA":23083, "AGGGT":16395, "AGGGC":20996, "AGGGG":22466, + "TAAAA":24421, "TAAAT":15906, "TAAAC":10335, "TAAAG":12427, "TAATA":10493, "TAATT":13759, "TAATC":9894, "TAATG":9462, "TAACA":10198, "TAACT":9088, "TAACC":8380, "TAACG":3253, "TAAGA":11026, "TAAGT":9077, "TAAGC":7759, "TAAGG":10453, + "TATAA":11307, "TATAT":11776, "TATAC":6218, "TATAG":7731, "TATTA":10506, "TATTT":21341, "TATTC":9135, "TATTG":8806, "TATCA":7389, "TATCT":9162, "TATCC":6346, "TATCG":2189, "TATGA":8636, "TATGT":9555, "TATGC":6020, "TATGG":8206, + "TACAA":11003, "TACAT":8886, "TACAC":7136, "TACAG":12482, "TACTA":7774, "TACTT":11162, "TACTC":8468, "TACTG":9152, "TACCA":10475, "TACCT":9477, "TACCC":7907, "TACCG":3621, "TACGA":2662, "TACGT":2506, "TACGC":2892, "TACGG":3302, + "TAGAA":18321, "TAGAT":8502, "TAGAC":7169, "TAGAG":12581, 
"TAGTA":8426, "TAGTT":9152, "TAGTC":7604, "TAGTG":9429, "TAGCA":9269, "TAGCT":12492, "TAGCC":10393, "TAGCG":4675, "TAGGA":11608, "TAGGT":8753, "TAGGC":9005, "TAGGG":11093, + "TTAAA":19911, "TTAAT":12420, "TTAAC":8276, "TTAAG":11225, "TTATA":10535, "TTATT":18031, "TTATC":7712, "TTATG":9233, "TTACA":11786, "TTACT":10051, "TTACC":7955, "TTACG":2320, "TTAGA":10371, "TTAGT":10448, "TTAGC":10160, "TTAGG":10689, + "TTTAA":19166, "TTTAT":18707, "TTTAC":9644, "TTTAG":12882, "TTTTA":23565, "TTTTT":55952, "TTTTC":23065, "TTTTG":25174, "TTTCA":17293, "TTTCT":26493, "TTTCC":18098, "TTTCG":4992, "TTTGA":16991, "TTTGT":21092, "TTTGC":13573, "TTTGG":20120, + "TTCAA":12830, "TTCAT":12970, "TTCAC":11740, "TTCAG":16082, "TTCTA":12648, "TTCTT":21961, "TTCTC":20188, "TTCTG":20450, "TTCCA":32011, "TTCCT":20481, "TTCCC":17864, "TTCCG":8045, "TTCGA":3559, "TTCGT":4723, "TTCGC":5249, "TTCGG":7671, + "TTGAA":15705, "TTGAT":11392, "TTGAC":7445, "TTGAG":16233, "TTGTA":10830, "TTGTT":17932, "TTGTC":10279, "TTGTG":14727, "TTGCA":12133, "TTGCT":15959, "TTGCC":14341, "TTGCG":5143, "TTGGA":14742, "TTGGT":13394, "TTGGC":21489, "TTGGG":21689, + "TCAAA":14203, "TCAAT":7264, "TCAAC":8919, "TCAAG":12804, "TCATA":8380, "TCATT":13191, "TCATC":10185, "TCATG":11703, "TCACA":12669, "TCACT":15245, "TCACC":14161, "TCACG":6017, "TCAGA":14967, "TCAGT":13460, "TCAGC":17032, "TCAGG":19605, + "TCTAA":9475, "TCTAT":7808, "TCTAC":10200, "TCTAG":9655, "TCTTA":11149, "TCTTT":20793, "TCTTC":15555, "TCTTG":16899, "TCTCA":17409, "TCTCT":21461, "TCTCC":20698, "TCTCG":9924, "TCTGA":18142, "TCTGT":18447, "TCTGC":16903, "TCTGG":21437, + "TCCAA":16359, "TCCAT":14610, "TCCAC":16238, "TCCAG":23361, "TCCTA":10189, "TCCTT":15705, "TCCTC":19642, "TCCTG":25158, "TCCCA":21862, "TCCCT":18022, "TCCCC":19742, "TCCCG":13960, "TCCGA":5246, "TCCGT":5605, "TCCGC":11489, "TCCGG":12589, + "TCGAA":3968, "TCGAT":3718, "TCGAC":3436, "TCGAG":5202, "TCGTA":3071, "TCGTT":4729, "TCGTC":13628, "TCGTG":6351, "TCGCA":4946, "TCGCT":10136, 
"TCGCC":10877, "TCGCG":7423, "TCGGA":17294, "TCGGT":6632, "TCGGC":12005, "TCGGG":13804, + "TGAAA":18630, "TGAAT":12042, "TGAAC":19197, "TGAAG":16676, "TGATA":7882, "TGATT":11635, "TGATC":14398, "TGATG":11646, "TGACA":10398, "TGACT":10989, "TGACC":10952, "TGACG":4378, "TGAGA":18385, "TGAGT":13993, "TGAGC":18405, "TGAGG":25463, + "TGTAA":12961, "TGTAT":11136, "TGTAC":7242, "TGTAG":16213, "TGTTA":10064, "TGTTT":20749, "TGTTC":11559, "TGTTG":16650, "TGTCA":12046, "TGTCT":15920, "TGTCC":12608, "TGTCG":6849, "TGTGA":15022, "TGTGT":20246, "TGTGC":14650, "TGTGG":21142, + "TGCAA":11448, "TGCAT":10311, "TGCAC":11914, "TGCAG":22628, "TGCTA":8786, "TGCTT":15837, "TGCTC":14419, "TGCTG":24729, "TGCCA":16209, "TGCCT":22857, "TGCCC":19517, "TGCCG":10090, "TGCGA":5639, "TGCGT":5499, "TGCGC":11627, "TGCGG":13537, + "TGGAA":17775, "TGGAT":14341, "TGGAC":10449, "TGGAG":27340, "TGGTA":9674, "TGGTT":14487, "TGGTC":12178, "TGGTG":25142, "TGGCA":25667, "TGGCT":22047, "TGGCC":22031, "TGGCG":18560, "TGGGA":28145, "TGGGT":19315, "TGGGC":25264, "TGGGG":32006, + "CAAAA":20513, "CAAAT":13843, "CAAAC":11484, "CAAAG":16274, "CAATA":8422, "CAATT":8895, "CAATC":6748, "CAATG":10496, "CAACA":13514, "CAACT":10299, "CAACC":10842, "CAACG":5261, "CAAGA":14737, "CAAGT":11765, "CAAGC":12619, "CAAGG":15485, + "CATAA":9377, "CATAT":8979, "CATAC":6452, "CATAG":7800, "CATTA":8110, "CATTT":17543, "CATTC":10742, "CATTG":10732, "CATCA":10260, "CATCT":13062, "CATCC":11081, "CATCG":4575, "CATGA":11190, "CATGT":12693, "CATGC":12062, "CATGG":17439, + "CACAA":12544, "CACAT":11769, "CACAC":15849, "CACAG":18124, "CACTA":9003, "CACTT":14269, "CACTC":13271, "CACTG":20585, "CACCA":18526, "CACCT":17693, "CACCC":30932, "CACCG":9077, "CACGA":5116, "CACGT":5465, "CACGC":9386, "CACGG":8492, + "CAGAA":19251, "CAGAT":12460, "CAGAC":12518, "CAGAG":23582, "CAGTA":10391, "CAGTT":12934, "CAGTC":11410, "CAGTG":21604, "CAGCA":19752, "CAGCT":21405, "CAGCC":29417, "CAGCG":14711, "CAGGA":24320, "CAGGT":18676, "CAGGC":29405, "CAGGG":27442, 
+ "CTAAA":12254, "CTAAT":8689, "CTAAC":7558, "CTAAG":9315, "CTATA":7169, "CTATT":8464, "CTATC":5800, "CTATG":7627, "CTACA":9229, "CTACT":11376, "CTACC":10616, "CTACG":4116, "CTAGA":10367, "CTAGT":6958, "CTAGC":8790, "CTAGG":11623, + "CTTAA":10479, "CTTAT":7746, "CTTAC":7434, "CTTAG":9676, "CTTTA":10858, "CTTTT":21292, "CTTTC":15431, "CTTTG":18218, "CTTCA":12530, "CTTCT":18754, "CTTCC":19515, "CTTCG":6657, "CTTGA":13794, "CTTGT":12139, "CTTGC":12706, "CTTGG":24342, + "CTCAA":13846, "CTCAT":11841, "CTCAC":16483, "CTCAG":24063, "CTCTA":10760, "CTCTT":17027, "CTCTC":19460, "CTCTG":28216, "CTCCA":18022, "CTCCT":24788, "CTCCC":27859, "CTCCG":13475, "CTCGA":5494, "CTCGT":6019, "CTCGC":12839, "CTCGG":16886, + "CTGAA":21336, "CTGAT":11064, "CTGAC":11888, "CTGAG":26022, "CTGTA":18086, "CTGTT":14502, "CTGTC":16810, "CTGTG":22363, "CTGCA":19974, "CTGCT":21440, "CTGCC":27028, "CTGCG":16411, "CTGGA":22210, "CTGGT":16802, "CTGGC":24290, "CTGGG":42210, + "CCAAA":16638, "CCAAT":9677, "CCAAC":11691, "CCAAG":17497, "CCATA":8311, "CCATT":12924, "CCATC":12639, "CCATG":17822, "CCACA":16047, "CCACT":17507, "CCACC":25212, "CCACG":9682, "CCAGA":17125, "CCAGT":13847, "CCAGC":28836, "CCAGG":34696, + "CCTAA":9148, "CCTAT":6616, "CCTAC":8082, "CCTAG":10663, "CCTTA":7866, "CCTTT":15476, "CCTTC":15496, "CCTTG":18802, "CCTCA":18369, "CCTCT":19007, "CCTCC":30490, "CCTCG":13394, "CCTGA":17181, "CCTGT":19083, "CCTGC":23717, "CCTGG":33683, + "CCCAA":14766, "CCCAT":11817, "CCCAC":17536, "CCCAG":32080, "CCCTA":8256, "CCCTT":14269, "CCCTC":19771, "CCCTG":23004, "CCCCA":19837, "CCCCT":17361, "CCCCC":21116, "CCCCG":19164, "CCCGA":24427, "CCCGT":7837, "CCCGC":20756, "CCCGG":24345, + "CCGAA":6378, "CCGAT":4360, "CCGAC":6382, "CCGAG":30246, "CCGTA":2980, "CCGTT":4376, "CCGTC":8142, "CCGTG":9482, "CCGCA":8877, "CCGCT":11305, "CCGCC":26529, "CCGCG":21020, "CCGGA":11563, "CCGGT":8134, "CCGGC":21242, "CCGGG":28916, + "CGAAA":4511, "CGAAT":3777, "CGAAC":4775, "CGAAG":6416, "CGATA":1867, "CGATT":3180, 
"CGATC":7730, "CGATG":4454, "CGACA":3898, "CGACT":4689, "CGACC":6490, "CGACG":5187, "CGAGA":25624, "CGAGT":6511, "CGAGC":10696, "CGAGG":14202, + "CGTAA":2431, "CGTAT":2236, "CGTAC":2731, "CGTAG":4110, "CGTTA":2271, "CGTTT":4523, "CGTTC":5851, "CGTTG":4932, "CGTCA":3951, "CGTCT":6701, "CGTCC":8854, "CGTCG":14777, "CGTGA":6685, "CGTGT":8119, "CGTGC":8645, "CGTGG":12697, + "CGCAA":4297, "CGCAT":3776, "CGCAC":8674, "CGCAG":12701, "CGCTA":4081, "CGCTT":7951, "CGCTC":11169, "CGCTG":16593, "CGCCA":10675, "CGCCT":15080, "CGCCC":19700, "CGCCG":23592, "CGCGA":6616, "CGCGT":6402, "CGCGC":19913, "CGCGG":24896, + "CGGAA":7547, "CGGAT":5725, "CGGAC":18619, "CGGAG":19656, "CGGTA":3600, "CGGTT":5417, "CGGTC":8541, "CGGTG":13881, "CGGCA":9527, "CGGCT":15488, "CGGCC":23384, "CGGCG":29890, "CGGGA":16696, "CGGGT":13060, "CGGGC":25377, "CGGGG":27474, + "GAAAA":21213, "GAAAT":16427, "GAAAC":14522, "GAAAG":18036, "GAATA":9437, "GAATT":30707, "GAATC":10642, "GAATG":12807, "GAACA":13337, "GAACT":18080, "GAACC":13968, "GAACG":11796, "GAAGA":18744, "GAAGT":13329, "GAAGC":16741, "GAAGG":21545, + "GATAA":8648, "GATAT":6541, "GATAC":5628, "GATAG":7024, "GATTA":9462, "GATTT":12340, "GATTC":9404, "GATTG":8783, "GATCA":10262, "GATCT":11166, "GATCC":10490, "GATCG":15868, "GATGA":10348, "GATGT":9470, "GATGC":9066, "GATGG":17904, + "GACAA":9035, "GACAT":8510, "GACAC":9372, "GACAG":16639, "GACTA":7629, "GACTT":10980, "GACTC":13562, "GACTG":19322, "GACCA":11776, "GACCT":12234, "GACCC":13634, "GACCG":7389, "GACGA":4049, "GACGT":4429, "GACGC":8514, "GACGG":10813, + "GAGAA":38764, "GAGAT":14403, "GAGAC":17892, "GAGAG":24996, "GAGTA":9034, "GAGTT":13035, "GAGTC":13224, "GAGTG":19067, "GAGCA":15126, "GAGCT":18533, "GAGCC":26275, "GAGCG":17009, "GAGGA":24972, "GAGGT":20654, "GAGGC":37347, "GAGGG":27249, + "GTAAA":10547, "GTAAT":10439, "GTAAC":6545, "GTAAG":8358, "GTATA":6618, "GTATT":9656, "GTATC":5184, "GTATG":6880, "GTACA":7157, "GTACT":6630, "GTACC":6177, "GTACG":3118, "GTAGA":15905, "GTAGT":8953, 
"GTAGC":10969, "GTAGG":10541, + "GTTAA":8162, "GTTAT":6898, "GTTAC":6113, "GTTAG":8394, "GTTTA":9251, "GTTTT":19947, "GTTTC":14331, "GTTTG":14109, "GTTCA":11048, "GTTCT":12625, "GTTCC":11989, "GTTCG":6306, "GTTGA":9417, "GTTGT":10683, "GTTGC":12423, "GTTGG":15550, + "GTCAA":6635, "GTCAT":8479, "GTCAC":9947, "GTCAG":13613, "GTCTA":5814, "GTCTT":12926, "GTCTC":17522, "GTCTG":14793, "GTCCA":8907, "GTCCT":12884, "GTCCC":16234, "GTCCG":9116, "GTCGA":3284, "GTCGT":4582, "GTCGC":10819, "GTCGG":20716, + "GTGAA":14021, "GTGAT":13228, "GTGAC":10845, "GTGAG":20612, "GTGTA":8683, "GTGTT":13723, "GTGTC":12433, "GTGTG":21577, "GTGCA":13853, "GTGCT":16268, "GTGCC":16625, "GTGCG":10905, "GTGGA":17835, "GTGGT":19107, "GTGGC":25365, "GTGGG":27669, + "GCAAA":11570, "GCAAT":8364, "GCAAC":9518, "GCAAG":12213, "GCATA":6349, "GCATT":9597, "GCATC":8379, "GCATG":12684, "GCACA":12544, "GCACT":13288, "GCACC":26352, "GCACG":8101, "GCAGA":18007, "GCAGT":17097, "GCAGC":27068, "GCAGG":29259, + "GCTAA":9160, "GCTAT":6535, "GCTAC":9240, "GCTAG":8824, "GCTTA":7170, "GCTTT":13830, "GCTTC":15505, "GCTTG":14619, "GCTCA":16188, "GCTCT":16997, "GCTCC":19541, "GCTCG":11753, "GCTGA":19487, "GCTGT":18698, "GCTGC":30840, "GCTGG":38524, + "GCCAA":12734, "GCCAT":14328, "GCCAC":18332, "GCCAG":24206, "GCCTA":8674, "GCCTT":14931, "GCCTC":27897, "GCCTG":29958, "GCCCA":19883, "GCCCT":18959, "GCCCC":24258, "GCCCG":21372, "GCCGA":12085, "GCCGT":7955, "GCCGC":28683, "GCCGG":27462, + "GCGAA":4968, "GCGAT":7026, "GCGAC":7099, "GCGAG":14569, "GCGTA":2608, "GCGTT":5511, "GCGTC":9016, "GCGTG":12525, "GCGCA":10132, "GCGCT":14317, "GCGCC":24016, "GCGCG":24114, "GCGGA":15490, "GCGGT":12733, "GCGGC":38964, "GCGGG":32408, + "GGAAA":19795, "GGAAT":13251, "GGAAC":11879, "GGAAG":24410, "GGATA":7471, "GGATT":13082, "GGATC":15826, "GGATG":13940, "GGACA":13007, "GGACT":23052, "GGACC":13457, "GGACG":9852, "GGAGA":26088, "GGAGT":20345, "GGAGC":29087, "GGAGG":43542, + "GGTAA":9220, "GGTAT":6562, "GGTAC":6115, "GGTAG":10896, 
"GGTTA":7510, "GGTTT":15964, "GGTTC":13173, "GGTTG":13890, "GGTCA":10816, "GGTCT":14542, "GGTCC":13653, "GGTCG":10738, "GGTGA":18866, "GGTGT":16395, "GGTGC":18558, "GGTGG":33543, + "GGCAA":11866, "GGCAT":11781, "GGCAC":25609, "GGCAG":30713, "GGCTA":8781, "GGCTT":14573, "GGCTC":23270, "GGCTG":41510, "GGCCA":21231, "GGCCT":21138, "GGCCC":26729, "GGCCG":26847, "GGCGA":13090, "GGCGT":11751, "GGCGC":27804, "GGCGG":44764, + "GGGAA":21489, "GGGAT":16725, "GGGAC":17734, "GGGAG":39822, "GGGTA":8665, "GGGTT":17071, "GGGTC":17680, "GGGTG":25501, "GGGCA":19855, "GGGCT":24711, "GGGCC":29960, "GGGCG":29327, "GGGGA":27124, "GGGGT":20831, "GGGGC":33446, "GGGGG":31304 + }, + "overrepresented_sequences": { + "AAAAAAAAAAAAAAAAAAAA":34, + "CTTGGCACCCGAGAATTCCA":128, + "GTCGGACTGTAGAACTCTGA":31, + "TCGGACTGTAGAACTCTGAA":54 } + }, + "command": "fastp --overrepresentation_analysis --thread 1 --in1 /sfs/lustre/allocations/shefflab/processed/peppro_tutorial/pe/07-31-19/results_pipeline/tutorial/fastq/tutorial_R2.fastq --adapter_sequence GATCGTCGGACTGTAGAACTCTGAAC --length_required 26 --html /sfs/lustre/allocations/shefflab/processed/peppro_tutorial/pe/07-31-19/results_pipeline/tutorial/fastqc/tutorial_R2_rmAdapter.html --json /sfs/lustre/allocations/shefflab/processed/peppro_tutorial/pe/07-31-19/results_pipeline/tutorial/fastqc/tutorial_R2_rmAdapter.json --report_title tutorial --stdout " +} \ No newline at end of file diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/objects.tsv b/docs/files/examples/tutorial/results_pipeline/tutorial/objects.tsv new file mode 100644 index 0000000..7750ae7 --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/objects.tsv @@ -0,0 +1,7 @@ +FastQC report r1 fastqc/tutorial_R1_processed_fastqc.html FastQC report r1 None PEPPRO +TSS enrichment QC_hg38/tutorial_TSSenrichment.pdf TSS enrichment QC_hg38/tutorial_TSSenrichment.png PEPPRO +Pause index QC_hg38/tutorial_pause_index.pdf Pause index 
QC_hg38/tutorial_pause_index.png PEPPRO +Fragment distribution QC_hg38/tutorial_fragLenDistribution.pdf Fragment distribution QC_hg38/tutorial_fragLenDistribution.png PEPPRO +Plus FRiF QC_hg38/tutorial_plus_frif.pdf Plus FRiF QC_hg38/tutorial_plus_frif.png PEPPRO +Minus FRiF QC_hg38/tutorial_minus_frif.pdf Minus FRiF QC_hg38/tutorial_minus_frif.png PEPPRO +mRNA contamination QC_hg38/tutorial_mRNA_contamination.pdf mRNA contamination QC_hg38/tutorial_mRNA_contamination.png PEPPRO diff --git a/docs/files/examples/tutorial/results_pipeline/tutorial/stats.tsv b/docs/files/examples/tutorial/results_pipeline/tutorial/stats.tsv new file mode 100644 index 0000000..41d8403 --- /dev/null +++ b/docs/files/examples/tutorial/results_pipeline/tutorial/stats.tsv @@ -0,0 +1,29 @@ +File_mb 50.42 PEPPRO +Read_type paired PEPPRO +Genome hg38 PEPPRO +Raw_reads 2000000 PEPPRO +Fastq_reads 2000000 PEPPRO +Trimmed_reads 497796 PEPPRO +Trim_loss_rate 75.11 PEPPRO +Aligned_reads_human_rDNA 5860.0 PEPPRO +Alignment_rate_human_rDNA 1.18 PEPPRO +Mapped_reads 431804 PEPPRO +QC_filtered_reads 274898 PEPPRO +Aligned_reads 156906.0 PEPPRO +Alignment_rate 31.52 PEPPRO +Total_efficiency 7.85 PEPPRO +Read_depth 1.17 PEPPRO +Mitochondrial_reads 5357 PEPPRO +Maximum_read_length 30 PEPPRO +NRF 1.0 PEPPRO +PBC1 15975.0 PEPPRO +PBC2 15975.0 PEPPRO +Unmapped_reads 468224 PEPPRO +TSS_Plus_Score 33.3 PEPPRO +TSS_Minus_Score 4.3 PEPPRO +Pause_index 228.97 PEPPRO +Plus FRiP 0.07 PEPPRO +Minus FRiP 0.07 PEPPRO +mRNA_contamination 4.26 PEPPRO +Time 0:09:52 PEPPRO +Success 11-27-14:04:51 PEPPRO diff --git a/docs/files/examples/tutorial/tutorial_stats_summary.tsv b/docs/files/examples/tutorial/tutorial_stats_summary.tsv new file mode 100644 index 0000000..94ecadd --- /dev/null +++ b/docs/files/examples/tutorial/tutorial_stats_summary.tsv @@ -0,0 +1,2 @@ +sample_name organism protocol read_type read1 read2 File_mb Read_type Genome Raw_reads Fastq_reads Trimmed_reads Trim_loss_rate Aligned_reads_human_rDNA 
Alignment_rate_human_rDNA Mapped_reads QC_filtered_reads Aligned_reads Alignment_rate Total_efficiency Read_depth Mitochondrial_reads Maximum_read_length NRF PBC1 PBC2 Unmapped_reads TSS_Plus_Score TSS_Minus_Score Pause_index Plus FRiP Minus FRiP mRNA_contamination Time Success +tutorial human PROSEQ paired /scratch/jps3dp/DATA/proseq/data/fastq/tutorial_r1.fq.gz /scratch/jps3dp/DATA/proseq/data/fastq/tutorial_r2.fq.gz 50.42 paired hg38 2000000 2000000 497796 75.11 5860.0 1.18 431804 274898 156906.0 31.52 7.85 1.17 5357 30 1.0 15975.0 15975.0 468224 33.3 4.3 228.97 0.07 0.07 4.26 0:09:52 11-27-14:04:51 diff --git a/docs/files/examples/tutorial/tutorial_summary.html b/docs/files/examples/tutorial/tutorial_summary.html new file mode 100644 index 0000000..5439d72 --- /dev/null +++ b/docs/files/examples/tutorial/tutorial_summary.html @@ -0,0 +1,700 @@ + + + + + + + + + + + + + + + + + + + Looper: tutorial summary + +
+ + + +
+
+

Looper tutorial summary

+ Stats summary file +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
sample_nameorganismprotocolread_typeread1read2File_mbRead_typeGenomeRaw_readsFastq_readsTrimmed_readsTrim_loss_rateAligned_reads_human_rDNAAlignment_rate_human_rDNAMapped_readsQC_filtered_readsAligned_readsAlignment_rateTotal_efficiencyRead_depthMitochondrial_readsMaximum_read_lengthNRFPBC1PBC2Unmapped_readsTSS_Plus_ScoreTSS_Minus_ScorePause_indexPlus FRiPMinus FRiPmRNA_contaminationTimeSuccess
+ tutorial + + human + + PROSEQ + + paired + + /scratch/jps3dp/DATA/proseq/data/fastq/tutorial_r1.fq.gz + + /scratch/jps3dp/DATA/proseq/data/fastq/tutorial_r2.fq.gz + + 50.42 + + paired + + hg38 + + 2000000 + + 2000000 + + 497796 + + 75.11 + + 5860.0 + + 1.18 + + 431804 + + 274898 + + 156906.0 + + 31.52 + + 7.85 + + 1.17 + + 5357 + + 30 + + 1.0 + + 15975.0 + + 15975.0 + + 468224 + + 33.3 + + 4.3 + + 228.97 + + 0.07 + + 0.07 + + 4.26 + + 0:09:52 + + 11-27-14:04:51 +
+
+ +
+ +
+ +
+
+ +
+
+

Plot a column

+
+
+
    +
+
+
+
+
+
+ +
+
+
+
+ + + +
+
+ +
+
+
+Generated with looper v0.12.5-dev +© 2018 Sheffield Computational Biology Lab + + + + + + + + + + + + + + + + +
+ \ No newline at end of file diff --git a/docs/howto/annotation_files.md b/docs/howto/annotation_files.md deleted file mode 100644 index 8cf94de..0000000 --- a/docs/howto/annotation_files.md +++ /dev/null @@ -1,136 +0,0 @@ -# Download or create annotation files for PEPPRO - - -For each annotation type (TSS, CpA sites, premature mRNA, or general features), we provide [downloadable defaults](http://big.databio.org/peppro/) for common genomes. You may also recreate these yourself as described below. - -### TSS - -To calculate [TSS enrichments](../glossary.md), you will need a [TSS annotation file](http://big.databio.org/refgenomes/) in your reference genome directory. If a pre-built version for your genome of interest isn't present, you can quickly create that file yourself. In the reference genome directory, you can perform the following commands for in this example, `hg38`: -```console -wget -O hg38_TSS_full.txt.gz http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/refGene.txt.gz \ -zcat hg38_TSS_full.txt.gz | \ - awk '{if($4=="+"){print $3"\t"$5"\t"$5"\t"$13"\t.\t"$4}else{print $3"\t"$6"\t"$6"\t"$13"\t.\t"$4}}' | \ - LC_COLLATE=C sort -k1,1 -k2,2n -u > hg38_TSS.bed -``` -This asset (`tss_annotation`) needs to be [included in your `$REFGENIE` configuration file](annotation_files.md#example-peppro-refgenie-configuration-file) for the pipeline to detect it automatically. Alternatively, you can use the `--TSS-name` pipeline option to provide a path directly to this file. - -### Pause index annotation (PI) - -To calculate [pause indicies](../glossary.md), you will need two files in your reference genome directory: a [PI TSS annotation file](http://big.databio.org/refgenomes/) and a [PI gene body annotation file](http://big.databio.org/refgenomes/). If a pre-built version for your genome of interest isn't present, you can quickly create that file yourself. 
In the reference genome directory, you can perform the following commands for in this example, `hg38`: -```console -wget ftp://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz \ -zcat Homo_sapiens.GRCh38.97.gtf.gz | \ - grep 'exon_number "1"' | \ - sed 's/^/chr/' | \ - awk '{OFS="\t";} {print $1,$4,$5,$20,$14,$7}' | \ - sed 's/";//g' | \ - sed 's/"//g' | \ - awk '{if($6=="+"){print $1"\t"$2+20"\t"$3+120"\t"$4"\t"$5"\t"$6}else{print $1"\t"$3-120"\t"$3-20"\t"$4"\t"$5"\t"$6}}' | \ - LC_COLLATE=C sort -k1,1 -k2,2n -u > hg38_PI_TSS.bed - -zcat Homo_sapiens.GRCh38.97.gtf.gz | \ - awk '$3 == "gene"' | \ - sed 's/^/chr/' | \ - awk '{OFS="\t";} {print $1,$4,$5,$14,$6,$7}' | \ - sed 's/";//g' | \ - sed 's/"//g' | - awk '$4!="Metazoa_SRP"' | \ - awk '$4!="U3"' | \ - awk '$4!="7SK"' | \ - awk '($3-$2)>200' | \ - awk '{if($6=="+"){print $1"\t"$2+500"\t"$3"\t"$4"\t"$5"\t"$6}else{print $1"\t"$2"\t"$3-500"\t"$4"\t"$5"\t"$6}}' | \ - awk '$3>$2' | \ - LC_COLLATE=C sort -k4 -u > hg38_PI_gene_body.bed -``` -These assets (`pi_tss` and `pi_body`) need to be [included in your `$REFGENIE` configuration file](annotation_files.md#example-peppro-refgenie-configuration-file) for the pipeline to detect it automatically. Alternatively, you can use the `--pi-tss` and `--pi-body` pipeline options to provide paths directly to each file. - -### mRNA contamination - -To determine the amount of [mRNA contamination](../glossary.md), you will need two files in your reference genome directory: an [exon annotation file](http://big.databio.org/refgenomes/) and an [intron annotation file](http://big.databio.org/refgenomes/). If a pre-built version for your genome of interest isn't present, you can quickly create that file yourself. 
In the reference genome directory, you can perform the following commands for in this example, `hg38`: -```console -wget -O hg38_TSS_full.txt.gz http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/refGene.txt.gz \ -zcat hg38_TSS_full.txt.gz | \ - awk -v OFS="\t" '$9>1' | \ - awk -v OFS="\t" '{ n = split($10, a, ","); split($11, b, ","); for(i=1; i hg38_exons.bed - -zcat hg38_TSS_full.txt.gz | \ - awk -v OFS="\t" '$9>1' | \ - awk -F"\t" '{ exonCount=int($9);split($10,exonStarts,"[,]"); split($11,exonEnds,"[,]"); for(i=1;i hg38_introns.bed -``` -These assets (`exon_annotation` and `intron_annotation`) need to be [included in your `$REFGENIE` configuration file](annotation_files.md#example-peppro-refgenie-configuration-file) for the pipeline to detect it automatically. Alternatively, you can use the `--exon-name` and `--intron-name` pipeline options to provide paths directly to each file. - -### Premature mRNA - -To determine the [*F*raction of *R*eads *i*n *P*re-mature mRNA (*FRiP*)](../glossary.md), you will need a [pre-mature mRNA annotation file](http://big.databio.org/peppro/). If a pre-built version for your genome of interest isn't present, you can create that file yourself. In the reference genome directory, execute the following (for `hg38`): -```console -wget -O hg38_refGene.txt.gz http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/refGene.txt.gz -zcat hg38_refGene.txt.gz | grep 'cmpl' | \ - awk '{print $3"\t"$5"\t"$6"\t"$13"\t.\t"$4}' | \ - LC_COLLATE=C sort -k1,1 -k2,2n -u > hg38_pre-mRNA.bed -``` -This asset (`pre_mRNA_annotation`) needs to be [included in your `$REFGENIE` configuration file](#Example_PEPPRO_REFGENIE_configuration_file) for the pipeline to detect it automatically. Alternatively, you can use the `--pre-name` pipeline option to provide a path directly to this file. - -### Features - -We also have [downloadable genome feature annotation files](http://big.databio.org/peppro/) for both `hg38` and `hg19` that you can use. 
These files annotate 3' and 5' UTR, Exons, Introns, Promoters, and Promoter Flanking Regions. If present in the corresponding reference genome folder and included as an asset (named `feat_annotation`) in your `$REFGENIE` configuration file you don't need to do anything else as the pipeline will look there automatically. Alternatively, you can use the `--anno-name` pipeline option to just directly point to this file. - -#### Create a custom feature annotation file - -The pipeline will calculate the fraction of reads in genomic features using one of our [provided annotation files](http://big.databio.org/peppro/), but you can also specify this file yourself. - -This annotation file is really just a modified `BED` file, with the chromosomal coordinates and type of feature included. For example, the [downloadable `hg38_annotations.bed.gz` file](http://big.databio.org/peppro/hg38_annotations.bed.gz) looks like so: - -``` -chr1 28200 30001 Promoter . * -chr1 198800 200201 Promoter . * -chr1 778000 780001 Promoter . * -chr1 817400 817601 Promoter . * -chr1 826200 828801 Promoter . * -chr1 904200 905201 Promoter . * -chr1 923800 924601 Promoter . * -chr1 925000 925601 Promoter . * -chr1 941800 942201 Promoter . * -chr1 958400 961401 Promoter . * -``` - -Just like a standard `BED` file, the first three fields are: -1. **chrom** - the name of the chromosome -2. **chromStart** - the starting position of the feature -3. **chromEnd** - the ending position of the feature - -Column four is the **name** column, in our case the name of our feature of interest. The fifth column is the **score**, which would determine how darkly an item would be displayed in a genome browser if you chose to set that or if the information in your file of interest has ascribed a score to the features. The final, sixth, column is the **strand** column. - -After creating your `BED` file, you can point the pipeline to it using the `--anno-name` option followed with the path to your file. 
The pipeline will then use that file to determine the fractions of reads that cover those features. - -### Example `PEPPRO` `refgenie` configuration file - -As mentioned above, you can point the pipeline directly to your annotation files using the matching arguments. - -Alternatively, if they are all present in the corresponding reference genome folders, you can direct `refgenie` to detect them automatically. Here's an example of what a `refgenie` configuration file would look like: -```yaml -genome_folder: $GENOMES -genome_server: http://refgenomes.databio.org -genomes: - hg38: - bowtie2: - path: indexed_bowtie2 - chrom_sizes: - path: hg38.chrom.sizes - tss_annotation: - path: hg38_TSS.bed - pi_tss: - path: hg38_PI_TSS.bed - pi_body: - path: hg38_PI_gene_body.bed - pre_mRNA_annotation: - path: hg38_pre-mRNA.bed - feat_annotation: - path: hg38_annotations.bed.gz - exon_annotation: - path: hg38_exons.bed - intron_annotation: - path: hg38_introns.bed -``` \ No newline at end of file diff --git a/docs/howto/run_cluster.md b/docs/howto/run_cluster.md deleted file mode 100644 index 77978a0..0000000 --- a/docs/howto/run_cluster.md +++ /dev/null @@ -1,13 +0,0 @@ -# Run PEPPRO on a cluster - -`PEPPRO` by itself does not specify any cluster resources, so you could just roll your own and submit individual jobs to a cluster however you choose. But because `PEPPRO` is already `looper`-compatible, the easier way is to use `looper's` built-in template system, which `looper` uses to build flexible shell scripts for job submission. These templates can be used to run jobs in a container, to submit to a cluster resource manager, or both. - -To use `looper` templates, we must create a `divvy` computing configuration file (compute_config.yaml) and point an environment variable (`DIVCFG`) to that file. You then have access to any configured computing packages by using `looper --compute `, where `package` can be any computing system you configure. 
In short, you will need to: - -- Set up a compute configuration file that includes a containerized or cluster compute template (or both). -- Point the environment variable `DIVCFG` to the location of this file. -- Run the pipeline with `looper run --compute PACKAGE` (where `PACKAGE` is specified in your `DIVCFG` file). - -This enables you to adjust your computing preferences on-the-fly when you run a project. - -The complete description of setting up `looper` to use `DIVCFG` is generic to any pipeline. If you want to use looper with containers or clusters, you should consult the complete docs in the looper documentation on [configuring looper to use a cluster](http://code.databio.org/looper/cluster-computing/). diff --git a/docs/howto/use_looper.md b/docs/howto/use_looper.md deleted file mode 100644 index 4724279..0000000 --- a/docs/howto/use_looper.md +++ /dev/null @@ -1,51 +0,0 @@ -# Run samples through PEPPRO using `Looper` - - -This guide walks you through extending `PEPPRO` to run on multiple samples using `looper`. The pipeline can be run directly from the command line for a single sample ([see Install and run](../install.md)). If you need to run it on many samples, you could write your own sample handling code, but we have pre-configured everything to work nicely with `looper`, our sample handling engine. - -## 1: Install `looper` - -[`Looper`](http://looper.readthedocs.io/) is a pipeline submission engine that makes it easy to deploy any pipeline across samples. It will let you run the jobs locally, in containers, using any cluster resource manager, or in containers on a cluster. - -You can install `looper` using `pip`: - -```{bash} -pip install --user loopercli -``` - -## 2: Run an example through `looper` - -Start by running the example project (`peppro_test.yaml`) in the [`examples/meta/`](https://github.com/databio/peppro/tree/master/examples/meta) folder. 
Let's use `looper`'s `-d` argument to do a *dry run*, which will create job scripts for every sample in a project, but will not execute them: - -``` -cd peppro -looper run -d examples/meta/peppro_test.yaml -``` - -If the looper executable is not in your `$PATH`, add the following line to your `.bashrc` or `.profile`: -``` -export PATH=$PATH:~/.local/bin -``` -If that worked, let's actually run the example by taking out the `-d` flag: - -``` -looper run examples/meta/peppro_test.yaml -``` - -There are lots of other cool things you can do with looper, like dry runs, summarize results, check on pipeline run status, clean intermediate files to save disk space, lump multiple samples into one job, and more. For details, consult the [`looper` docs](http://looper.databio.org/). - -## 3: Configure your project files - -To run your own samples, you'll need to organize them in **PEP format**, which is explained in [how to create a PEP](https://pepkit.github.io/docs/home/) and is universal to all pipelines that read PEPs, including `PEPPRO`. To get you started, there are examples you can adapt in the `examples/` folder (*e.g.* [example test PEP](https://github.com/databio/peppro/tree/master/examples/meta/peppro_test.yaml)). In short, you need two files for your project: - - 1. project config file -- describes output locations, pointers to data, etc. - 2. sample annotation file -- comma-separated value (CSV) list of your samples. - -The sample annotation file must specify these columns: - -- sample_name -- library ('PRO' or 'PROSEQ' or 'PRO-seq') -- organism (e.g. 
'human' or 'mouse') -- read1 -- read2 (if paired) -- whatever else you want diff --git a/docs/img/peppro_logo.svg b/docs/img/peppro_logo.svg index 5cfa336..3b8e52c 100644 --- a/docs/img/peppro_logo.svg +++ b/docs/img/peppro_logo.svg @@ -9,19 +9,19 @@ xmlns="http://www.w3.org/2000/svg" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" - width="3339.698" - height="3101.6738" + width="955.19135" + height="227.99561" id="svg2" version="1.1" - inkscape:version="0.92.4 (33fec40, 2019-01-16)" - sodipodi:docname="peppro_logo.svg"> + inkscape:version="0.91 r13725" + sodipodi:docname="peppro_logo3.svg"> image/svg+xml - + @@ -85,105 +85,105 @@ inkscape:label="Layer 1" inkscape:groupmode="layer" id="layer1" - transform="translate(1937.2167,1519.8908)"> + transform="translate(1709.3385,1366.0829)"> + transform="matrix(0.5246427,0,0,0.5246427,-1432.1606,-1298.4311)"> + transform="matrix(0.7353359,0,0,0.7353359,-1369.3683,-1375.8491)"> + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:115.00834656px;line-height:1.25;font-family:'Franklin Gothic';-inkscape-font-specification:'Franklin Gothic, Normal';text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000080;fill-opacity:1;stroke:#ffffff;stroke-width:1.15406287;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" + d="m -1374.8609,-1192.9145 0,-110.335 50.6038,0 q 13.5293,0 23.0201,3.9578 9.4908,3.9578 14.4179,11.5908 4.9272,7.633 4.9272,18.3757 0,9.2485 -4.1194,16.8815 -4.079,7.5925 -11.2678,11.7119 -5.8157,3.231 -12.5197,4.604 -6.7041,1.3733 -16.7603,1.3733 l -18.0122,0 0,41.84 z m 30.2896,-89.3343 0,25.6454 14.8621,0 q 8.5619,0 12.3178,-2.9484 3.7558,-2.9481 3.7558,-9.6118 0,-6.6638 -3.7558,-9.8543 -3.7559,-3.2309 -12.3178,-3.2309 z" /> + 
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:115.00834656px;line-height:1.25;font-family:'Franklin Gothic';-inkscape-font-specification:'Franklin Gothic, Normal';text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000080;fill-opacity:1;stroke:#ffffff;stroke-width:1.15406287;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" + d="m -1266.949,-1192.9145 0,-110.335 79.3991,0 0,23.1412 -49.1096,0 0,19.7085 31.259,0 0,23.1413 -31.259,0 0,21.2026 51.4117,0 0,23.1414 z" /> + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:115.00834656px;line-height:1.25;font-family:'Franklin Gothic';-inkscape-font-specification:'Franklin Gothic, Normal';text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000080;fill-opacity:1;stroke:#ffffff;stroke-width:1.15406287;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" + d="m -1171.9608,-1192.9145 0,-110.335 50.6039,0 q 13.5294,0 23.0202,3.9578 9.4907,3.9578 14.4178,11.5908 4.9271,7.633 4.9271,18.3757 0,9.2485 -4.1194,16.8815 -4.079,7.5925 -11.2676,11.7119 -5.8157,3.231 -12.5197,4.604 -6.7042,1.3733 -16.7605,1.3733 l -18.0122,0 0,41.84 z m 30.2896,-89.3343 0,25.6454 14.8621,0 q 8.5619,0 12.3179,-2.9484 3.7559,-2.9481 3.7559,-9.6118 0,-6.6638 -3.7559,-9.8543 -3.756,-3.2309 -12.3179,-3.2309 z" /> + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:115.00834656px;line-height:1.25;font-family:'Franklin Gothic';-inkscape-font-specification:'Franklin Gothic, Normal';text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#aa4400;fill-opacity:1;stroke:#ffffff;stroke-width:1.15406287;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" + d="m -1065.1324,-1192.9145 0,-110.335 50.6039,0 q 13.5294,0 23.02019,3.9578 9.49077,3.9578 14.41784,11.5909 4.92719,7.6329 
4.92719,18.3757 0,9.2483 -4.11942,16.8814 -4.07908,7.5925 -11.26777,11.7119 -5.81564,3.2309 -12.51973,4.6041 -6.7041,1.3732 -16.7602,1.3732 l -18.0122,0 0,41.84 z m 30.2898,-89.3343 0,25.6453 14.862,0 q 8.5619,0 12.3178,-2.9482 3.7559,-2.9481 3.7559,-9.6119 0,-6.6638 -3.7559,-9.8543 -3.7559,-3.2309 -12.3178,-3.2309 z" /> + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:115.00834656px;line-height:1.25;font-family:'Franklin Gothic';-inkscape-font-specification:'Franklin Gothic, Normal';text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#aa4400;fill-opacity:1;stroke:#ffffff;stroke-width:1.15406287;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" + d="m -957.22037,-1192.9144 0,-110.335 48.3018,0 q 15.30636,0 25.32215,3.15 8.80417,2.706 14.29669,10.6216 5.53292,7.8754 5.53292,17.6488 0,11.8736 -7.59262,19.7085 -4.20014,4.4424 -12.76203,8.2388 l 20.03155,50.9673 -33.9244,0 -15.18518,-45.6364 -13.89286,0 0,45.6364 z m 30.12802,-89.3343 0,23.0201 17.85068,0 q 6.46179,0 10.09654,-3.0692 3.63478,-3.0694 3.63478,-8.5216 0,-5.4521 -3.59437,-8.4406 -3.55398,-2.9887 -10.13695,-2.9887 z" /> + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:115.00834656px;line-height:1.25;font-family:'Franklin Gothic';-inkscape-font-specification:'Franklin Gothic, Normal';text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#aa4400;fill-opacity:1;stroke:#ffffff;stroke-width:1.15406287;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" + d="m -853.79145,-1248.0819 q 0,-17.4064 6.17913,-30.3705 6.17906,-12.9639 17.48719,-19.8295 11.30813,-6.8658 25.9683,-6.8658 12.03508,0 21.32391,4.3618 9.32922,4.3214 15.58906,12.0352 6.25987,7.7137 9.36961,17.6083 3.1501,9.8946 3.1501,21.0816 0,12.0755 -2.94819,22.7777 -2.90781,10.7024 -8.92534,18.8201 -5.97715,8.1175 -15.50831,12.8023 -9.49073,4.6444 -22.05084,4.6444 
-10.41962,0 -19.46611,-3.5943 -9.00615,-3.5944 -15.8314,-10.7427 -6.78488,-7.1888 -10.5812,-17.9315 -3.75591,-10.7831 -3.75591,-24.7971 z m 31.74357,0 q 0,35.0551 17.89105,35.0551 10.01577,0 13.85246,-9.7329 3.83669,-9.7333 3.83669,-27.1395 0,-33.238 -17.68915,-33.238 -6.58296,0 -10.54079,4.281 -3.95784,4.2809 -5.65407,12.0755 -1.69619,7.7541 -1.69619,18.6988 z" /> diff --git a/docs/img/peppro_logo2.svg b/docs/img/peppro_logo2.svg new file mode 100644 index 0000000..0da2ae5 --- /dev/null +++ b/docs/img/peppro_logo2.svg @@ -0,0 +1,193 @@ + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/img/peppro_logo_gray.svg b/docs/img/peppro_logo_gray.svg new file mode 100644 index 0000000..6cdf71e --- /dev/null +++ b/docs/img/peppro_logo_gray.svg @@ -0,0 +1,220 @@ + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/img/peppro_logo_large.svg b/docs/img/peppro_logo_large.svg new file mode 100644 index 0000000..5cfa336 --- /dev/null +++ b/docs/img/peppro_logo_large.svg @@ -0,0 +1,189 @@ + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/install.md b/docs/install.md index bc5a6a7..4c454d2 100644 --- a/docs/install.md +++ b/docs/install.md @@ -1,15 +1,14 @@ -# Getting started +# Install and run PEPPRO ## 1: Clone the `PEPPRO` pipeline -Clone the pipeline: ``` git clone https://github.com/databio/peppro.git ``` ## 2: Install required software -`PEPPRO` requires a series of publicly-available, common bioinformatics tools including: [samtools](http://www.htslib.org/), [bedtools](https://bedtools.readthedocs.io/en/latest/content/installation.html), [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml), [seqkit](https://bioinf.shenwei.me/seqkit/), [fastp](https://github.com/OpenGene/fastp), [seqtk](https://github.com/lh3/seqtk), 
[preseq](http://smithlabresearch.org/software/preseq/), [fastq-pair](https://github.com/linsalrob/fastq-pair.git), [picard](http://broadinstitute.github.io/picard/), [wigToBigWig](http://hgdownload.soe.ucsc.edu/admin/exe/), and [bigWigCat](http://hgdownload.soe.ucsc.edu/admin/exe/). +PEPPRO requires a set of Python and R packages to run. ### Python packages @@ -27,53 +26,91 @@ pip install --user -r requirements.txt Rscript -e 'install.packages("PEPPROr", repos=NULL, type="source")' ``` -### Optional software - -Optionally, `PEPPRO` can mix and match tools for adapter removal, read trimming, deduplication, and reverse complementation. The use of `fqdedup`, in particular, is useful if you wish to minimize memory use at the expense of speed. We suggest using the default tools simply due to the fact that `fastx toolkit` has not been supported since 2012. +### Tools -`seqOutBias` can be used to take into account the mappability at a given read length to filter the sample signal. +The pipeline also relies on a set of publicly available bioinformatic tools, but if you don't want to install the prerequisite software used by PEPPRO natively, you can follow our tutorial on [running PEPPRO directly in a container](container.md) and skip this step. -*Optional tools:* [fqdedup](https://github.com/guertinlab/fqdedup), [fastx toolkit](http://hannonlab.cshl.edu/fastx_toolkit/), [seqOutBias](https://github.com/guertinlab/seqOutBias), [fastqc](https://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc), and [pigz (v2.3.4+)](https://zlib.net/pigz/). 
+Otherwise, you'll need to install the following: [bedtools](https://bedtools.readthedocs.io/en/latest/content/installation.html), [bigWigCat](http://hgdownload.soe.ucsc.edu/admin/exe/), [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml), [fastq-pair](https://github.com/linsalrob/fastq-pair.git), [flash](https://ccb.jhu.edu/software/FLASH/), [picard](https://broadinstitute.github.io/picard/), [preseq](http://smithlabresearch.org/software/preseq/), [seqkit](https://bioinf.shenwei.me/seqkit/), [samtools](http://www.htslib.org/), [seqtk](https://github.com/lh3/seqtk), and [wigToBigWig](http://hgdownload.soe.ucsc.edu/admin/exe/). If you need help, we have [detailed installation instructions](detailed_install.md) for installing these. -## 3: Download `refgenie` assemblies +## 3: Download `refgenie` assets -The pipeline relies on [`refgenie` assemblies](http://refgenie.databio.org/en/dev/install/) for alignment. First, initialize a folder for genome indexes and the `refgenie` config file. +PEPPRO uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. If you haven't already, initialize a refgenie config file like this: ```console export REFGENIE=your_genome_folder/genome_config.yaml refgenie init -c $REFGENIE ``` -Then, just pull the assets you need. +Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. + +Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). 
Download these required assets with this command: ```console -refgenie pull -g hg38 -a bowtie2 -refgenie pull -g rCRSd -a bowtie2 -refgenie pull -g human_repeats -a bowtie2 +refgenie pull -g hg38 -a bowtie2_index ensembl_gtf ensembl_rb refgene_anno feat_annotation ``` +PEPPRO also requires `bowtie2_index` for any pre-alignment genomes: -(Add `REFGENIE` to your .bashrc or .profile to ensure it persists). Alternatively, you can skip the `REFGENIE` variable and simply change the value of the `resources.genome_config` option in the [`pipeline_config.yaml`](https://github.com/databio/peppro/blob/master/pipelines/peppro.yaml) file to point to the folder where you stored the assemblies. +```console +refgenie pull -g human_rDNA -a bowtie2_index +``` + +### Optional software -## 4: Run the pipeline script directly +Optionally, `PEPPRO` can mix and match tools for adapter removal, read trimming, deduplication, and reverse complementation. The use of `fqdedup`, in particular, is useful if you wish to minimize memory use at the expense of speed. We suggest using the default tools simply due to the fact that `fastx toolkit` has not been supported since 2012. `seqOutBias` can be used to take into account the mappability at a given read length to filter the sample signal. -The pipeline at its core is just a python script, and you can run it on the command line for a single sample (see [command-line usage](usage)), which you can also get on the command line by running `pipelines/peppro.py --help`. You just need to pass a few command-line parameters to specify sample name, reference genome, input files, etc. 
Here's the basic command to run the included small test example through the pipeline: +*Optional tools:* + +* [fastp](https://github.com/OpenGene/fastp) +* [fqdedup](https://github.com/guertinlab/fqdedup) +* [fastx toolkit](http://hannonlab.cshl.edu/fastx_toolkit/) +* [seqOutBias](https://github.com/guertinlab/seqOutBias) +* [fastqc](https://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc) +* [pigz (v2.3.4+)](https://zlib.net/pigz/) + +## 4: Run an example project through PEPPRO + +Start by running the example project (`peppro_test.yaml`) in the [`examples/meta/`](https://github.com/databio/peppro/tree/master/examples/meta) folder. PEPPRO uses a project management tool called [looper](https://looper.databio.org) to run the pipeline across samples in a project. Let's use the `-d` argument to do a *dry run*, which will create job scripts for every sample in a project, but will not execute them: + +``` +cd peppro +looper run -d examples/meta/peppro_test.yaml +``` + +If the looper executable is not in your `$PATH`, add the following line to your `.bashrc` or `.profile`: +``` +export PATH=$PATH:~/.local/bin +``` +If that worked, let's actually run the example by taking out the `-d` flag: + +```console +looper run examples/meta/peppro_test.yaml +``` + +Or, if you're using containers, adjust the `--compute` argument accordingly: ```console -/pipelines/peppro.py \ - --sample-name test \ - --genome hg38 \ - --input examples/data/test_r1.fq.gz \ - --single-or-paired single \ - -O $HOME/peppro_example/ +looper run examples/meta/peppro_test.yaml --compute docker +looper run examples/meta/peppro_test.yaml --compute singularity ``` -This test example takes less than 5 minutes to complete. Read more about how to [run the test sample using `Looper`](howto/use_looper.md) with the included [example `peppro_test.yaml` file](https://github.com/databio/peppro/blob/master/examples/meta/peppro_test.yaml). 
+There are lots of other cool things you can do with looper, like dry runs, summarize results, check on pipeline run status, clean intermediate files to save disk space, lump multiple samples into one job, and more. For details, consult the [`looper` docs](http://looper.databio.org/). + +## 5: Configure your project files + +To run your own samples, you'll need to organize them in **PEP format**, which is explained in [how to create a PEP](https://pepkit.github.io/docs/home/) and is universal to all pipelines that read PEPs, including `PEPPRO`. To get you started, there are examples you can adapt in the `examples/` folder (*e.g.* [example test PEP](https://github.com/databio/peppro/tree/master/examples/meta/peppro_test.yaml)). In short, you need two files for your project: + + 1. project config file -- describes output locations, pointers to data, etc. + 2. sample annotation file -- comma-separated value (CSV) list of your samples. + +The sample annotation file must specify these columns: -# 5. Next steps +- sample_name +- library (*e.g.* 'PRO', 'PROSEQ', 'PRO-seq', 'GRO', 'GROSEQ', 'GRO-seq') +- organism (*e.g.* 'human' or 'mouse') +- read1 +- read2 (if paired) +- anything else you wish to include -This is just the beginning. For your next step, take a look at one of these user guides: +## Next steps -- [Extended tutorial for running a single sample](tutorial.md) -- [Running on multiple samples with looper](howto/use_looper.md) -- [Running the pipeline directly in a container](howto/use_container.md) -- See other detailed user guide links in the side menu +This is just the beginning. For your next step, the [extended tutorial](tutorial.md) will walk you through a real project. Or, take a look at one of the other detailed user guides linked in the side menu. 
diff --git a/docs/howto/prealignments.md b/docs/prealignments.md similarity index 96% rename from docs/howto/prealignments.md rename to docs/prealignments.md index 873183a..9241ee6 100644 --- a/docs/howto/prealignments.md +++ b/docs/prealignments.md @@ -14,7 +14,7 @@ In this example, we'll align sequentially to human mitochondrial sequence (e.g. /pipelines/peppro.py \ --sample-name test \ --genome hg38 \ - --prealignments rCRSd human_repeats \ + --prealignments human_rDNA rCRSd \ --input examples/data/test_r1.fq.gz \ --single-or-paired single \ -O $HOME/peppro_example/ diff --git a/docs/run_direct.md b/docs/run_direct.md new file mode 100644 index 0000000..f0cf859 --- /dev/null +++ b/docs/run_direct.md @@ -0,0 +1,16 @@ +# Running the pipeline script directly + +It's easiest to run PEPPRO using `looper`, as described in the [install](install.md) and [tutorial](tutorial.md) guides. This simplifies running the pipeline across many samples, on a cluster, or in containers. But really, the pipeline at its core is just a python script, and you can run it on the command line for a single sample (see [command-line usage](usage.md) by running `pipelines/peppro.py --help`). You just need to pass a few command-line parameters to specify sample name, reference genome, input files, etc. Here's the basic command to run the included small test example through the pipeline: + +```console +cd peppro +./pipelines/peppro.py \ + --sample-name test \ + --genome hg38 \ + --input examples/data/test_r1.fq.gz \ + --single-or-paired single \ + -O $HOME/peppro_example/ +``` + +This test example takes less than 5 minutes to complete. + diff --git a/docs/tutorial.md b/docs/tutorial.md index 46b39fc..2cae198 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -1,8 +1,26 @@ -# PEPPRO pipeline step-by-step guide +# PEPPRO pipeline step-by-step guide -In this guide, we'll walk you through the step by step procedure of running a tutorial PRO-seq dataset through the pipeline. 
The output from this process is the same as you see in the [example PRO-seq output](browse_output.md) we've provided. To use this tutorial, you should have a basic familiarity with [working in a command line driven environment](http://matt.might.net/articles/basic-unix/). You also need to have already installed `PEPPRO` prerequisites, which you can do following the [detailed installation guide](howto/detailed_install.md). +In this guide, we'll walk you through the step by step procedure of running a tutorial PRO-seq dataset through the pipeline. The output from this process is the same as you see in the [example PRO-seq output](browse_output.md) we've provided. To use this tutorial, you should have a basic familiarity with [working in a command line driven environment](http://matt.might.net/articles/basic-unix/). You also need to have already installed `PEPPRO` prerequisites, which you can do following the [basic installation instructions](install.md). -## 1: Download tutorial read files +## 1: Set up folders + +From an open terminal, let's first create a directory we'll use to run through this guide: +```console +mkdir peppro_tutorial +``` + +Let's move into our newly created directory and create a few more folders that we'll use later. +```console +cd peppro_tutorial/ +mkdir data +mkdir genomes +mkdir processed +mkdir templates +mkdir tools +cd tools/ +``` + +## 2: Download tutorial read files We're going to work with some files a little larger than the test data included in the pipeline so we can see all the features included in a full run of the pipeline. Go ahead and download the [tutorial_r1.fastq.gz](http://big.databio.org/peppro/tutorial_r1.fq.gz) and [tutorial_r2.fq.gz](http://big.databio.org/peppro/tutorial_r2.fastq.gz) files. ```console @@ -16,7 +34,7 @@ mv tutorial_r1.fq.gz peppro/examples/data/ mv tutorial_r2.fq.gz peppro/examples/data/ ``` -## 2: Configure project files +## 3: Configure project files We're going to use `looper` to analyze our data. 
For that, we need to pass looper a configuration file. This project config file describes your project. See [`looper` docs](https://looper.readthedocs.io/en/latest/) for details. A configuration file has been provided for you in the pipeline itself, conveniently named `tutorial.yaml`. This configuration file also points to our sample. In this case, we've provided a sample for you with the pipeline. You don't have to do anything else at this point and may [skip right to running the sample if you'd like](tutorial.md#3-using-looper-to-run-the-pipeline). Otherwise, we'll briefly touch on what those configuration files look like. @@ -55,8 +73,22 @@ tutorial,human,PROSEQ,paired,R1,R2 That's it! Let's analyze that sample! -## 3: Using `looper` to run the pipeline -Looper requires a few variables and configuration files to work for the specific user. Let's get those set up now. `Looper` uses [`divvy`](http://code.databio.org/divvy) to manage computing resource configuration so that projects and pipelines can easily travel among environments. For more detailed information, [check out the `looper` docs](https://looper.readthedocs.io/en/latest/cluster-computing/). Let's set it up. +## 4: Create environment variables + +We also need to create some environment variables to help point `looper` to where we keep our data files and our tools. You may either set the environment variables up, like we're going to do now, or you may simply hard code the necessary locations in our configuration files. +First, let's create a `PROCESSED` variable that represents the location where we want to save output. +``` +export PROCESSED="/path/to/peppro_tutorial/processed/" +``` +Second, we'll create a variable representing the root path to all our tools named `CODEBASE`. +``` +export CODEBASE="/path/to/peppro_tutorial/tools/" +``` +(Add these environment variables to your `.bashrc` or `.profile` so you don't have to always do this step). +Fantastic! 
Now that we have the pipeline and its requirements installed, we're ready to get our reference genome(s). + +## 5: Use `looper` to run the pipeline +Looper requires a few variables and configuration files to work for the specific user. Let's get those set up now. `Looper` uses [`divvy`](https://divvy.databio.org/) to manage computing resource configuration so that projects and pipelines can easily travel among environments. For more detailed information, [check out the `looper` docs](https://looper.readthedocs.io/en/latest/cluster-computing/). Let's set it up. ``` cd /path/to/peppro_tutorial/ touch compute_config.yaml @@ -77,7 +109,7 @@ Now, let's close and save that file and create an environment variable pointing export DIVCFG="/path/to/peppro_tutorial/compute_config.yaml" ``` (Remember to add `DIVCFG` to your `.bashrc` or `.profile` to ensure it persists). -The `looper` environment configuration file points to submission template(s) in order to know how to run a samples locally or using cluster resources. If you'd like to learn more, check out the [`DIVCFG` configuration file and submission templates](http://code.databio.org/divvy). We're going to simply setup a local template for the purposes of this tutorial. You can also easily create [templates for cluster or container use as well](https://github.com/pepkit/divcfg/tree/master/templates)! +The `looper` environment configuration file points to submission template(s) in order to know how to run samples locally or using cluster resources. If you'd like to learn more, check out the [`DIVCFG` configuration file and submission templates](https://divvy.databio.org/). We're going to simply set up a local template for the purposes of this tutorial. You can also easily create [templates for cluster or container use as well](https://github.com/pepkit/divcfg/tree/master/templates)! Let's change to our `templates/` directory to make our first submission template. 
``` cd /path/to/peppro_tutorial/templates/ @@ -94,6 +126,7 @@ echo 'Start time:' `date +'%Y-%m-%d %T'` {CODE} } | tee {LOGFILE} --ignore-interrupts ``` + Save and close that file, and return to our main tutorial directory. ``` cd ../ @@ -107,7 +140,7 @@ Congratulations! Your first sample should be running through the pipeline now. After the pipeline is finished, we can look through the output directory together. We've provided a breakdown of that directory in the [browse output page](/browse_output/). -## 4: Generate an `HTML` report using `looper` +## 6: Generate an `HTML` report using `looper` Let's take full advantage of `looper` and generate a pipeline `HTML` report that makes all our results easy to view and browse. If you'd like to skip right to the results and see what it looks like, [check out the tutorial results](../files/examples/tutorial/tutorial_summary.html). Otherwise, let's generate a report ourselves. Using our same configuration file we used to run the samples through the pipeline, we'll now employ the `summarize` function of `looper`. diff --git a/docs/usage.md b/docs/usage.md index afc546c..e032030 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,21 +6,26 @@ `python pipelines/peppro.py --help` ```{console} -usage: peppro.py [-h] [-R] [-N] [-D] [-F] [-C CONFIG_FILE] -O - PARENT_OUTPUT_FOLDER [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] - -S SAMPLE_NAME -I INPUT_FILES [INPUT_FILES ...] +usage: peppro.py [-h] [-R] [-N] [-D] [-F] [-T] [--verbosity V] [--silent] + [--logdev] [-C CONFIG_FILE] -O PARENT_OUTPUT_FOLDER + [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] -S SAMPLE_NAME -I + INPUT_FILES [INPUT_FILES ...] 
[-I2 [INPUT_FILES2 [INPUT_FILES2 ...]]] -G GENOME_ASSEMBLY - [-Q SINGLE_OR_PAIRED] [--runon {pro,gro}] - [--adapter {fastp,cutadapt}] [--dedup {seqkit,fqdedup}] - [--trimmer {seqtk,fastx}] [--umi] [--umi_len UMI_LEN] - [--max_len MAX_LEN] [--sob] [--scale] [--parts PARTS] + [-Q SINGLE_OR_PAIRED] + [--protocol {PRO,pro,PRO-SEQ,PRO-seq,proseq,PROSEQ,GRO,gro,groseq,GROSEQ,GRO-SEQ,GRO-seq}] + [--adapter-tool {cutadapt,fastp}] + [--dedup-tool {seqkit,fqdedup}] + [--trimmer-tool {seqtk,fastx}] [--umi-len UMI_LEN] + [--max-len MAX_LEN] [--sob] [--scale] [--prealignments PREALIGNMENTS [PREALIGNMENTS ...]] - [--TSS-name TSS_NAME] [--pi-tss PI_TSS] [--pi-body PI_BODY] - [--pre-name PRE_NAME] [--anno-name ANNO_NAME] - [--exon-name EXON_NAME] [--intron-name INTRON_NAME] - [--coverage] [--keep] [--noFIFO] [--complexity] [-V] + [--TSS-name TSS_NAME] [--pi-tss ENSEMBL_TSS] + [--pi-body ENSEMBL_GENE_BODY] [--pre-name PRE_NAME] + [--anno-name ANNO_NAME] [--exon-name EXON_NAME] + [--intron-name INTRON_NAME] [--search-file SEARCH_FILE] + [--coverage] [--keep] [--noFIFO] [--no-complexity] + [--prioritize] [-V] -PEPPRO version 0.8.0 +PEPPRO version 0.8.6 optional arguments: -h, --help show this help message and exit @@ -28,6 +33,10 @@ optional arguments: -N, --new-start Overwrite all results to start a fresh run -D, --dirty Don't auto-delete intermediate files -F, --force-follow Always run 'follow' commands + -T, --testmode Only print commands, don't run + --verbosity V Set logging level (1-5 or logging module level name) + --silent Silence logging. Overrides verbosity. + --logdev Expand content of logging message format. -C CONFIG_FILE, --config CONFIG_FILE Pipeline configuration file (YAML). Relative paths are with respect to the pipeline script. @@ -41,30 +50,29 @@ optional arguments: Secondary input files, such as read2 -Q SINGLE_OR_PAIRED, --single-or-paired SINGLE_OR_PAIRED Single- or paired-end sequencing protocol - --runon {pro,gro} Run on sequencing type. 
- --adapter {fastp,cutadapt} - Name of adapter removal program - --dedup {seqkit,fqdedup} - Name of program that removes duplicate reads - --trimmer {seqtk,fastx} - Name of read trimming program - --umi Remove umi with fastp - --umi_len UMI_LEN Specify the length of the UMI.If your data does not - utilize UMIs, set to 0. - --max_len MAX_LEN Trim reads to maximum length. Set to -1 to disable - length trimming. + --protocol {PRO,pro,PRO-SEQ,PRO-seq,proseq,PROSEQ,GRO,gro,groseq,GROSEQ,GRO-SEQ,GRO-seq} + Run on sequencing type. + --adapter-tool {cutadapt,fastp} + Name of adapter removal program. Default: cutadapt + --dedup-tool {seqkit,fqdedup} + Program to use to duplicate reads. Default: seqkit + --trimmer-tool {seqtk,fastx} + Name of read trimming program. Default: seqtk + --umi-len UMI_LEN Specify the length of the UMI.If your data does not + utilize UMIs, set to 0. Default: 0 + --max-len MAX_LEN Trim reads to maximum length. Set to -1 to disable + length trimming. Default: 30 --sob Use seqOutBias to produce signal tracks and incorporate mappability information. --scale Scale output with seqOutBias when producing signal tracks. - --parts PARTS Split suffix tree generation into parts. Increase - this value to lower memory use. --prealignments PREALIGNMENTS [PREALIGNMENTS ...] Space-delimited list of reference genomes to align to before primary alignment. --TSS-name TSS_NAME file_name of TSS annotation file. - --pi-tss PI_TSS file_name of pause index TSS annotation file. - --pi-body PI_BODY file_name of pause index gene body annotation file. + --pi-tss ENSEMBL_TSS file_name of pause index TSS annotation file. + --pi-body ENSEMBL_GENE_BODY + file_name of pause index gene body annotation file. --pre-name PRE_NAME file_name of pre-mRNA annotation file. --anno-name ANNO_NAME file_name of genomic annotation file. @@ -72,11 +80,17 @@ optional arguments: file_name of exon annotation file. --intron-name INTRON_NAME file_name of intron annotation file. 
+ --search-file SEARCH_FILE + file_name of read length matched gt tallymer index + search file --coverage Report library complexity using coverage: reads / (bases in genome / read length) --keep Keep prealignment BAM files --noFIFO Do NOT use named pipes during prealignments. - --complexity Disable library complexity calculation (faster). + --no-complexity Disable library complexity calculation (faster). + --prioritize Plot FRiF/PRiF using mutually exclusive priority + ranked features based on the order of feature + appearance in the feature annotation asset. -V, --version show program's version number and exit required named arguments: diff --git a/examples/meta/peppro_test.yaml b/examples/meta/peppro_test.yaml index 1e9611f..44d654a 100644 --- a/examples/meta/peppro_test.yaml +++ b/examples/meta/peppro_test.yaml @@ -3,8 +3,8 @@ name: test metadata: sample_annotation: "peppro_test.csv" - output_dir: "$HOME/peppro_test/" - pipeline_interfaces: "$HOME/peppro/pipeline_interface.yaml" + output_dir: "$PROCESSED/peppro/peppro_test/" + pipeline_interfaces: "$CODE/peppro/pipeline_interface.yaml" derived_columns: [read1] @@ -15,23 +15,19 @@ implied_columns: organism: human: genome: hg38 - prealignments: human_rDNA rCRSd - adapter: fastp # Default - dedup: seqkit # Default - trimmer: seqtk # Default - runon: pro # Default - umi_len: 8 # Default - max_len: 30 # Default - TSS_name: $HOME/genomes/hg38/hg38_TSS.tsv # Default. Pipeline checks corresponding genome folder without specifying. - CpA_name: $HOME/genomes/hg38/hg38_CpA.tsv # Default. Pipeline checks corresponding genome folder without specifying. - pre_name: $HOME/genomes/hg38/hg38_pre-mRNA.tsv # Default. Pipeline checks corresponding genome folder without specifying. - anno_name: $HOME/genomes/hg38/hg38_annotations.bed.gz # Default. Pipeline checks corresponding genome folder without specifying. 
+ prealignments: human_rDNA + adapter: cutadapt # Default + dedup: seqkit # Default + trimmer: seqtk # Default + protocol: pro # Default + umi_len: 0 # Default + max_len: -1 # Disable length trimming pipeline_args: # peppro.py: +# "--prioritize": null # Default is FALSE. Pass flag to prioritize features by the order they appear in the feat_annotation asset when calculating FRiF/PRiF # "--sob": null # Default is FALSE. Pass flag to use seqOutBias for signal track generation and to incorporate mappability # "--scale": null # Default is FALSE. Pass flag to scale seqOutBias signal tracks -# "--parts": 4 # Default. Split suffix tree generation into <4> parts. Higher number == less memory use. # "--coverage": null # Default is FALSE. Pass flag to use coverage when producing library complexity plots. # "--keep": null # Default is FALSE. Pass flag to keep prealignment BAM files. # "--noFIFO": null # Default is FALSE. Pass flag to NOT use named pipes during prealignments. diff --git a/mkdocs.yml b/mkdocs.yml index 1ef40ce..e8e6ab8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -3,7 +3,7 @@ theme: databio site_name: peppro site_author: Jason Smith site_url: http://peppro.databio.org/ -site_logo: img/peppro_inline_logo.svg +site_logo: img/peppro_logo_gray.svg repo_url: https://github.com/databio/peppro/ #google_analytics: ['UA-127092878-1', 'code.databio.org/peppro'] @@ -18,13 +18,14 @@ nav: - Install and run: 'install.md' - Extended tutorial: 'tutorial.md' - How-to Guides: - - Annotation files: 'howto/annotation_files/' - - Configuring prealignments: 'howto/prealignments/' - - Detailed install guide: 'howto/detailed_install.md' - - Running on a cluster: 'howto/run_cluster/' - - Running in a container: 'howto/use_container/' - - Using looper: 'howto/use_looper/' + - Run PEPPRO directly: 'run_direct.md' + - Run PEPPRO on a cluster: 'cluster.md' + - Run PEPPRO in containers: 'container.md' + - Configure prealignments: 'prealignments.md' + - Detailed install guide: 
'detailed_install.md' + - Use custom reference data: 'annotation.md' - Reference: + - FAQ: 'faq.md' - Usage: 'usage.md' - Changelog: 'changelog.md' - Support: 'https://github.com/databio/peppro/issues' diff --git a/peppro_bulker_manifest.yaml b/peppro_bulker_manifest.yaml new file mode 100644 index 0000000..2681d37 --- /dev/null +++ b/peppro_bulker_manifest.yaml @@ -0,0 +1,32 @@ +manifest: + name: peppro + version: 1.0.0 + commands: + - command: samtools + dockerargs: "-i" + docker_image: quay.io/biocontainers/samtools:1.9--h91753b0_8 + - command: bowtie2 + docker_image: quay.io/biocontainers/bowtie2:2.3.5--py37he860b03_0 + - command: seqkit + docker_image: quay.io/biocontainers/seqkit:0.10.2--0 + - command: fastp + docker_image: quay.io/biocontainers/fastp:0.20.0--hdbcaa40_0 + - command: seqtk + dockerargs: "-i" + docker_image: quay.io/biocontainers/seqtk:1.3--hed695b0_2 + - command: preseq + docker_image: quay.io/biocontainers/preseq:2.0.3--h26b358d_2 + - command: fastq_pair + docker_image: quay.io/biocontainers/fastq-pair:1.0--hf484d3e_0 + - command: wigToBigWig + docker_image: quay.io/biocontainers/ucsc-wigtobigwig:357--h35c10e6_3 + - command: bigWigCat + docker_image: quay.io/biocontainers/ucsc-bigwigcat:357--1 + - command: fastqc + docker_image: quay.io/biocontainers/fastqc:0.11.8--1 + - command: pigz + docker_image: nsheff/pigz + - command: cutadapt + docker_image: quay.io/biocontainers/cutadapt:2.4--py37h14c3975_0 + - command: flash + docker_image: quay.io/biocontainers/flash:1.2.11--hed695b0_5 \ No newline at end of file diff --git a/pipeline_interface.yaml b/pipeline_interface.yaml index a0e9725..239c51a 100644 --- a/pipeline_interface.yaml +++ b/pipeline_interface.yaml @@ -1,13 +1,19 @@ protocol_mapping: - PRO: peppro.py - pro: peppro.py - PRO-SEQ: peppro.py - PRO-seq: peppro.py - PROSEQ: peppro.py - proseq: peppro.py + PRO: peppro + pro: peppro + GRO: peppro + gro: peppro + PRO-SEQ: peppro + PRO-seq: peppro + PROSEQ: peppro + proseq: peppro + groseq: 
peppro + GROSEQ: peppro + GRO-SEQ: peppro + GRO-seq: peppro pipelines: - peppro.py: + peppro: name: PEPPRO path: pipelines/peppro.py looper_args: True @@ -21,29 +27,43 @@ pipelines: "--single-or-paired": read_type optional_arguments: "--input2": read2 - "--runon": runon - "--adapter": adapter - "--dedup": dedup - "--trimmer": trimmer - "--umi": umi - "--umi_len": umi_len - "--max_len": max_len + "--protocol": protocol + "--adapter-tool": adapter + "--dedup-tool": dedup + "--trimmer-tool": trimmer + "--umi-len": umi_len + "--max-len": max_len "--sob": sob "--scale": scale - "--parts": parts "--prealignments": prealignments "--TSS-name": TSS_name "--pre-name": pre_name "--anno-name": anno_name + "--search-file": search_file "--coverage": coverage "--keep": keep "--noFIFO": no_fifo "--complexity": complexity + "--prioritize": prioritize compute: singularity_image: ${SIMAGES}peppro docker_image: databio/peppro + bulker_crate: peppro + outputs: + cutadapt_report: "cutadapt/{sample.sample_name}_cutadapt.txt" + plus_bw: "signal_{sample.genome}/{sample.sample_name}_plus_body_0-mer.bw" + minus_bw: "signal_{sample.genome}/{sample.sample_name}_minus_body_0-mer.bw" + plus_bam: "aligned_{sample.genome}/{sample.sample_name}_plus.bam" + minus_bam: "aligned_{sample.genome}/{sample.sample_name}_minus.bam" + gene_counts_bed: "signal_{sample.genome}/{sample.sample_name}_gene_coverage.bed" + pause_indicies_bed: "QC_{sample.genome}/{sample.sample_name}_pause_index.bed.gz" + mrna_contamination_bed: "QC_{sample.genome}/{sample.sample_name}_exon_intron_ratios.bed.gz" summarizers: - - tools/PEPPRO_summarizer.R + - tools/PEPPRO_complexity_curves.R + - tools/PEPPRO_counts.R + bioconductor: + readFunName: readPepproGeneCounts + readFunPath: BiocProject/readPepproGeneCounts.R summary_results: - library_complexity_file: caption: "Library complexity file" diff --git a/pipelines/peppro.py b/pipelines/peppro.py index 3a0c9c5..8777390 100755 --- a/pipelines/peppro.py +++ b/pipelines/peppro.py @@ 
-5,7 +5,7 @@ __author__ = ["Jason Smith", "Nathan Sheffield", "Mike Guertin"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.8.0" +__version__ = "0.8.6" from argparse import ArgumentParser @@ -19,12 +19,21 @@ from refgenconf import RefGenConf as RGC, select_genome_config TOOLS_FOLDER = "tools" -RUNON_SOURCE = ["pro", "gro"] -ADAPTER_REMOVAL = ["fastp", "cutadapt"] +RUNON_SOURCE_PRO = ["PRO", "pro", "PRO-SEQ", "PRO-seq", "proseq", "PROSEQ"] +RUNON_SOURCE_GRO = ["GRO", "gro", "groseq", "GROSEQ", "GRO-SEQ", "GRO-seq"] +RUNON_SOURCE = RUNON_SOURCE_PRO + RUNON_SOURCE_GRO + +ADAPTER_REMOVERS = ["cutadapt", "fastp"] DEDUPLICATORS = ["seqkit", "fqdedup"] TRIMMERS = ["seqtk", "fastx"] -BT2_IDX_KEY = "bowtie2_index" +DEFAULT_REMOVER = "cutadapt" +DEFAULT_DEDUPLICATOR = "seqkit" +DEFAULT_TRIMMER = "seqtk" + +BT2_IDX_KEY = "bowtie2_index" +DEFAULT_UMI_LEN = 0 +DEFAULT_MAX_LEN = 30 def parse_arguments(): """ @@ -38,35 +47,36 @@ def parse_arguments(): required=["input", "genome", "sample-name", "output-parent"]) # Pipeline-specific arguments - parser.add_argument("--runon", dest="runon", + parser.add_argument("--protocol", dest="protocol", default="pro", choices=RUNON_SOURCE, help="Run on sequencing type.") - parser.add_argument("--adapter", dest="adapter", - default="fastp", choices=ADAPTER_REMOVAL, - help="Name of adapter removal program") - - parser.add_argument("--dedup", dest="dedup", - default="seqkit", choices=DEDUPLICATORS, - help="Name of program that removes duplicate reads") + parser.add_argument("--adapter-tool", dest="adapter", + default=DEFAULT_REMOVER, choices=ADAPTER_REMOVERS, + help="Name of adapter removal program. " + "Default: {}".format(DEFAULT_REMOVER)) - parser.add_argument("--trimmer", dest="trimmer", - default="seqtk", choices=TRIMMERS, - help="Name of read trimming program") + parser.add_argument("--dedup-tool", dest="dedup", + default=DEFAULT_DEDUPLICATOR, choices=DEDUPLICATORS, + help="Program to use to duplicate reads. 
" + "Default: {}".format(DEFAULT_DEDUPLICATOR)) - parser.add_argument("--umi", action='store_true', default=False, - dest="umi", - help="Remove umi with fastp") + parser.add_argument("--trimmer-tool", dest="trimmer", + default=DEFAULT_TRIMMER, choices=TRIMMERS, + help="Name of read trimming program. " + "Default: {}".format(DEFAULT_TRIMMER)) - parser.add_argument("--umi_len", dest="umi_len", - default="8", + parser.add_argument("--umi-len", + default=DEFAULT_UMI_LEN, type=int, help="Specify the length of the UMI." - "If your data does not utilize UMIs, set to 0.") + "If your data does not utilize UMIs, set to 0. " + "Default: {}".format(DEFAULT_UMI_LEN)) - parser.add_argument("--max_len", dest="max_len", - default="30", - help="Trim reads to maximum length." - " Set to -1 to disable length trimming.") + parser.add_argument("--max-len", + default=DEFAULT_MAX_LEN, + help="Trim reads to maximum length. " + "Set to -1 to disable length trimming. " + "Default: {}".format(DEFAULT_MAX_LEN)) parser.add_argument("--sob", action='store_true', dest="sob", default=False, @@ -78,11 +88,6 @@ def parse_arguments(): help="Scale output with seqOutBias when producing" " signal tracks.") - parser.add_argument("--parts", dest="parts", - default="4", - help="Split suffix tree generation into parts. 
" - "Increase this value to lower memory use.") - parser.add_argument("--prealignments", default=[], type=str, nargs="+", help="Space-delimited list of reference genomes to " "align to before primary alignment.") @@ -92,11 +97,11 @@ def parse_arguments(): help="file_name of TSS annotation file.") parser.add_argument("--pi-tss", default=None, - dest="pi_tss", type=str, + dest="ensembl_tss", type=str, help="file_name of pause index TSS annotation file.") parser.add_argument("--pi-body", default=None, - dest="pi_body", type=str, + dest="ensembl_gene_body", type=str, help="file_name of pause index gene body annotation file.") parser.add_argument("--pre-name", default=None, @@ -115,6 +120,11 @@ def parse_arguments(): dest="intron_name", type=str, help="file_name of intron annotation file.") + parser.add_argument("--search-file", default=None, + dest="search_file", type=str, + help="file_name of read length matched gt tallymer " + "index search file") + parser.add_argument("--coverage", action='store_true', default=False, dest="coverage", help="Report library complexity using coverage: " @@ -128,10 +138,16 @@ def parse_arguments(): dest="no_fifo", help="Do NOT use named pipes during prealignments.") - parser.add_argument("--complexity", action='store_false', default=True, + parser.add_argument("--no-complexity", action='store_true', default=False, dest="complexity", help="Disable library complexity calculation (faster).") + parser.add_argument("--prioritize", action='store_true', default=False, + dest="prioritize", + help="Plot FRiF/PRiF using mutually exclusive priority" + " ranked features based on the order of feature" + " appearance in the feature annotation asset.") + parser.add_argument("-V", "--version", action="version", version="%(prog)s {v}".format(v=__version__)) @@ -144,71 +160,64 @@ def parse_arguments(): return args -def _process_fastq(args, tools, read2, fq_file, outfolder): +def _remove_adapters(args, res, tools, read2, fq_file, outfolder): """ - A helper 
function to prepare read files for downstream processing. + A helper function to build a command for adapter removal. :param argparse.Namespace args: binding between option name and argument, e.g. from parsing command-line options + :param looper.models.AttributeDict res: binding between resources and + value, e.g. for resources used by the pipeline :param looper.models.AttributeDict tools: binding between tool name and - value, e.g. for tools/resources used by the pipeline - :param bool read2: if True, use paired-end processing + value, e.g. for tools used by the pipeline + :param bool read2: if True, do not deduplicate and do not retain + intermediate files :param str fq_file: path to FASTQ file :param str outfolder: path to output directory for the pipeline - :return (str, str): pair (R1, R2) of paths to FASTQ files + :return str: command to remove adapters """ - # Create names for processed FASTQ files. + + sname = args.sample_name # for concise code + + cutadapt_folder = os.path.join(outfolder, "cutadapt") + fastp_folder = os.path.join(outfolder, "fastp") fastq_folder = os.path.join(outfolder, "fastq") - fastqc_folder=os.path.join(outfolder, "fastqc") - - noadap_fastq = os.path.join( - fastq_folder, args.sample_name + "_R1_noadap.fastq") - dedup_fastq = os.path.join( - fastq_folder, args.sample_name + "_R1_dedup.fastq") - trimmed_fastq = os.path.join( - fastq_folder, args.sample_name + "_R1_trimmed.fastq") - processed_fastq = os.path.join( - fastq_folder, args.sample_name + "_R1_processed.fastq") - - adapter_html = os.path.join( - fastqc_folder, args.sample_name + "_R1_rmAdapter.html") - adapter_json = os.path.join( - fastqc_folder, args.sample_name + "_R1_rmAdapter.json") - adapter_report = os.path.join( - fastqc_folder, args.sample_name + "_R1_rmAdapter.txt") - umi_report = os.path.join( - fastqc_folder, args.sample_name + "_R1_rmUmi.html") - umi_json = os.path.join( - fastqc_folder, args.sample_name + "_R1_rmUmi.json") - - # PE2 names - noadap_fastq_R2 = 
os.path.join( - fastq_folder, args.sample_name + "_R2_noadap.fastq") - trimmed_fastq_R2 = os.path.join( - fastq_folder, args.sample_name + "_R2_trimmed.fastq") - trimmed_dups_fastq_R2 = os.path.join( - fastq_folder, args.sample_name + "_R2_trimmed_dups.fastq") - - adapter_html_R2 = os.path.join( - fastqc_folder, args.sample_name + "_R2_rmAdapter.html") - adapter_json_R2 = os.path.join( - fastqc_folder, args.sample_name + "_R2_rmAdapter.json") - adapter_report_R2 = os.path.join( - fastqc_folder, args.sample_name + "_R2_rmAdapter.txt") - umi_report_R2 = os.path.join( - fastqc_folder, args.sample_name + "_R2_rmUmi.html") - umi_json_R2 = os.path.join( - fastqc_folder, args.sample_name + "_R2_rmUmi.json") - - # If single-end, must use cutadapt for plotting purposes - if not args.paired_end: - args.adapter = "cutadapt" + + if read2: + cutadapt_report = os.path.join(cutadapt_folder, sname + "_R2_cutadapt.txt") + noadap_fastq = os.path.join(fastq_folder, sname + "_R2_noadap.fastq") + fastp_pfx = os.path.join(fastp_folder, sname + "_R2_fastp_adapter") + else: + cutadapt_report = os.path.join(cutadapt_folder, sname + "_R1_cutadapt.txt") + noadap_fastq = os.path.join(fastq_folder, sname + "_R1_noadap.fastq") + fastp_pfx = os.path.join(fastp_folder, sname + "_R1_fastp_adapter") + + fastp_report_txt = fastp_pfx + ".txt" + fastp_report_html = fastp_pfx + ".html" + fastp_report_json = fastp_pfx + ".json" + + if _itsa_file(res.adapters): + five_prime = pm.checkprint("awk '/5prime/{getline; print}' " + + res.adapters) or "TGGAATTCTCGGGTGCCAAGG" + three_prime = pm.checkprint("awk '/3prime/{getline; print}' " + + res.adapters) or "GATCGTCGGACTGTAGAACTCTGAAC" + else: + # Default to the hardcoded values as a fallback + five_prime = "TGGAATTCTCGGGTGCCAAGG" + three_prime = "GATCGTCGGACTGTAGAACTCTGAAC" + + # Setup report output folders + if args.adapter == "cutadapt": + ngstk.make_dir(cutadapt_folder) + adapter_report = cutadapt_report + elif args.adapter == "fastp": + 
ngstk.make_dir(fastp_folder) + adapter_report = fastp_report_txt # Check quality encoding for use with FastX_Tools if args.trimmer == "fastx": encoding = _guess_encoding(fq_file) - #print("Encoding: {}".format(str(encoding))) # DEBUG - + # Create adapter trimming command(s). if args.adapter == "fastp": adapter_cmd_chunks = [ @@ -219,68 +228,54 @@ def _process_fastq(args, tools, read2, fq_file, outfolder): ] if read2: adapter_cmd_chunks.extend([ - ("--adapter_sequence", "GATCGTCGGACTGTAGAACTCTGAAC"), - ("--length_required", (18 + int(float(args.umi_len)))), - ("--html", adapter_html_R2), - ("--json", adapter_json_R2), - ("--report_title", ("'" + args.sample_name + "'")) + ("--adapter_sequence", three_prime) ]) else: adapter_cmd_chunks.extend([ - ("--adapter_sequence", "TGGAATTCTCGGGTGCCAAGG"), - ("--length_required", (18 + int(float(args.umi_len)))), - ("--html", adapter_html), - ("--json", adapter_json), - ("--report_title", ("'" + args.sample_name + "'")) + ("--adapter_sequence", five_prime) ]) - if args.complexity and not read2: - adapter_cmd_chunks.extend([("-o", noadap_fastq)]) - else: - adapter_cmd_chunks.extend([("--stdout")]) + adapter_cmd_chunks.extend([ + #("--length_required", (18 + int(float(args.umi_len)))), + ("--length_required", 5), # For insert size plotting + ("--html", fastp_report_html), + ("--json", fastp_report_json), + ("--report_title", ("'" + sname + "'")) + ]) + # If calculating library complexity and this is read 1 or single-end, + # must produce an intermediate file. 
+ #if not args.complexity and args.umi_len > 0 and not read2 : + adapter_cmd_chunks.extend([("-o", noadap_fastq)]) # Must keep intermediates always now + # else: + # adapter_cmd_chunks.extend([("--stdout")]) adapter_cmd_chunks.extend([ - (") 2>", adapter_report) + (") 2>", fastp_report_txt) ]) adapter_cmd = build_command(adapter_cmd_chunks) elif args.adapter == "cutadapt": + # Must keep intermediates always now cut_version = float(pm.checkprint("cutadapt --version")) - + adapter_cmd_chunks = ["(" + tools.cutadapt] + # old versions of cutadapt can not use multiple cores + if cut_version >= 1.15: + adapter_cmd_chunks.extend([("-j", str(pm.cores))]) + adapter_cmd_chunks.extend([ + #("-m", (18 + int(float(args.umi_len)))), + ("-m", 5), # For insert size plotting + ("-O", 1) + ]) if read2: - adapter_cmd_chunks = [tools.cutadapt] - # old versions of cutadapt can not use multiple cores - if cut_version >= 1.15: - adapter_cmd_chunks.extend([("-j", str(pm.cores))]) - adapter_cmd_chunks.extend([ - ("-m", (18 + int(float(args.umi_len)))), - ("-a", "GATCGTCGGACTGTAGAACTCTGAAC"), - fq_file - ]) + adapter_cmd_chunks.extend([("-a", three_prime)]) else: - if args.complexity and args.umi_len > 0: - adapter_cmd_chunks = ["(" + tools.cutadapt] - # old versions of cutadapt can not use multiple cores - if cut_version >= 1.15: - adapter_cmd_chunks.extend([("-j", str(pm.cores))]) - adapter_cmd_chunks.extend([ - ("-m", (18 + int(float(args.umi_len)))), - ("-a", "TGGAATTCTCGGGTGCCAAGG"), - fq_file, - ("-o", noadap_fastq + ")"), - (">", adapter_report) - ]) - else: - adapter_cmd_chunks = [tools.cutadapt] - # old versions of cutadapt can not use multiple cores - if cut_version >= 1.15: - adapter_cmd_chunks.extend([("-j", str(pm.cores))]) - adapter_cmd_chunks.extend([ - ("-m", (18 + int(float(args.umi_len)))), - ("-a", "TGGAATTCTCGGGTGCCAAGG"), - fq_file - ]) + adapter_cmd_chunks.extend([("-a", five_prime)]) + adapter_cmd_chunks.extend([ + fq_file, + ("-o", noadap_fastq + ")"), + (">", 
cutadapt_report) + ]) adapter_cmd = build_command(adapter_cmd_chunks) @@ -294,34 +289,56 @@ def _process_fastq(args, tools, read2, fq_file, outfolder): ] if read2: adapter_cmd_chunks.extend([ - ("--adapter_sequence", "GATCGTCGGACTGTAGAACTCTGAAC"), - ("--length_required", (18 + int(float(args.umi_len)))), - ("--html", adapter_html_R2), - ("--json", adapter_json_R2), - ("--report_title", ("'" + args.sample_name + "'")) + ("--adapter_sequence", three_prime) ]) else: adapter_cmd_chunks.extend([ - ("--adapter_sequence", "TGGAATTCTCGGGTGCCAAGG"), - ("--length_required", (18 + int(float(args.umi_len)))), - ("--html", adapter_html), - ("--json", adapter_json), - ("--report_title", ("'" + args.sample_name + "'")) + ("--adapter_sequence", five_prime) ]) - if args.complexity and not read2: - adapter_cmd_chunks.extend([("-o", noadap_fastq)]) - else: - adapter_cmd_chunks.extend([("--stdout")]) + adapter_cmd_chunks.extend([ + #("--length_required", (18 + int(float(args.umi_len)))), + ("--length_required", 5), # For insert size plotting + ("--html", fastp_report_html), + ("--json", fastp_report_json), + ("--report_title", ("'" + sname + "'")) + ]) + # If calculating library complexity and this is read 1 or single-end, + # must produce an intermediate file. + #if not args.complexity and args.umi_len > 0 and not read2 : + adapter_cmd_chunks.extend([("-o", noadap_fastq)]) # Must keep intermediates always now + # else: + # adapter_cmd_chunks.extend([("--stdout")]) adapter_cmd_chunks.extend([ - (") 2>", adapter_report) + (") 2>", fastp_report_txt) ]) adapter_cmd = build_command(adapter_cmd_chunks) + return adapter_cmd + + +def _deduplicate(args, tools, fq_file, outfolder): + """ + A helper function to build a command for deduplication. + + :param argparse.Namespace args: binding between option name and argument, + e.g. from parsing command-line options + :param looper.models.AttributeDict tools: binding between tool name and + value, e.g. 
for tools/resources used by the pipeline + :param str fq_file: path to FASTQ file + :param str outfolder: path to output directory for the pipeline + :return str: command to remove adapters + """ + sname = args.sample_name # for concise code + + fastq_folder = os.path.join(outfolder, "fastq") + noadap_fastq = os.path.join(fastq_folder, sname + "_R1_noadap.fastq") + dedup_fastq = os.path.join(fastq_folder, sname + "_R1_dedup.fastq") + # Create deduplication command(s). - if not read2 and not args.umi_len <=0: + if not args.complexity and args.umi_len > 0: if args.dedup == "seqkit": dedup_cmd_chunks = [ (tools.seqkit, "rmdup"), @@ -330,26 +347,18 @@ def _process_fastq(args, tools, read2, fq_file, outfolder): "--ignore-case", "-o" ] - if args.complexity and args.umi_len > 0: + if not args.complexity: dedup_cmd_chunks.extend([ (dedup_fastq, noadap_fastq) ]) else: dedup_cmd_chunks.extend(["-"]) - dedup_cmd = build_command(dedup_cmd_chunks) - elif args.dedup == "fqdedup": dedup_cmd_chunks = [tools.fqdedup] - if args.complexity and args.umi_len > 0: - dedup_cmd_chunks.extend([("-i", noadap_fastq)]) - dedup_cmd_chunks.extend([("-o", dedup_fastq)]) - else: - dedup_cmd_chunks.extend([("-i", "-")]) - dedup_cmd_chunks.extend([("-o", "-")]) - + dedup_cmd_chunks.extend([("-i", noadap_fastq)]) + dedup_cmd_chunks.extend([("-o", dedup_fastq)]) dedup_cmd = build_command(dedup_cmd_chunks) - else: # Default to seqkit dedup_cmd_chunks = [ @@ -359,665 +368,962 @@ def _process_fastq(args, tools, read2, fq_file, outfolder): "--ignore-case", "-o" ] - if args.complexity and args.umi_len > 0: - dedup_cmd_chunks.extend([ - (dedup_fastq, noadap_fastq) - ]) - else: - dedup_cmd_chunks.extend(["-"]) - + dedup_cmd_chunks.extend([ + (dedup_fastq, noadap_fastq) + ]) dedup_cmd = build_command(dedup_cmd_chunks) + else: + # Don't deduplicate a read2 file nor deduplicate if there are no UMI's + dedup_cmd = "" - # Create trimming and reverse complementing command(s). 
- # TODO: Can also use seqkit for these steps instead of seqtk... - if args.umi: - if args.adapter != "fastp": - print("To remove UMI intelligently, you must process your reads using 'fastp'") - print("Defaulting to removing the first {} " - "bp instead via trimming".format(str(args.umi_len))) - if args.trimmer == "seqtk": - if read2: - trim_cmd_chunks_R2 = [ - tools.seqtk, - "trimfq", - ("-e", str(args.umi_len)) - ] - trim_cmd_chunks_R2.extend(["-"]) - if args.runon.lower() == "gro": - trim_cmd_chunks_R2.extend([ - (">", trimmed_fastq_R2) - ]) - else: - trim_cmd_chunks_R2.extend(["|"]) - trim_cmd_chunks_R2.extend([ - tools.seqtk, - ("seq", "-r"), - ("-", ">"), - trimmed_fastq_R2 - ]) - else: - trim_cmd_chunks = [ - tools.seqtk, - "trimfq", - ("-b", str(args.umi_len)) - ] - - if args.max_len != -1: - trim_cmd_chunks.extend([ - ("-L", str(args.max_len)) - ]) - if args.complexity and args.umi_len > 0: - # Need undeduplicated results for complexity calculation - #trim_cmd_chunks_nodedup = trim_cmd_chunks.copy() # python3 - trim_cmd_chunks_nodedup = list(trim_cmd_chunks) - trim_cmd_chunks_nodedup.extend([noadap_fastq]) - trim_cmd_chunks.extend([dedup_fastq]) - else: - trim_cmd_chunks.extend(["-"]) - - if args.runon.lower() == "gro": - trim_cmd_chunks.extend([ - (">", processed_fastq) - ]) - trim_cmd_chunks_nodedup.extend([ - (">", trimmed_fastq) - ]) - else: - trim_cmd_chunks.extend(["|"]) - trim_cmd_chunks.extend([ - tools.seqtk, - ("seq", "-r"), - ("-", ">"), - processed_fastq - ]) - trim_cmd_chunks_nodedup.extend(["|"]) - trim_cmd_chunks_nodedup.extend([ - tools.seqtk, - ("seq", "-r"), - ("-", ">"), - trimmed_fastq - ]) - - elif args.trimmer == "fastx": - trim_tool = tools.fastx + "_trimmer" - rc_tool = tools.fastx + "_reverse_complement" - trim_cmd_chunks = [trim_tool] - - if encoding == "Illumina-1.8": - trim_cmd_chunks.extend([ - ("-Q", str(33)) - ]) + return dedup_cmd + + +def _trim_deduplicated_files(args, tools, fq_file, outfolder): + """ + A helper function 
to build a command for read trimming using fastq files + that have been deduplicated. + + :param argparse.Namespace args: binding between option name and argument, + e.g. from parsing command-line options + :param looper.models.AttributeDict tools: binding between tool name and + value, e.g. for tools/resources used by the pipeline + :param str fq_file: path to FASTQ file + :param str outfolder: path to output directory for the pipeline + :return str: command to trim adapter trimmed and deduplicated reads + """ + + # Only call this when args.complexity32 and args.umi_len > 0 + sname = args.sample_name # for concise code + + fastq_folder = os.path.join(outfolder, "fastq") + dedup_fastq = os.path.join(fastq_folder, sname + "_R1_dedup.fastq") + #processed_fastq = os.path.join(fastq_folder, sname + "_R1_processed.fastq") + #trimmed_fastq = os.path.join(fastq_folder, sname + "_R1_trimmed.fastq") + processed_fastq = os.path.join(fastq_folder, sname + "_R1_trimmed.fastq") + + fastp_folder = os.path.join(outfolder, "fastp") + umi_report = os.path.join(fastp_folder, sname + "_R1_rmUmi.html") + umi_json = os.path.join(fastp_folder, sname + "_R1_rmUmi.json") + + # Check quality encoding for use with FastX_Tools + if args.trimmer == "fastx": + encoding = _guess_encoding(fq_file) + + if args.adapter == "fastp": + # Remove UMI by specifying location of UMI + # Location is still read1 because it's being treated as SE data + trim_cmd_chunks = [ + tools.fastp, + ("--thread", str(pm.cores)), + ("-i", dedup_fastq), + "--stdout", + "--umi", + ("--umi_loc", "read1"), + ("--umi_len", args.umi_len), + ("--html", umi_report), + ("--json", umi_json) + ] + + # Trim to max length if specified + if int(args.max_len) > 0: + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "trimfq") + ("-L", args.max_len), + "-" + ]) + + # Remove too short reads + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5) + ]) + + # Reverse complement for PRO-seq + if args.protocol.lower() in 
RUNON_SOURCE_PRO: + trim_cmd_chunks.extend([("-r", "-")]) + else: + trim_cmd_chunks.extend(["-"]) + + trim_cmd_chunks.extend([(">", processed_fastq)]) + + elif args.trimmer == "seqtk": + # Remove UMI by blind trimming + trim_cmd_chunks = [ + tools.seqtk, + "trimfq", + ("-b", str(args.umi_len)) + ] + + # Trim to max length if specified + if int(args.max_len) > 0: + trim_cmd_chunks.extend([("-L", str(args.max_len))]) + + trim_cmd_chunks.extend([dedup_fastq]) + + # Do not reverse complement for GRO-seq + if args.protocol.lower() in RUNON_SOURCE_GRO: + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5), + "-", + (">", processed_fastq) + ]) + else: + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5), + ("-r", "-"), + (">", processed_fastq) + ]) + elif args.trimmer == "fastx": + trim_tool = tools.fastx + "_trimmer" + rc_tool = tools.fastx + "_reverse_complement" + trim_cmd_chunks = [trim_tool] + + if encoding == "Illumina-1.8": + trim_cmd_chunks.extend([ + ("-Q", str(33)) + ]) + + # Remove UMI blindly + trim_cmd_chunks.extend([("-f", str(int(float(args.umi_len)) + 1))]) + + # Trim to max length if specified + if int(args.max_len) > 0: + trim_cmd_chunks.extend([ + ("-l", (str(int(float(args.max_len)) + int(float(args.umi_len))))) + ]) + + trim_cmd_chunks.extend([("-i", dedup_fastq)]) + + # Do not reverse complement if GRO-seq + if args.protocol.lower() in RUNON_SOURCE_GRO: + trim_cmd_chunks.extend([("-o", processed_fastq)]) + else: + trim_cmd_chunks.extend([("|", rc_tool)]) + if encoding == "Illumina-1.8": trim_cmd_chunks.extend([ - ("-f", str(int(float(args.umi_len)) + 1)) + ("-Q", str(33)) ]) - if args.max_len != -1: - trim_cmd_chunks.extend([ - ("-l", (str(int(float(args.max_len)) + int(float(args.umi_len))))) - ]) - if read2: - trim_cmd_chunks_R2 = [trim_tool] - if encoding == "Illumina-1.8": - trim_cmd_chunks_R2.extend([ - ("-Q", str(33)) - ]) - trim_cmd_chunks_R2.extend([ - ("-t", str(int(float(args.umi_len)))) - ]) - if 
args.runon.lower() == "gro": - trim_cmd_chunks_R2.extend([ - ("-o", trimmed_fastq_R2) - ]) - else: - trim_cmd_chunks_R2.extend([ - ("|", rc_tool) - ]) - if encoding == "Illumina-1.8": - trim_cmd_chunks_R2.extend([ - ("-Q", str(33)) - ]) - trim_cmd_chunks_R2.extend([ - ("-o", trimmed_fastq_R2) - ]) - else: - if args.complexity and args.umi_len > 0: - # Need undeduplicated results for complexity calculation - #trim_cmd_chunks_nodedup = trim_cmd_chunks.copy() #python3 - trim_cmd_chunks_nodedup = list(trim_cmd_chunks) - trim_cmd_chunks_nodedup.extend([ - ("-i", noadap_fastq) - ]) - trim_cmd_chunks.extend([ - ("-i", dedup_fastq) - ]) - - if args.runon.lower() == "gro": - trim_cmd_chunks.extend([ - ("-o", processed_fastq) - ]) - trim_cmd_chunks_nodedup.extend([ - ("-o", trimmed_fastq) - ]) - else: - trim_cmd_chunks.extend([ - ("|", rc_tool) - ]) - if encoding == "Illumina-1.8": - trim_cmd_chunks.extend([ - ("-Q", str(33)) - ]) - trim_cmd_chunks_nodedup.extend([ - ("-Q", str(33)) - ]) - trim_cmd_chunks.extend([ - ("-o", processed_fastq) - ]) - trim_cmd_chunks_nodedup.extend([ - ("-o", trimmed_fastq) - ]) - - else: - # Default to seqtk - if read2: - trim_cmd_chunks_R2 = [ - tools.seqtk, - "trimfq", - ("-e", str(args.umi_len)) - ] - if args.max_len != -1: - trim_cmd_chunks_R2.extend([ - ("-L", str(args.max_len)) - ]) - trim_cmd_chunks_R2.extend(["-"]) - if args.runon.lower() == "gro": - trim_cmd_chunks_R2.extend([ - (">", trimmed_fastq_R2) - ]) - else: - trim_cmd_chunks_R2.extend(["|"]) - trim_cmd_chunks_R2.extend([ - tools.seqtk, - ("seq", "-r"), - ("-", ">"), - trimmed_fastq_R2 - ]) - else: - trim_cmd_chunks = [ - tools.seqtk, - "trimfq", - ("-b", str(args.umi_len)) - ] - if args.max_len != -1: - trim_cmd_chunks.extend([ - ("-L", str(args.max_len)) - ]) - if args.complexity and args.umi_len > 0: - # Need undeduplicated results for complexity calculation - #trim_cmd_chunks_nodedup = trim_cmd_chunks.copy() # python3 - trim_cmd_chunks_nodedup = list(trim_cmd_chunks) - 
trim_cmd_chunks_nodedup.extend([noadap_fastq]) - trim_cmd_chunks.extend([dedup_fastq]) - else: - trim_cmd_chunks.extend(["-"]) - - if args.runon.lower() == "gro": - trim_cmd_chunks.extend([ - (">", processed_fastq) - ]) - trim_cmd_chunks_nodedup.extend([ - (">", trimmed_fastq) - ]) - else: - trim_cmd_chunks.extend(["|"]) - trim_cmd_chunks.extend([ - tools.seqtk, - ("seq", "-r"), - ("-", ">"), - processed_fastq - ]) - trim_cmd_chunks_nodedup.extend(["|"]) - trim_cmd_chunks_nodedup.extend([ - tools.seqtk, - ("seq", "-r"), - ("-", ">"), - trimmed_fastq - ]) + trim_cmd_chunks.extend([("-o", processed_fastq)]) + else: + # Default to seqtk + trim_cmd_chunks = [ + tools.seqtk, + "trimfq", + ("-b", str(args.umi_len)) + ] + + if int(args.max_len) > 0: + trim_cmd_chunks.extend([("-L", str(args.max_len))]) + + trim_cmd_chunks.extend([dedup_fastq]) + + # Do not reverse complement for GRO-seq + if args.protocol.lower() in RUNON_SOURCE_GRO: + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5), + "-", + (">", processed_fastq) + ]) else: - if read2: - trim_cmd_chunks_R2 = [ - tools.fastp, - ("--thread", str(pm.cores)), - ("--stdin", "--stdout"), - "--umi", - ("--umi_loc", "read2"), - ("--umi_len", args.umi_len), - ("--html", umi_report_R2), - ("--json", umi_json_R2), + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5), + ("-r", "-"), + (">", processed_fastq) + ]) + + trim_cmd = build_command(trim_cmd_chunks) + pm.debug("trim_deduplicated_cmd: {}".format(build_command(trim_cmd_chunks))) + + return trim_cmd + + +def _trim_adapter_files(args, tools, read2, fq_file, outfolder): + """ + A helper function to build a command for read trimming using fastq files + without deduplication. + + :param argparse.Namespace args: binding between option name and argument, + e.g. from parsing command-line options + :param looper.models.AttributeDict tools: binding between tool name and + value, e.g. 
for tools/resources used by the pipeline + :param bool read2: if True, do not deduplicate and do not retain + intermediate files + :param str fq_file: path to FASTQ file + :param str outfolder: path to output directory for the pipeline + :return str: command to trim adapter trimmed files + """ + # Need undeduplicated results for complexity calculation + sname = args.sample_name # for concise code + + fastq_folder = os.path.join(outfolder, "fastq") + if read2: + noadap_fastq = os.path.join(fastq_folder, sname + "_R2_noadap.fastq") + trimmed_fastq = os.path.join(fastq_folder, sname + "_R2_trimmed.fastq") + else: + noadap_fastq = os.path.join(fastq_folder, sname + "_R1_noadap.fastq") + trimmed_fastq = os.path.join(fastq_folder, sname + "_R1_processed.fastq") + + fastp_folder = os.path.join(outfolder, "fastp") + if read2: + umi_report = os.path.join(fastp_folder, sname + "_R2_rmUmi.html") + umi_json = os.path.join(fastp_folder, sname + "_R2_rmUmi.json") + else: + umi_report = os.path.join(fastp_folder, sname + "_R1_rmUmi.html") + umi_json = os.path.join(fastp_folder, sname + "_R1_rmUmi.json") + + # Check quality encoding for use with FastX_Tools + if args.trimmer == "fastx": + encoding = _guess_encoding(fq_file) + + if args.adapter == "fastp": + # Remove UMI and specify location of UMI + # Still requires seqtk for reverse complementation + if int(args.umi_len) > 0: + trim_cmd_chunks = [ + tools.fastp, + ("--thread", str(pm.cores)), + ("-i", noadap_fastq), + "--stdout", + "--umi", + ("--umi_loc", "read1"), + ("--umi_len", args.umi_len), + ("--html", umi_report), + ("--json", umi_json) + ] + + if int(args.max_len) > 0: + # Trim to max length if specified + trim_cmd_chunks.extend([ "|", (tools.seqtk, "trimfq") - ] - if args.max_len != -1: - trim_cmd_chunks_R2.extend([ - ("-L", args.max_len) - ]) - trim_cmd_chunks_R2.extend(["-"]) - if args.runon.lower() == "gro": - trim_cmd_chunks_R2.extend([ - (">", trimmed_fastq_R2) - ]) - else: - trim_cmd_chunks_R2.extend([ - "|", - 
(tools.seqtk, "seq"), - ("-r", "-"), - (">", trimmed_fastq_R2) - ]) + ("-L", args.max_len), + "-" + ]) + + elif int(args.max_len) > 0: + trim_cmd_chunks = [ + (tools.seqtk, "trimfq"), + ("-L", args.max_len), + noadap_fastq + ] + else: + trim_cmd_chunks = [] + + if trim_cmd_chunks: + # Reverse complement for PRO-seq + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5) + ]) + if args.protocol.lower() in RUNON_SOURCE_PRO: + trim_cmd_chunks.extend([("-r", "-")]) else: - if args.complexity and args.umi_len > 0: - trim_cmd_chunks = [ - tools.fastp, - ("--thread", str(pm.cores)) - ] - #trim_cmd_chunks_nodedup = trim_cmd_chunks.copy() #python3 - trim_cmd_chunks_nodedup = list(trim_cmd_chunks) - trim_cmd_chunks_nodedup.extend([ - ("-i", noadap_fastq), - "--stdout", - "--umi", - ("--umi_loc", "read1"), - ("--umi_len", args.umi_len), - ("--html", umi_report), - ("--json", umi_json), - "|", - (tools.seqtk, "trimfq") - ]) - if args.max_len != -1: - trim_cmd_chunks_nodedup.extend([ - ("-L", args.max_len) - ]) - trim_cmd_chunks_nodedup.extend(["-"]) - if args.runon.lower() == "gro": - trim_cmd_chunks_nodedup.extend([ - (">", trimmed_fastq) - ]) - else: - trim_cmd_chunks_nodedup.extend([ - "|", - (tools.seqtk, "seq"), - ("-r", "-"), - (">", trimmed_fastq) - ]) - trim_cmd_chunks.extend([ - ("-i", dedup_fastq), - "--stdout", - "--umi", - ("--umi_loc", "read1"), - ("--umi_len", args.umi_len), - ("--html", umi_report), - ("--json", umi_json), - "|", - (tools.seqtk, "trimfq") - ]) - if args.max_len != -1: - trim_cmd_chunks.extend([ - ("-L", args.max_len) - ]) - trim_cmd_chunks.extend(["-"]) - if args.runon.lower() == "gro": - trim_cmd_chunks.extend([ - (">", processed_fastq) - ]) - else: - trim_cmd_chunks.extend([ - "|", - (tools.seqtk, "seq"), - ("-r", "-"), - (">", processed_fastq) - ]) - else: - trim_cmd_chunks = [ - tools.fastp, - ("--thread", str(pm.cores)), - ("--stdin", "--stdout"), - "--umi", - ("--umi_loc", "read1"), - ("--umi_len", args.umi_len), - ("--html", 
umi_report), - ("--json", umi_json), - "|", - (tools.seqtk, "trimfq") - ] - if args.max_len != -1: - trim_cmd_chunks.extend([ - ("-L", args.max_len) - ]) - trim_cmd_chunks.extend(["-"]) - if args.runon.lower() == "gro": - trim_cmd_chunks.extend([ - (">", processed_fastq) - ]) - else: - trim_cmd_chunks.extend([ - "|", - (tools.seqtk, "seq"), - ("-r", "-"), - (">", processed_fastq) - ]) + trim_cmd_chunks.extend(["-"]) - else: - if args.trimmer == "seqtk": - if read2: - trim_cmd_chunks_R2 = [ - tools.seqtk, - "trimfq", - ("-e", str(args.umi_len)) - ] - trim_cmd_chunks_R2.extend(["-"]) - if args.runon.lower() == "gro": - trim_cmd_chunks_R2.extend([ - (">", trimmed_fastq_R2) - ]) - else: - trim_cmd_chunks_R2.extend([ - "|", - (tools.seqtk, "seq"), - ("-r", "-"), - (">", trimmed_fastq_R2) - ]) + trim_cmd_chunks.extend([(">", trimmed_fastq)]) + + else: + # If no UMI removal or read trimming, just reverse complement + if args.protocol.lower() in RUNON_SOURCE_PRO: + trim_cmd_chunks.extend([ + (tools.seqtk, "seq"), + ("-L", 5), + ("-r", noadap_fastq), + (">", trimmed_fastq) + ]) + # Otherwise just make sure to remove too short reads else: trim_cmd_chunks = [ - tools.seqtk, - "trimfq", - ("-b", str(args.umi_len)) + (tools.seqtk, "seq"), + ("-L", 5), + noadap_fastq, + (">", trimmed_fastq) ] - if args.max_len != -1: - trim_cmd_chunks.extend([ - ("-L", str(args.max_len)) - ]) - if args.complexity and args.umi_len > 0: - #trim_cmd_chunks_nodedup = trim_cmd_chunks.copy() #python3 - trim_cmd_chunks_nodedup = list(trim_cmd_chunks) - trim_cmd_chunks_nodedup.extend([noadap_fastq]) - if args.runon.lower() == "gro": - trim_cmd_chunks_nodedup.extend([ - (">", trimmed_fastq) - ]) - else: - trim_cmd_chunks_nodedup.extend([ - "|", - (tools.seqtk, "seq"), - ("-r", "-"), - (">", trimmed_fastq) - ]) - trim_cmd_chunks.extend([dedup_fastq]) - if args.runon.lower() == "gro": - trim_cmd_chunks.extend([ - (">", processed_fastq) - ]) - else: - trim_cmd_chunks.extend([ - "|", - (tools.seqtk, "seq"), 
- ("-r", "-"), - (">", processed_fastq) - ]) - else: - trim_cmd_chunks.extend(["-"]) - if args.runon.lower() == "gro": - trim_cmd_chunks.extend([ - (">", processed_fastq) - ]) - else: - trim_cmd_chunks.extend([ - "|", - (tools.seqtk, "seq"), - ("-r", "-"), - (">", processed_fastq) - ]) - - elif args.trimmer == "fastx": - trim_tool = tools.fastx + "_trimmer" - rc_tool = tools.fastx + "_reverse_complement" - trim_cmd_chunks = [trim_tool] + elif args.trimmer == "seqtk": + # Remove UMI blindly by position + trim_cmd_chunks = [ + tools.seqtk, + "trimfq", + ("-b", str(args.umi_len)) + ] + + # Trim tp max length if specified + if int(args.max_len) > 0: + trim_cmd_chunks.extend([ + ("-L", str(args.max_len)) + ]) + + trim_cmd_chunks.extend([noadap_fastq]) + if args.protocol.lower() in RUNON_SOURCE_GRO: + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5), + "-", + (">", trimmed_fastq) + ]) + else: + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5), + ("-r", "-"), + (">", trimmed_fastq) + ]) + elif args.trimmer == "fastx": + trim_tool = tools.fastx + "_trimmer" + rc_tool = tools.fastx + "_reverse_complement" + trim_cmd_chunks = [trim_tool] + if encoding == "Illumina-1.8": + trim_cmd_chunks.extend([ + ("-Q", str(33)) + ]) + + # Remove UMI blindly by position only + trim_cmd_chunks.extend([ + ("-f", str(int(float(args.umi_len)) + 1)) + ]) + + # Trim tp max length if specified + if int(args.max_len) > 0: + trim_cmd_chunks.extend([ + ("-l", (str(int(float(args.max_len)) + int(float(args.umi_len))))) + ]) + + # Need undeduplicated results for complexity calculation + trim_cmd_chunks.extend([ + ("-i", noadap_fastq) + ]) + if args.protocol.lower() in RUNON_SOURCE_GRO: + trim_cmd_chunks.extend([ + ("-o", trimmed_fastq) + ]) + else: if encoding == "Illumina-1.8": trim_cmd_chunks.extend([ ("-Q", str(33)) ]) trim_cmd_chunks.extend([ - ("-f", str(int(float(args.umi_len)) + 1)) + ("-o", trimmed_fastq) + ]) + else: + # Default to seqtk + # Remove UMI 
blindly by position only + trim_cmd_chunks = [ + tools.seqtk, + "trimfq", + ("-b", str(args.umi_len)) + ] + + # Trim to max length if specified + if int(args.max_len) > 0: + trim_cmd_chunks.extend([ + ("-L", str(args.max_len)) + ]) + + trim_cmd_chunks.extend([noadap_fastq]) + if args.protocol.lower() in RUNON_SOURCE_GRO: + # Do not reverse complement + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5), + "-", + (">", trimmed_fastq) + ]) + else: + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5), + ("-r", "-"), + (">", trimmed_fastq) + ]) + + trim_cmd = build_command(trim_cmd_chunks) + pm.debug("trim_cmd_nodedup: {}".format(build_command(trim_cmd_chunks))) + + return trim_cmd + + +def _trim_pipes(args, tools, read2, fq_file, outfolder): + """ + A helper function to build a command for read trimming using pipes. + + :param argparse.Namespace args: binding between option name and argument, + e.g. from parsing command-line options + :param looper.models.AttributeDict tools: binding between tool name and + value, e.g. 
for tools/resources used by the pipeline + :param bool read2: if True, do not deduplicate and do not retain + intermediate files + :param str fq_file: path to FASTQ file + :param str outfolder: path to output directory for the pipeline + :return str: command to trim adapter trimmed and deduplicated reads + """ + + # Only call this when NOT args.complexity or NOT args.umi_len > 0 + sname = args.sample_name # for concise code + + fastq_folder = os.path.join(outfolder, "fastq") + if read2: + noadap_fastq = os.path.join(fastq_folder, sname + "_R2_noadap.fastq") + processed_fastq = os.path.join(fastq_folder, sname + "_R2_trimmed.fastq") + else: + noadap_fastq = os.path.join(fastq_folder, sname + "_R1_noadap.fastq") + processed_fastq = os.path.join(fastq_folder, sname + "_R1_processed.fastq") + + fastp_folder = os.path.join(outfolder, "fastp") + if read2: + umi_report = os.path.join(fastp_folder, sname + "_R2_rmUmi.html") + umi_json = os.path.join(fastp_folder, sname + "_R2_rmUmi.json") + else: + umi_report = os.path.join(fastp_folder, sname + "_R1_rmUmi.html") + umi_json = os.path.join(fastp_folder, sname + "_R1_rmUmi.json") + + # Check quality encoding for use with FastX_Tools + if args.trimmer == "fastx": + encoding = _guess_encoding(fq_file) + + if args.adapter == "fastp": + # There are no intermediate files, just pipes + # Remove UMI + if args.umi_len > 0: + trim_cmd_chunks = [ + tools.fastp, + ("--thread", str(pm.cores)), + ("--stdin", "--stdout"), + "--umi", + ("--umi_loc", "read1"), + ("--umi_len", args.umi_len), + ("--html", umi_report), + ("--json", umi_json), + "|", + (tools.seqtk, "trimfq") + ] + + # if read2: + # # still 'read1' because it's being processed as a SE file + # trim_cmd_chunks.extend([("--umi_loc", "read1")]) + # else: + # trim_cmd_chunks.extend([("--umi_loc", "read1")]) + + # trim_cmd_chunks.extend([ + # ("--umi_len", args.umi_len), + # ("--html", umi_report), + # ("--json", umi_json), + # "|", + # (tools.seqtk, "trimfq") + # ]) + + # Trim to 
max length if specified + if int(args.max_len) > 0: + trim_cmd_chunks.extend([("-L", args.max_len)]) + + trim_cmd_chunks.extend(["-"]) + + # Do not reverse complement for GRO-seq + if args.protocol.lower() in RUNON_SOURCE_GRO: + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5), + "-", + (">", processed_fastq) + ]) + else: + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5), + ("-r", "-"), + (">", processed_fastq) + ]) + elif int(args.max_len) > 0: + # No UMI, but still trim max length + trim_cmd_chunks = [ + (tools.seqtk, "trimfq"), + ("-L", args.max_len), + "-" + ] + # Do not reverse complement for GRO-seq + if args.protocol.lower() in RUNON_SOURCE_GRO: + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5), + "-", + (">", processed_fastq) + ]) + else: + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5), + ("-r", "-"), + (">", processed_fastq) + ]) + else: + # No UMI and no trimming + if args.protocol.lower() in RUNON_SOURCE_PRO: + trim_cmd_chunks = [ + (tools.seqtk, "seq"), + ("-L", 5), + ("-r", noadap_fastq), + (">", processed_fastq) + ] + else: + trim_cmd_chunks = [] + # if not args.complexity and args.umi_len > 0 retain intermediate files + elif args.trimmer == "seqtk": + trim_cmd_chunks = [ + tools.seqtk, + "trimfq" + ] + + if read2: + trim_cmd_chunks.extend([("-e", str(args.umi_len))]) + else: + trim_cmd_chunks.extend([("-b", str(args.umi_len))]) + if int(args.max_len) > 0: + trim_cmd_chunks.extend([("-L", str(args.max_len))]) + + trim_cmd_chunks.extend(["-"]) + + # Do not reverse complement for GRO-seq + if args.protocol.lower() in RUNON_SOURCE_GRO: + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5), + "-", + (">", processed_fastq) + ]) + else: + trim_cmd_chunks.extend([ + "|", + tools.seqtk, + ("seq", "-r"), + ("-L", 5), + ("-", ">"), + processed_fastq + ]) + elif args.trimmer == "fastx": + trim_tool = tools.fastx + "_trimmer" + rc_tool = tools.fastx + 
"_reverse_complement" + trim_cmd_chunks = [trim_tool] + + if encoding == "Illumina-1.8": + trim_cmd_chunks.extend([("-Q", str(33))]) + + if read2: + trim_cmd_chunks.extend([("-t", str(int(float(args.umi_len))))]) + else: + trim_cmd_chunks.extend([("-f", str(int(float(args.umi_len)) + 1))]) + if int(args.max_len) > 0: + trim_cmd_chunks.extend([ + ("-l", (str(int(float(args.max_len)) + int(float(args.umi_len))))) + ]) + + # Do not reverse complement for GRO-seq + if args.protocol.lower() in RUNON_SOURCE_GRO: + trim_cmd_chunks.extend([("-o", processed_fastq)]) + else: + trim_cmd_chunks.extend([("|", rc_tool)]) + if encoding == "Illumina-1.8": + trim_cmd_chunks.extend([("-Q", str(33))]) + trim_cmd_chunks.extend([("-o", processed_fastq)]) + else: + # Default to seqtk + trim_cmd_chunks = [ + tools.seqtk, + "trimfq" + ] + + if read2: + trim_cmd_chunks.extend([("-e", str(args.umi_len))]) + else: + trim_cmd_chunks.extend([("-b", str(args.umi_len))]) + if int(args.max_len) > 0: + trim_cmd_chunks.extend([("-L", str(args.max_len))]) + + trim_cmd_chunks.extend(["-"]) + + # Do not reverse complement for GRO-seq + if args.protocol.lower() in RUNON_SOURCE_GRO: + trim_cmd_chunks.extend([ + "|", + (tools.seqtk, "seq"), + ("-L", 5), + "-", + (">", processed_fastq) + ]) + else: + trim_cmd_chunks.extend([ + "|", + tools.seqtk, + ("seq", "-r"), + ("-L", 5), + ("-", ">"), + processed_fastq ]) - if args.max_len != -1: - trim_cmd_chunks.extend([ - ("-l", (str(int(float(args.max_len)) + int(float(args.umi_len))))) - ]) - if read2: - trim_cmd_chunks_R2 = [trim_tool] - if encoding == "Illumina-1.8": - trim_cmd_chunks_R2.extend([ - ("-Q", str(33)) - ]) - trim_cmd_chunks_R2.extend([ - ("-t", str(int(float(args.umi_len)))) - ]) - if args.runon.lower() == "gro": - trim_cmd_chunks_R2.extend([ - ("-o", trimmed_fastq_R2) - ]) - else: - trim_cmd_chunks_R2.extend([ - ("|", rc_tool) - ]) - if encoding == "Illumina-1.8": - trim_cmd_chunks_R2.extend([ - ("-Q", str(33)) - ]) - trim_cmd_chunks_R2.extend([ 
- ("-o", trimmed_fastq_R2) - ]) - else: - if args.complexity and args.umi_len > 0: - # Need undeduplicated results for complexity calculation - #trim_cmd_chunks_nodedup = trim_cmd_chunks.copy() #python3 - trim_cmd_chunks_nodedup = list(trim_cmd_chunks) - trim_cmd_chunks_nodedup.extend([ - ("-i", noadap_fastq) - ]) - trim_cmd_chunks.extend([ - ("-i", dedup_fastq) - ]) - if args.runon.lower() == "gro": - trim_cmd_chunks.extend([ - ("-o", processed_fastq) - ]) - trim_cmd_chunks_nodedup.extend([ - ("-o", trimmed_fastq) - ]) - else: - trim_cmd_chunks.extend([ - ("|", rc_tool) - ]) - if encoding == "Illumina-1.8": - trim_cmd_chunks.extend([ - ("-Q", str(33)) - ]) - trim_cmd_chunks_nodedup.extend([ - ("-Q", str(33)) - ]) - trim_cmd_chunks.extend([ - ("-o", processed_fastq) - ]) - trim_cmd_chunks_nodedup.extend([ - ("-o", trimmed_fastq) - ]) + trim_cmd = build_command(trim_cmd_chunks) + pm.debug("trim_pipes_cmd: {}".format(build_command(trim_cmd_chunks))) + pm.debug("trim_pipes_cmd read2 status: {}".format(read2)) - else: - # Default to seqtk - if read2: - trim_cmd_chunks_R2 = [ - tools.seqtk, - "trimfq", - ("-e", str(args.umi_len)) - ] - trim_cmd_chunks_R2.extend(["-"]) - if args.runon.lower() == "gro": - trim_cmd_chunks_R2.extend([ - (">", trimmed_fastq_R2) - ]) - else: - trim_cmd_chunks_R2.extend([ - "|", - (tools.seqtk, "seq"), - ("-r", "-"), - (">", trimmed_fastq_R2) - ]) - else: - trim_cmd_chunks = [ - tools.seqtk, - "trimfq", - ("-b", str(args.umi_len)) - ] - if args.max_len != -1: - trim_cmd_chunks.extend([ - ("-L", str(args.max_len)) - ]) - if args.complexity and args.umi_len > 0: - #trim_cmd_chunks_nodedup = trim_cmd_chunks.copy() #python3 - trim_cmd_chunks_nodedup = list(trim_cmd_chunks) - trim_cmd_chunks_nodedup.extend([noadap_fastq]) - if args.runon.lower() == "gro": - trim_cmd_chunks_nodedup.extend([ - (">", trimmed_fastq) - ]) - else: - trim_cmd_chunks_nodedup.extend([ - "|", - (tools.seqtk, "seq"), - ("-r", "-"), - (">", trimmed_fastq) - ]) - 
trim_cmd_chunks.extend([dedup_fastq]) - if args.runon.lower() == "gro": - trim_cmd_chunks.extend([ - (">", processed_fastq) - ]) - else: - trim_cmd_chunks.extend([ - "|", - (tools.seqtk, "seq"), - ("-r", "-"), - (">", processed_fastq) - ]) - else: - trim_cmd_chunks.extend(["-"]) - if args.runon.lower() == "gro": - trim_cmd_chunks.extend([ - (">", processed_fastq) - ]) - else: - trim_cmd_chunks.extend([ - "|", - (tools.seqtk, "seq"), - ("-r", "-"), - (">", processed_fastq) - ]) + return trim_cmd - if read2: - trim_cmd2 = build_command(trim_cmd_chunks_R2) + +def _process_fastq(args, tools, res, read2, fq_file, outfolder): + """ + A helper function to prepare read files for downstream processing. + + :param argparse.Namespace args: binding between option name and argument, + e.g. from parsing command-line options + :param looper.models.AttributeDict res: binding between resources and + value, e.g. for resources used by the pipeline + :param looper.models.AttributeDict tools: binding between tool name and + value, e.g. for tools used by the pipeline + :param bool read2: if True, do not deduplicate and do not retain + intermediate files + :param str fq_file: path to FASTQ file + :param str outfolder: path to output directory for the pipeline + :return (str, str): pair (R1, R2) of paths to FASTQ files + """ + # Create names for processed FASTQ files. 
+ fastq_folder = os.path.join(outfolder, "fastq") + fastqc_folder = os.path.join(outfolder, "fastqc") + fastp_folder = os.path.join(outfolder, "fastp") + + sname = args.sample_name # for concise code + + noadap_fq1 = os.path.join(fastq_folder, sname + "_R1_noadap.fastq") + noadap_fq2 = os.path.join(fastq_folder, sname + "_R2_noadap.fastq") + dedup_fq = os.path.join(fastq_folder, sname + "_R1_dedup.fastq") + trimmed_fq1 = os.path.join(fastq_folder, sname + "_R1_trimmed.fastq") + trimmed_fq2 = os.path.join(fastq_folder, sname + "_R2_trimmed.fastq") + trimmed_dups_fq2 = os.path.join(fastq_folder, sname + "_R2_trimmed_dups.fastq") + processed_fastq = os.path.join(fastq_folder, sname + "_R1_processed.fastq") + + if args.adapter == "cutadapt": + cutadapt_folder = os.path.join(outfolder, "cutadapt") + if read2: + cutadapt_report = os.path.join(cutadapt_folder, + sname + "_R2_cutadapt.txt") + else: + cutadapt_report = os.path.join(cutadapt_folder, + sname + "_R1_cutadapt.txt") + adapter_report = cutadapt_report + else: + adapter_report = os.path.join(fastqc_folder, + sname + "_R1_rmAdapter.txt") + + fastp_pfx = os.path.join(fastp_folder, sname + "_R1_fastp_adapter") + fastp_report_txt = fastp_pfx + ".txt" + fastp_report_html = fastp_pfx + ".html" + + adapter_command = _remove_adapters(args, tools, res, read2, fq_file, outfolder) + pm.debug("Adapter command: {}".format(adapter_command)) + pm.debug("Read2 status: {}".format(read2)) + + # To plot fragment sizes requires keeping intermediate files + if not args.complexity and args.umi_len > 0: + deduplicate_command = _deduplicate(args, tools, fq_file, outfolder) + pm.debug("Dedup command: {}".format(deduplicate_command)) + trim_command = _trim_adapter_files(args, tools, read2, fq_file, outfolder) + trim_command2 = _trim_deduplicated_files(args, tools, fq_file, outfolder) else: - trim_cmd1 = build_command(trim_cmd_chunks) - if args.complexity and args.umi_len > 0: - trim_cmd_nodedup = build_command(trim_cmd_chunks_nodedup) + 
trim_command = _trim_adapter_files(args, tools, read2, fq_file, outfolder) + + # original method included option to not retain intermediates + # if read2: + # trim_command = _trim_pipes(args, tools, True, fq_file, outfolder) + # elif not args.complexity and args.umi_len > 0: + # deduplicate_command = _deduplicate(args, tools, fq_file, outfolder) + # pm.debug("Dedup command: {}".format(deduplicate_command)) + # trim_command = _trim_adapter_files(args, tools, fq_file, outfolder) + # trim_command2 = _trim_deduplicated_files(args, tools, fq_file, outfolder) + # else: + # trim_command = _trim_pipes(args, tools, False, fq_file, outfolder) def report_fastq(): """ Report QC metrics on intermediate steps of fastq file preparation """ - if args.adapter == "fastp": - adapter_term = "reads with adapter trimmed:" - too_short_term = "reads failed due to too short:" - total_bases_term = "total bases:" - - ac_cmd = ("grep '" + adapter_term + "' " + - adapter_report + " | head -n 1 | awk '{print $NF}'") - ts_cmd = ("grep '" + too_short_term + "' " + - adapter_report + " | head -n 1 | awk '{print $NF}'") - bases = ("grep '" + total_bases_term + "' " + - adapter_report + " | head -n 1 | awk '{print $NF}'") - adapter_bases = ("grep 'bases trimmed due to adapters:' " + - adapter_report + " | awk '{print $NF}'") - - pm.report_object("FastP_report", adapter_html) - - elif args.adapter == "cutadapt": + if args.adapter == "cutadapt": + report = cutadapt_report adapter_term = "Reads with adapters:" too_short_term = "Reads that were too short:" total_bases_term = "Total basepairs processed:" ac_cmd = ("grep '" + adapter_term + "' " + - adapter_report + " | awk '{print $(NF-1)}'") + report + " | awk '{print $(NF-1)}'") ts_cmd = ("grep '" + too_short_term + "' " + - adapter_report + " | awk '{print $(NF-1)}'") + report + " | awk '{print $(NF-1)}'") bases = ("grep '" + total_bases_term + "' " + - adapter_report + " | awk '{print $(NF-1)}'") + report + " | awk '{print $(NF-1)}'") adapter_bases = 
("awk '{sum+=$1*$2} END {printf \"%.0f\", sum}' " + - adapter_report) + report) else: # default to fastp + report = fastp_report_txt adapter_term = "reads with adapter trimmed:" too_short_term = "reads failed due to too short:" total_bases_term = "total bases:" ac_cmd = ("grep '" + adapter_term + "' " + - adapter_report + " | head -n 1 | awk '{print $NF}'") + report + " | head -n 1 | awk '{print $NF}'") ts_cmd = ("grep '" + too_short_term + "' " + - adapter_report + " | head -n 1 | awk '{print $NF}'") + report + " | head -n 1 | awk '{print $NF}'") bases = ("grep '" + total_bases_term + "' " + - adapter_report + " | head -n 1 | awk '{print $NF}'") + report + " | head -n 1 | awk '{print $NF}'") adapter_bases = ("grep 'bases trimmed due to adapters:' " + - adapter_report + " | awk '{print $NF}'") + report + " | awk '{print $NF}'") - pm.report_object("FastP_report", adapter_html) + pm.report_object("FastP_report", fastp_report_html) - ac = float(pm.checkprint(ac_cmd).replace(',','')) - pm.report_result("Reads_with_adapter", ac) - total_bases = float(pm.checkprint(bases).replace(',','')) - total_adapter = float(pm.checkprint(adapter_bases).replace(',','')) - pm.report_result("Pct_adapter_contamination", - round(float(total_adapter/total_bases), 2)) + if _itsa_file(report): + ac = float(pm.checkprint(ac_cmd).replace(',','')) + pm.report_result("Reads_with_adapter", ac) + total_bases = float(pm.checkprint(bases).replace(',','')) + total_adapter = float(pm.checkprint(adapter_bases).replace(',','')) + # pm.report_result("Pct_adapter_contamination", + # round(float(total_adapter/total_bases), 2)) + + ts = float(pm.checkprint(ts_cmd).replace(',','')) + pm.report_result("Reads_too_short", ts) + + tr = int(ngstk.count_lines(noadap_fq1).strip()) + dr = int(ngstk.count_lines(dedup_fq).strip()) + dups = max(0, (float(tr)/4 - float(dr)/4)) + pm.report_result("Duplicate_reads", dups) + + pr = int(ngstk.count_lines(processed_fastq).strip()) + pm.report_result("Pct_reads_too_short", 
round(float(ts/pr), 2)) + else: + pm.fail_pipeline("Could not find '{}' to report adapter " + "removal statistics.".format(report)) + + + def plot_fragments(infolder, outfolder): + """ + Plot adapter insertion distribution (from PE data only) + + :param str infolder: path to fastq containing directory + :param str outfolder: path to output directory for functions + """ + # merge short fragment reads + noadap_fq1 = os.path.join(infolder, + args.sample_name + "_R1_noadap.fastq") + noadap_fq2 = os.path.join(infolder, + args.sample_name + "_R2_noadap.fastq") + rep_fq1 = os.path.join(infolder, + args.sample_name + "_R1_noadap.fastq.paired.fq") + rep_fq2 = os.path.join(infolder, + args.sample_name + "_R2_noadap.fastq.paired.fq") + flash_hist = os.path.join(outfolder, args.sample_name + ".hist") + flash_gram = os.path.join(outfolder, args.sample_name + ".histogram") + + tmp = float(pm.get_stat("Raw_reads")) + if tmp: + rr = float(tmp) + else: + rr = 0 + if (rr < 1): + pm.fail_pipeline(RuntimeError("Raw_reads were not reported. 
" + "Check output ({})".format(param.outfolder))) + + cmd1 = (tools.fastqpair + " -t " + str(int(0.9*rr)) + " " + + noadap_fq1 + " " + noadap_fq2) + ziptool = "pigz" if ngstk.check_command("pigz") else "gzip" + cmd2 = (tools.flash + " -q -t " + str(pm.cores) + + " --compress-prog=" + ziptool + " --suffix=gz " + + rep_fq1 + " " + rep_fq2 + " -o " + args.sample_name + + " -d " + outfolder) + pm.run([cmd1, cmd2], [flash_hist, flash_gram]) + + + pm.timestamp("### Plot adapter insertion distribution") + + degradation_pdf = os.path.join(outfolder, + args.sample_name + "_adapter_insertion_distribution.pdf") + degradation_png = os.path.join(outfolder, + args.sample_name + "_adapter_insertion_distribution.png") + cmd = (tools.Rscript + " " + tool_path("PEPPRO.R") + + " adapt -i " + flash_hist + " -o " + outfolder) + if int(args.umi_len) > 0: + cmd += (" -u " + str(args.umi_len)) + umi_len = args.umi_len + else: + umi_len = 0 + pm.run(cmd, degradation_pdf, nofail=True) + pm.report_object("Adapter insertion distribution", degradation_pdf, + anchor_image=degradation_png) + + if not pm.get_stat('Peak_adapter_insertion_size') or args.new_start: + # Report the peak insertion size + cmd = ("awk 'NR>2 {print prev} {prev=$0}' " + flash_hist + + " | awk 'BEGIN{max= 0; max_len=0; len=0}{if ($2>0+max)" + + " {max=$2; len=$1}; max_len=$1} END{print len-" + + str(umi_len) + "}'") + adapter_peak = pm.checkprint(cmd) + if adapter_peak: + ap = int(adapter_peak) + pm.report_result("Peak_adapter_insertion_size", ap) + + # Report the degradation ratio + if not pm.get_stat('Degradation_ratio') or args.new_start: + pm.timestamp("### Calculating degradation ratio") + + cmd = ("awk '{ if (($1-" + str(umi_len) + ") == 10) {status = 1}} " + + "END {if (status) {print status} else {print 0}}' " + + flash_hist) + degraded_lower = pm.checkprint(cmd) + cmd = ("awk '{ if (($1-" + str(umi_len) + ") == 20) {status = 1}} " + + "END {if (status) {print status} else {print 0}}' " + + flash_hist) + 
degraded_upper = pm.checkprint(cmd) + cmd = ("awk '{ if (($1-" + str(umi_len) + ") == 30) {status = 1}} " + + "END {if (status) {print status} else {print 0}}' " + + flash_hist) + intact_lower = pm.checkprint(cmd) + cmd = ("awk '{ if (($1-" + str(umi_len) + ") == 40) {status = 1}} " + + "END {if (status) {print status} else {print 0}}' " + + flash_hist) + intact_upper = pm.checkprint(cmd) + + if degraded_lower: + dl = int(degraded_lower) + if dl == 1: + dl = 10 + else: + cmd = ("awk 'NR==1 {print ($1-" + str(umi_len) + ")}' " + flash_hist) + degraded_lower = pm.checkprint(cmd) + dl = int(degraded_lower) if degraded_lower else 1 + if dl < 1: + dl = 1 + + if degraded_upper: + du = int(degraded_upper) + if du == 1: + du = 20 + else: + du = int(degraded_lower) + 10 - ts = float(pm.checkprint(ts_cmd).replace(',','')) - pm.report_result("Reads_too_short", ts) + if intact_upper: + iu = int(intact_upper) + if iu == 1: + iu = 40 + else: + cmd = ("awk 'END {print ($1-" + str(umi_len) + ")}' " + flash_hist) + intact_upper = pm.checkprint(cmd) + dl = int(intact_upper) if intact_upper else 40 + + if intact_lower: + il = int(intact_lower) + if il == 1: + il = 30 + else: + il = int(intact_upper) - 10 + if il < 1: + il = 30 + + cmd = ("awk '($1 <= " + str(du) + " && $1 >= " + str(dl) + + "){degradedSum += $2}; " + "($1 >= " + + str(il) + " && $1 <= " + str(iu) + + "){intactSum += $2} END {if (intactSum < 1) " + + "{intactSum = 1} print degradedSum/intactSum}' " + + flash_hist) + degradation_ratio = pm.checkprint(cmd) + if degradation_ratio: + dr = float(degradation_ratio) + pm.report_result("Degradation_ratio", round(dr, 4)) - tr = int(ngstk.count_lines(noadap_fastq).strip()) - dr = int(ngstk.count_lines(dedup_fastq).strip()) - dups = max(0, (float(tr)/4 - float(dr)/4)) - pm.report_result("Duplicate_reads", dups) # Put it all together if read2: - process_fastq_cmd2 = build_command([ - adapter_cmd, "|", trim_cmd2]) - #print("process_fastq_cmd2: {}".format(process_fastq_cmd2)) - 
pm.run(process_fastq_cmd2, trimmed_fastq_R2) - cp_cmd = ("cp " + trimmed_fastq_R2 + " " + trimmed_dups_fastq_R2) - pm.run(cp_cmd, trimmed_dups_fastq_R2) - return trimmed_fastq_R2, trimmed_dups_fastq_R2 - else: - if args.complexity and args.umi_len > 0: - pm.run([adapter_cmd, dedup_cmd, trim_cmd_nodedup], - trimmed_fastq, follow=report_fastq) - pm.run(trim_cmd1, processed_fastq, - follow=ngstk.check_trim(processed_fastq, False, None)) - pm.clean_add(noadap_fastq) - pm.clean_add(dedup_fastq) - pm.clean_add(trimmed_fastq) - return processed_fastq, trimmed_fastq + pm.run([adapter_command, trim_command], trimmed_fq2, + follow=ngstk.check_trim(trimmed_fq2, False, None, + fastqc_folder=fastqc_folder)) + if args.adapter == "cutadapt": + output_folder = os.path.join(outfolder, "cutadapt") else: - process_fastq_cmd = build_command([ - adapter_cmd, "|", dedup_cmd, "|", trim_cmd1]) - pm.run(process_fastq_cmd, processed_fastq, - follow=ngstk.check_trim(processed_fastq, False, None)) - return processed_fastq + output_folder = os.path.join(outfolder, "fastp") + cp_cmd = ("cp " + trimmed_fq2 + " " + trimmed_dups_fq2) + pm.run(cp_cmd, trimmed_dups_fq2, + follow=plot_fragments(fastq_folder, output_folder)) + return trimmed_fq2, trimmed_dups_fq2 + elif not args.complexity and args.umi_len > 0: + # This trim command DOES need the adapter file... + pm.debug("\ntrim_command1: {} +\n {}\n".format(adapter_command, trim_command)) + pm.run([adapter_command, trim_command], processed_fastq, + follow=ngstk.check_trim(processed_fastq, False, None, + fastqc_folder=fastqc_folder)) + # This needs to produce the trimmed_fastq file + # TODO: trim_command2 doesn't need to produce fastp html files and json files... + # TODO: if I'm using seqkit rmdup here, I DON'T NEED the other fastp!!! 
umi command + pm.debug("\ntrim_command2: {} +\n {}\n".format(deduplicate_command, trim_command2)) + pm.run([deduplicate_command, trim_command2], + trimmed_fq1, follow=report_fastq) + pm.clean_add(noadap_fq1) + pm.clean_add(dedup_fq) + pm.clean_add(trimmed_fq1) + return processed_fastq, trimmed_fq1 + else: + pm.debug("\nELSE: trim_command: {} + {}\n".format(adapter_command, trim_command)) + pm.run([adapter_command, trim_command], processed_fastq, + follow=ngstk.check_trim(processed_fastq, False, None, + fastqc_folder=fastqc_folder)) + return processed_fastq + + # Put it all together (original included option to not retain intermediates) + # if read2: + # # cutadapt directs its report to stderr if the command lacks + # # a -o and the actual reads are directed to stdout. + # process_fastq_cmd2 = build_command([ + # "(", adapter_command, "|", trim_command, ") 2> ", adapter_report]) + # pm.debug("process_fastq_cmd2: {}".format(process_fastq_cmd2)) + # pm.run(process_fastq_cmd2, trimmed_fastq_R2) + # cp_cmd = ("cp " + trimmed_fq2 + " " + trimmed_dups_fq2) + # pm.run(cp_cmd, trimmed_dups_faq2) + # return trimmed_fastq_R2, trimmed_dups_fastq_R2 + # else: + # if not args.complexity and args.umi_len > 0: + # # This trim command DOES need the adapter file... + # pm.debug("\ntrim_command1: {} +\n {}\n".format(adapter_command, trim_command)) + # pm.run([adapter_command, trim_command], processed_fastq, + # follow=ngstk.check_trim(processed_fastq, False, None, + # fastqc_folder=fastqc_folder)) + # # This needs to produce the trimmed_fastq file + # # TODO: trim_command2 doesn't need to produce fastp html files and json files... + # # TODO: if I'm using seqkit rmdup here, I DON'T NEED the other fastp!!! 
umi command + # pm.debug("\ntrim_command2: {} +\n {}\n".format(deduplicate_command, trim_command2)) + # pm.run([deduplicate_command, trim_command2], + # trimmed_fastq, follow=report_fastq) + # pm.clean_add(noadap_fq1) + # pm.clean_add(dedup_fq) + # pm.clean_add(trimmed_fq1) + # return processed_fastq, trimmed_fq1 + # else: + # pm.debug("\nELSE: trim_command: {} + {}\n".format(adapter_command, trim_command)) + # process_fastq_cmd = build_command([ + # "(", adapter_command, "|", trim_command, ") 2> ", adapter_report]) + # pm.run(process_fastq_cmd, processed_fastq, + # follow=ngstk.check_trim(processed_fastq, False, None, + # fastqc_folder=fastqc_folder)) + # return processed_fastq def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, @@ -1074,8 +1380,9 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, out_fastq_r1 = out_fastq_pre + '_unmap_R1.fq' out_fastq_r2 = out_fastq_pre + '_unmap_R2.fq' - out_fastq_r1_gz = out_fastq_r1 + '.gz' - out_fastq_r2_gz = out_fastq_r2 + '.gz' + # Use the undeduplicated even for duplicates on re-runs + out_fastq_r1_gz = out_fastq_pre + '_unmap_R1.fq.gz' + out_fastq_r2_gz = out_fastq_pre + '_unmap_R2.fq.gz' if useFIFO and paired and not args.keep: if dups: @@ -1097,7 +1404,7 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, else: out_fastq_tmp = out_fastq_pre + '_unmap.fq' - out_fastq_tmp_gz = out_fastq_tmp + ".gz" + out_fastq_tmp_gz = out_fastq_pre + '_unmap.fq.gz' filter_pair = build_command([tools.perl, tool_path("filter_paired_fq.pl"), out_fastq_tmp, @@ -1135,51 +1442,80 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, cmd += " | " + tools.samtools + " sort - -@ 1" # sort output cmd += " -T " + tempdir cmd += " -o " + mapped_bam + cmd += ") 2>&1" else: - cmd += " > /dev/null" - cmd += ") 2>" + summary_file + cmd += " 2>&1 > /dev/null)" + #cmd += ")" + #cmd += ") 2> " + summary_file if paired: if args.keep or not useFIFO: - pm.run([cmd, 
filter_pair], mapped_bam) + # checkprint() doesn't know how to handle targets + # must recreate that effect ourselves + if not _itsa_file(mapped_bam) or args.new_start: + aln_stats = pm.checkprint(cmd) + pm.run(filter_pair, mapped_bam) else: pm.wait = False - pm.run(filter_pair, [summary_file, out_fastq_r2_gz], - container=pm.container) + pm.run(filter_pair, out_fastq_r2_gz) pm.wait = True - pm.run(cmd, [summary_file, out_fastq_r2_gz], - container=pm.container) + if not _itsa_file(out_fastq_r2_gz) or args.new_start: + aln_stats = pm.checkprint(cmd) else: if args.keep: - pm.run(cmd, mapped_bam) + if not _itsa_file(mapped_bam) or args.new_start: + aln_stats = pm.checkprint(cmd) else: # TODO: switch to this once filter_paired_fq works with SE #pm.run(cmd2, summary_file) #pm.run(cmd1, out_fastq_r1) - pm.run(cmd, out_fastq_tmp_gz) + if not _itsa_file(out_fastq_tmp_gz) or args.new_start: + aln_stats = pm.checkprint(cmd) + else: + aln_stats = None pm.clean_add(out_fastq_tmp) if not dups: - cmd = ("grep 'aligned exactly 1 time' " + summary_file + - " | awk '{print $1}'") - align_exact = pm.checkprint(cmd) - if align_exact: - ar = float(align_exact)*2 - else: - ar = 0 - - # report aligned reads - pm.report_result("Aligned_reads_" + assembly_identifier, ar) - try: - # wrapped in try block in case Trimmed_reads is not reported - # in this pipeline. - tr = float(pm.get_stat("Trimmed_reads")) - except: - print("Trimmed reads is not reported.") - else: - res_key = "Alignment_rate_" + assembly_identifier - pm.report_result(res_key, round(float(ar) * 100 / float(tr), 2)) + if not pm.get_stat("Aligned_reads_" + assembly_identifier) or args.new_start: + if aln_stats: + pm.info(aln_stats) # Log alignment statistics + try: + align_exact = re.search(r".*aligned exactly 1 time", aln_stats).group().split()[0] + except AttributeError: + align_exact = None + except: + err_msg = "Unable to determine alignment statistics for {}." 
+ pm.fail_pipeline(RuntimeError(err_msg.format(args.genome_assembly))) + else: + align_exact = None + # cmd = ("grep 'aligned exactly 1 time' " + summary_file + + # " | awk '{print $1}'") + # align_exact = pm.checkprint(cmd) + + if align_exact: + if paired: + ar = float(align_exact)*2 + else: + ar = float(align_exact) + else: + ar = 0 + + # report aligned reads + pm.report_result("Aligned_reads_" + assembly_identifier, ar) + try: + # wrapped in try block in case Trimmed_reads is not reported + # in this pipeline. + tr = float(pm.get_stat("Trimmed_reads")) + except: + print("Trimmed reads is not reported.") + else: + res_key = "Alignment_rate_" + assembly_identifier + if float(ar) > 0: + pm.report_result(res_key, + round(float(ar) * 100 / float(tr), 2)) + else: + pm.report_result(res_key, 0) if paired: unmap_fq1 = out_fastq_r1 @@ -1331,14 +1667,14 @@ def calc_frip(bamfile, ftfile, frip_func, pipeline_manager, """ Calculate the fraction of reads in feature file. - Use the given function and data from an aligned reads file and a called - peaks file, along with a PipelineManager, to calculate. + Use the given function and data from an aligned reads file and a features + file, along with a PipelineManager, to calculate. - :param str peakfile: path to called peaks file + :param str bamfile: path to aligned reads file + :param str ftfile: path to features of interest to calculate overlap :param callable frip_func: how to calculate the fraction of reads in feat; this must accept the path to the aligned reads file and the path to the called peaks file as arguments. 
- :param str bamfile: path to aligned reads file :param pypiper.PipelineManager pipeline_manager: the PipelineManager in use for the pipeline calling this function :param str aligned_reads_key: name of the key from a stats (key-value) file @@ -1352,127 +1688,118 @@ def calc_frip(bamfile, ftfile, frip_func, pipeline_manager, return float(num_in_reads) / float(num_aligned_reads) -def _add_resources(args, res): +def _itsa_file(anyfile): + """ + Helper function to confirm a file exists and is not empty. + + :param str anyfile: path to a file + """ + return(os.path.isfile(anyfile) and os.stat(anyfile).st_size > 0) + + +def _itsa_empty_file(anyfile): + """ + Helper function to confirm a file exists but is empty. + + :param str anyfile: path to a file + """ + return(os.path.isfile(anyfile) and os.stat(anyfile).st_size == 0) + + +def is_gzipped(file_name): + """ + Determine whether indicated file appears to be gzipped. + :param str file_name: Name/path of file to check as gzipped. + :return bool: Whether indicated file appears to be in gzipped format. + """ + _, ext = os.path.splitext(file_name) + return file_name.endswith(".gz") + + +def _add_resources(args, res, asset_dict=None): """ Add additional resources needed for pipeline. :param argparse.Namespace args: binding between option name and argument, e.g. from parsing command-line options :param pm.config.resources res: pipeline manager resources list + :param asset_dict list: list of dictionary of assets to add """ + rgc = RGC(select_genome_config(res.get("genome_config"))) - # REQ - for asset in ["chrom_sizes", BT2_IDX_KEY]: - res[asset] = rgc.get_asset(args.genome_assembly, asset) + key_errors = [] + exist_errors = [] + required_list = [] + + # Check that bowtie2 indicies exist for specified prealignments for reference in args.prealignments: for asset in [BT2_IDX_KEY]: - res[asset] = rgc.get_asset(reference, asset) - - # OPT - msg = "The '{}' asset is not present in your REFGENIE config file." 
- err = "The '{}' asset does not exist." + try: + res[asset] = rgc.get_asset(reference, asset) + except KeyError: + err_msg = "{} for {} is missing from REFGENIE config file." + pm.fail_pipeline(KeyError(err_msg.format(asset, reference))) + except: + err_msg = "{} for {} does not exist." + pm.fail_pipeline(IOError(err_msg.format(asset, reference))) - asset = "tss_annotation" - if args.TSS_name: - res[asset] = os.path.abspath(args.TSS_name) - else: - try: - res[asset] = rgc.get_asset(args.genome_assembly, asset) - except KeyError: - print(msg.format(asset)) - except: - msg = ("Update your REFGENIE config file to include this asset, or " - "point directly to the file using --TSS-name.\n") - print(err.format(asset)) - print(msg) - - asset = "pi_tss" - if args.pi_tss: - res[asset] = os.path.abspath(args.pi_tss) + # Check specified assets + if not asset_dict: + return res, rgc else: - try: - res[asset] = rgc.get_asset(args.genome_assembly, asset) - except KeyError: - print(msg.format(asset)) - except: - msg = ("Update your REFGENIE config file to include this asset, or " - "point directly to the file using --pi-tss.\n") - print(err.format(asset)) - print(msg) - - asset = "pi_body" - if args.pi_body: - res[asset] = os.path.abspath(args.pi_body) - else: - try: - res[asset] = rgc.get_asset(args.genome_assembly, asset) - except KeyError: - print(msg.format(asset)) - except: - msg = ("Update your REFGENIE config file to include this asset, or " - "point directly to the file using --pi-body.\n") - print(err.format(asset)) - print(msg) - - asset = "pre_mRNA_annotation" - if args.pre_name: - res[asset] = os.path.abspath(args.pre_name) - else: - try: - res[asset] = rgc.get_asset(args.genome_assembly, asset) - except KeyError: - print(msg.format(asset)) - except: - msg = ("Update your REFGENIE config file to include this asset, or " - "point directly to the file using --pre-name.\n") - print(err.format(asset)) - print(msg) - - asset = "feat_annotation" - if args.anno_name: - 
res[asset] = os.path.abspath(args.anno_name) - else: - try: - res[asset] = rgc.get_asset(args.genome_assembly, asset) - except KeyError: - print(msg.format(asset)) - except: - msg = ("Update your REFGENIE config file to include this asset, or " - "point directly to the file using --anno-name.\n") - print(err.format(asset)) - print(msg) - - asset = "exon_annotation" - if args.exon_name: - res[asset] = os.path.abspath(args.exon_name) - else: - try: - res[asset] = rgc.get_asset(args.genome_assembly, asset) - except KeyError: - print(msg.format(asset)) - except: - msg = ("Update your REFGENIE config file to include this asset, or " - "point directly to the file using --exon-name.\n") - print(err.format(asset)) - print(msg) - - asset = "intron_annotation" - if args.intron_name: - res[asset] = os.path.abspath(args.intron_name) - else: - try: - res[asset] = rgc.get_asset(args.genome_assembly, asset) - except KeyError: - print(msg.format(asset)) - except: - msg = ("Update your REFGENIE config file to include this asset, or " - "point directly to the file using --intron-name.\n") - print(err.format(asset)) - print(msg) + for item in asset_dict: + pm.debug("item: {}".format(item)) # DEBUG + asset = item["asset_name"] + seek_key = item["seek_key"] or item["asset_name"] + tag = item["tag_name"] or "default" + arg = item["arg"] + user_arg = item["user_arg"] + req = item["required"] + + if arg and hasattr(args, arg) and getattr(args, arg): + res[seek_key] = os.path.abspath(getattr(args, arg)) + else: + try: + pm.debug("{} - {}.{}:{}".format(args.genome_assembly, + asset, + seek_key, + tag)) # DEBUG + res[seek_key] = rgc.get_asset(args.genome_assembly, + asset_name=str(asset), + tag_name=str(tag), + seek_key=str(seek_key)) + except KeyError: + key_errors.append(item) + if req: + required_list.append(item) + except: + exist_errors.append(item) + if req: + required_list.append(item) + + if len(key_errors) > 0 or len(exist_errors) > 0: + pm.info("Some assets are not found. 
You can update your REFGENIE " + "config file or point directly to the file using the noted " + "command-line arguments:") + + if len(key_errors) > 0: + if required_list: + err_msg = "Required assets missing from REFGENIE config file: {}" + pm.fail_pipeline(IOError(err_msg.format(", ".join(["{asset_name}.{seek_key}:{tag_name}".format(**x) for x in required_list])))) + else: + warning_msg = "Optional assets missing from REFGENIE config file: {}" + pm.info(warning_msg.format(", ".join(["{asset_name}.{seek_key}:{tag_name}".format(**x) for x in key_errors]))) + + if len(exist_errors) > 0: + if required_list: + err_msg = "Required assets not existing: {}" + pm.fail_pipeline(IOError(err_msg.format(", ".join(["{asset_name}.{seek_key}:{tag_name} (--{user_arg})".format(**x) for x in required_list])))) + else: + warning_msg = "Optional assets not existing: {}" + pm.info(warning_msg.format(", ".join(["{asset_name}.{seek_key}:{tag_name} (--{user_arg})".format(**x) for x in exist_errors]))) - res.rgc = rgc - return res + return res, rgc ############################################################################### @@ -1499,6 +1826,7 @@ def main(): tools = pm.config.tools param = pm.config.parameters res = pm.config.resources + #sstructure = pm.sample_structure # maybe possible in the future? # Check that the required tools are callable by the pipeline tool_list = [v for k,v in tools.items()] # extract tool list @@ -1525,18 +1853,50 @@ def main(): pm.fail_pipeline(RuntimeError(err_msg)) # Set up reference resource according to genome prefix. 
- res = _add_resources(args, res) + check_list = [ + {"asset_name":"fasta", "seek_key":"chrom_sizes", + "tag_name":"default", "arg":None, "user_arg":None, + "required":True}, + {"asset_name":"fasta", "seek_key":None, + "tag_name":"default", "arg":None, "user_arg":None, + "required":True}, + {"asset_name":BT2_IDX_KEY, "seek_key":None, + "tag_name":"default", "arg":None, "user_arg":None, + "required":True}, + {"asset_name":"refgene_anno", "seek_key":"refgene_tss", + "tag_name":"default", "arg":"TSS_name", "user_arg":"TSS-name", + "required":False}, + {"asset_name":"ensembl_gtf", "seek_key":"ensembl_tss", + "tag_name":"default", "arg":"ensembl_tss", "user_arg":"pi-tss", + "required":False}, + {"asset_name":"ensembl_gtf", "seek_key":"ensembl_gene_body", + "tag_name":"default", "arg":"ensembl_gene_body", "user_arg":"pi-body", + "required":False}, + {"asset_name":"refgene_anno", "seek_key":"refgene_pre_mRNA", + "tag_name":"default", "arg":"pre_name", "user_arg":"pre-name", + "required":False}, + {"asset_name":"feat_annotation", "seek_key":"feat_annotation", + "tag_name":"default", "arg":"anno_name", "user_arg":"anno-name", + "required":False}, + {"asset_name":"refgene_anno", "seek_key":"refgene_exon", + "tag_name":"default", "arg":"exon_name", "user_arg":"exon-name", + "required":False}, + {"asset_name":"refgene_anno", "seek_key":"refgene_intron", + "tag_name":"default", "arg":"intron_name", "user_arg":"intron-name", + "required":False} + ] + res, rgc = _add_resources(args, res, check_list) # Adapter file can be set in the config; if left null, we use a default. 
- # TODO: use this option or just specify directly the adapter sequence as I do now - res.adapters = res.adapters or tool_path("PRO-seq_adapter.fa") + # Expects headers to include >5prime and >3prime + res.adapters = res.adapters or tool_path("adapter.fa") param.outfolder = outfolder # Check that the input file(s) exist before continuing - if os.path.isfile(args.input[0]) and os.stat(args.input[0]).st_size > 0: + if _itsa_file(args.input[0]): print("Local input file: " + args.input[0]) - elif os.path.isfile(args.input[0]) and os.stat(args.input[0]).st_size == 0: + elif _itsa_empty_file(args.input[0]): # The read1 file exists but is empty err_msg = "File exists but is empty: {}" pm.fail_pipeline(IOError(err_msg.format(args.input[0]))) @@ -1546,9 +1906,9 @@ def main(): pm.fail_pipeline(IOError(err_msg.format(args.input[0]))) if args.input2: - if os.path.isfile(args.input2[0]) and os.stat(args.input2[0]).st_size > 0: + if _itsa_file(args.input2[0]): print("Local input file: " + args.input2[0]) - elif os.path.isfile(args.input2[0]) and os.stat(args.input2[0]).st_size == 0: + elif _itsa_empty_file(args.input2[0]): # The read1 file exists but is empty err_msg = "File exists but is empty: {}" pm.fail_pipeline(IOError(err_msg.format(args.input2[0]))) @@ -1569,8 +1929,14 @@ def main(): pm.report_result("Genome", args.genome_assembly) # PRO-seq pipeline - # Each (major) step should have its own subfolder + if args.protocol.lower() in RUNON_SOURCE_GRO: + pm.info("Detected GRO input") + elif args.protocol.lower() in RUNON_SOURCE_PRO: + pm.info("Detected PRO input") + else: + pm.fail_pipeline(RuntimeError("Input protocol must be GRO or PRO.")) + # Each (major) step should have its own subfolder raw_folder = os.path.join(param.outfolder, "raw") fastq_folder = os.path.join(param.outfolder, "fastq") fastqc_folder=os.path.join(param.outfolder, "fastqc") @@ -1583,12 +1949,18 @@ def main(): [args.input, args.input2], raw_folder, args.sample_name) cmd, out_fastq_pre, unaligned_fastq = 
ngstk.input_to_fastq( local_input_files, args.sample_name, args.paired_end, fastq_folder) + + #if not pm.get_stat("Raw_reads") or args.new_start: + # TODO: improve the skipping of these steps on recovery runs + # issue here is that process_fastq is still trying to run + # if we skip this step pm.run(cmd, unaligned_fastq, follow=ngstk.check_fastq( - local_input_files, unaligned_fastq, args.paired_end), - container=pm.container) + local_input_files, unaligned_fastq, args.paired_end)) pm.clean_add(out_fastq_pre + "*.fastq", conditional=True) - print(local_input_files) + + pm.info(local_input_files) + untrimmed_fastq1 = out_fastq_pre + "_R1.fastq" untrimmed_fastq2 = out_fastq_pre + "_R2.fastq" if args.paired_end else None @@ -1596,22 +1968,49 @@ def main(): # Process read files # ############################################################################ pm.timestamp("### FASTQ processing: ") + cutadapt_folder = os.path.join(outfolder, "cutadapt") + cutadapt_report = os.path.join(cutadapt_folder, + args.sample_name + "_R1_cutadapt.txt") + rmUMI_target = os.path.join(fastq_folder, "readname_repaired.flag") + rmUMI_dups_target = os.path.join(fastq_folder, "readname_dups_repaired.flag") + repair_target = os.path.join(fastq_folder, "repaired.flag") + dups_repair_target = os.path.join(fastq_folder, "dups_repaired.flag") - adapter_report = os.path.join( - fastqc_folder, args.sample_name + "_R1_rmAdapter.txt") - + # If single-end, must use cutadapt for plotting purposes + if not args.paired_end: + if args.adapter != "cutadapt": + pm.warning("You set adapter arg to '{}' but you must select " + "'cutadapt' to plot the adapter insertion distribution " + "for single end data.".format(args.adapter)) + #args.adapter = "cutadapt" + + # If we've already aligned to the primary genome, skip these steps unless + # it's a --new-start + #if not pm.get_stat("Aligned_reads") or args.new_start: if args.paired_end: - if args.complexity and args.umi_len > 0: - unmap_fq1, unmap_fq1_dups = 
_process_fastq(args, tools, False, - untrimmed_fastq1, - outfolder=param.outfolder) + if not args.complexity and args.umi_len > 0: + unmap_fq1, unmap_fq1_dups = _process_fastq( + args, tools, res, False, + untrimmed_fastq1, outfolder=param.outfolder) else: - unmap_fq1 = _process_fastq(args, tools, False, - untrimmed_fastq1, - outfolder=param.outfolder) - unmap_fq2, unmap_fq2_dups = _process_fastq(args, tools, True, - untrimmed_fastq2, - outfolder=param.outfolder) + unmap_fq1 = _process_fastq( + args, tools, res, False, + untrimmed_fastq1, outfolder=param.outfolder) + unmap_fq2, unmap_fq2_dups = _process_fastq( + args, tools, res, True, + untrimmed_fastq2, outfolder=param.outfolder) + + pm.debug("\n\nunmap_fq1: {}\nunmap_fq2: {}\n\n".format(unmap_fq1, unmap_fq2)) + + # Gut check + # Processing fastq should have trimmed the reads. + tmp = pm.get_stat("Trimmed_reads") + if tmp: + tr = float(tmp) + else: + tr = 0 + if (tr < 1): + pm.fail_pipeline(RuntimeError("No reads left after trimming. Check trimmer settings")) # Re-pair fastq files r1_repair = os.path.join( @@ -1624,63 +2023,212 @@ def main(): r2_repair_single = os.path.join( fastq_folder, args.sample_name + "_R2_trimmed.fastq.single.fq") - rr = float(pm.get_stat("Raw_reads")) - cmd = (tools.fastqpair + " -t " + str(int(0.9*rr)) + " " + unmap_fq1 + - " " + unmap_fq2) + tmp = float(pm.get_stat("Raw_reads")) + if tmp: + rr = float(tmp) + else: + rr = 0 + if (rr < 1): + pm.fail_pipeline(RuntimeError("Raw_reads were not reported. 
Check output ({})".format(param.outfolder))) + + if args.adapter == "fastp" and int(args.umi_len) > 0: + noUMI_fq1 = os.path.join(fastq_folder, + args.sample_name + "_R1_processed_noUMI.fastq") + noUMI_fq2 = os.path.join(fastq_folder, + args.sample_name + "_R2_trimmed_noUMI.fastq") + cmd1 = ("sed -e 's|\\:[^:]*\\([[:space:]].*\\)|\\1 |g'" + + " " + unmap_fq1 + " > " + noUMI_fq1) + cmd2 = ("sed -e 's|\\:[^:]*\\([[:space:]].*\\)|\\1 |g'" + + " " + unmap_fq2 + " > " + noUMI_fq2) + pm.run([cmd1, cmd2], [noUMI_fq1, noUMI_fq2], shell=True) + cmd1 = ("mv " + noUMI_fq1 + " " + unmap_fq1) + cmd2 = ("mv " + noUMI_fq2 + " " + unmap_fq2) + cmd3 = ("touch " + rmUMI_target) + pm.run([cmd1, cmd2, cmd3], rmUMI_target) + + cmd = (tools.fastqpair + " -t " + str(int(0.9*rr)) + " " + + unmap_fq1 + " " + unmap_fq2) pm.run(cmd, [r1_repair, r2_repair]) pm.clean_add(r1_repair_single) pm.clean_add(r2_repair_single) cmd1 = ("mv " + r1_repair + " " + unmap_fq1) cmd2 = ("mv " + r2_repair + " " + unmap_fq2) - repair_target = os.path.join(fastq_folder, "repaired.flag") - cmd3 = ("touch repaired.flag") + cmd3 = ("touch " + repair_target) pm.run([cmd1, cmd2, cmd3], repair_target) - pm.clean_add(repair_target) - - r1_dups_repair = os.path.join( - fastq_folder, args.sample_name + "_R1_trimmed.fastq.paired.fq") - r2_dups_repair = os.path.join( - fastq_folder, args.sample_name + "_R2_trimmed_dups.fastq.paired.fq") - - r1_dups_repair_single = os.path.join( - fastq_folder, args.sample_name + "_R1_trimmed.fastq.single.fq") - r2_dups_repair_single = os.path.join( - fastq_folder, args.sample_name + "_R2_trimmed_dups.fastq.single.fq") - - cmd = (tools.fastqpair + " -t " + str(int(0.9*rr)) + " " + - unmap_fq1_dups + " " + unmap_fq2_dups) - pm.run(cmd, [r1_dups_repair, r2_dups_repair]) - pm.clean_add(r1_dups_repair_single) - pm.clean_add(r2_dups_repair_single) - cmd1 = ("mv " + r1_dups_repair + " " + unmap_fq1_dups) - cmd2 = ("mv " + r2_dups_repair + " " + unmap_fq2_dups) - dups_repair_target = 
os.path.join(fastq_folder, "dups_repaired.flag") - cmd3 = ("touch dups_repaired.flag") - pm.run([cmd1, cmd2, cmd3], dups_repair_target) - pm.clean_add(dups_repair_target) + + # Re-pair the duplicates (but only if we could identify duplicates) + if args.umi_len > 0: + r1_dups_repair = os.path.join( + fastq_folder, args.sample_name + "_R1_trimmed.fastq.paired.fq") + r2_dups_repair = os.path.join( + fastq_folder, args.sample_name + "_R2_trimmed_dups.fastq.paired.fq") + + r1_dups_repair_single = os.path.join( + fastq_folder, args.sample_name + "_R1_trimmed.fastq.single.fq") + r2_dups_repair_single = os.path.join( + fastq_folder, args.sample_name + "_R2_trimmed_dups.fastq.single.fq") + + if args.adapter == "fastp" and int(args.umi_len) > 0: + noUMI_fq1_dups = os.path.join(fastq_folder, + args.sample_name + "_R1_trimmed_dups_noUMI.fastq") + noUMI_fq2_dups = os.path.join(fastq_folder, + args.sample_name + "_R2_trimmed_dups_noUMI.fastq") + cmd1 = ("sed -e 's|\\:[^:]*\\([[:space:]].*\\)|\\1 |g'" + + " " + unmap_fq1_dups + " > " + noUMI_fq1_dups) + cmd2 = ("sed -e 's|\\:[^:]*\\([[:space:]].*\\)|\\1 |g'" + + " " + unmap_fq2_dups + " > " + noUMI_fq2_dups) + pm.run([cmd1, cmd2], [noUMI_fq1_dups, noUMI_fq2_dups], shell=True) + cmd1 = ("mv " + noUMI_fq1_dups + " " + unmap_fq1_dups) + cmd2 = ("mv " + noUMI_fq2_dups + " " + unmap_fq2_dups) + cmd3 = ("touch " + rmUMI_dups_target) + pm.run([cmd1, cmd2, cmd3], rmUMI_dups_target) + + cmd = (tools.fastqpair + " -t " + str(int(0.9*rr)) + " " + + unmap_fq1_dups + " " + unmap_fq2_dups) + pm.run(cmd, [r1_dups_repair, r2_dups_repair]) + pm.clean_add(r1_dups_repair_single) + pm.clean_add(r2_dups_repair_single) + cmd1 = ("mv " + r1_dups_repair + " " + unmap_fq1_dups) + cmd2 = ("mv " + r2_dups_repair + " " + unmap_fq2_dups) + cmd3 = ("touch " + dups_repair_target) + pm.run([cmd1, cmd2, cmd3], dups_repair_target) else: - if args.complexity and args.umi_len > 0: - unmap_fq1, unmap_fq1_dups = _process_fastq(args, tools, False, - untrimmed_fastq1, 
- outfolder=param.outfolder) + if not args.complexity and args.umi_len > 0: + unmap_fq1, unmap_fq1_dups = _process_fastq( + args, tools, res, False, + untrimmed_fastq1, outfolder=param.outfolder) unmap_fq2 = "" unmap_fq2_dups = "" else: - unmap_fq1 = _process_fastq(args, tools, False, - untrimmed_fastq1, - outfolder=param.outfolder) + unmap_fq1 = _process_fastq( + args, tools, res, False, + untrimmed_fastq1, outfolder=param.outfolder) unmap_fq2 = "" - pm.clean_add(os.path.join(fastq_folder, "*.fq"), conditional=True) - pm.clean_add(os.path.join(fastq_folder, "*.fastq"), conditional=True) - pm.clean_add(os.path.join(fastq_folder, "*.log"), conditional=True) + # NOTE: maintain this functionality for single-end data + # for paired-end it has already been generated at this point + if not args.paired_end: + pm.timestamp("### Plot adapter insertion distribution") + + if not args.adapter == "cutadapt": + pm.info("Skipping adapter insertion distribution plotting...") + pm.info("This requires using 'cutadapt' for adapter removal.") + elif not os.path.exists(cutadapt_report): + pm.info("Skipping adapter insertion distribution plotting...") + pm.info("Could not find {}.`".format(cutadapt_report)) + else: + degradation_pdf = os.path.join(cutadapt_folder, + args.sample_name + "_R1_adapter_insertion_distribution.pdf") + degradation_png = os.path.join(cutadapt_folder, + args.sample_name + "_R1_adapter_insertion_distribution.png") + cmd = (tools.Rscript + " " + tool_path("PEPPRO.R") + + " cutadapt -i " + cutadapt_report + " -o " + cutadapt_folder) + if int(args.umi_len) > 0: + cmd += (" -u " + str(args.umi_len)) + umi_len = args.umi_len + else: + umi_len = 0 + + pm.run(cmd, degradation_pdf, nofail=True) + pm.report_object("Adapter insertion distribution", degradation_pdf, + anchor_image=degradation_png) + + if not pm.get_stat('Peak_adapter_insertion_size') or args.new_start: + # Determine the peak insertion size + cmd = ("awk '/count/,0' " + cutadapt_report + + " | awk 'NR>2 
{print prev} {prev=$0}'" + + " | awk '{if ($3/$2 < 0.01) print $1, $2}'" + + " | awk 'BEGIN{max= 0; max_len=0; len=0}" + + "{if ($2>0+max) {max=$2; len=$1}; max_len=$1} " + + "END{print max_len-len}'") + adapter_peak = pm.checkprint(cmd) + if adapter_peak: + ap = int(adapter_peak) + pm.report_result("Peak_adapter_insertion_size", ap) + + # Calculate the degradation ratio + if not pm.get_stat('Degradation_ratio') or args.new_start: + pm.timestamp("### Calculating degradation ratio") + + cmd = ("awk 'NR>2 {print prev} {prev=$0}' " + cutadapt_report + + " | awk '{ if ($1 == 10) {status = 1}} END " + + "{if (status) {print status} else {print 0}}'") + degraded_lower = pm.checkprint(cmd) + cmd = ("awk 'NR>2 {print prev} {prev=$0}' " + cutadapt_report + + " | awk '{ if ($1 == 20) {status = 1}} END " + + "{if (status) {print status} else {print 0}}'") + degraded_upper = pm.checkprint(cmd) + cmd = ("awk 'NR>2 {print prev} {prev=$0}' " + cutadapt_report + + " | awk '{ if ($1 == 30) {status = 1}} END " + + "{if (status) {print status} else {print 0}}'") + intact_lower = pm.checkprint(cmd) + cmd = ("awk 'NR>2 {print prev} {prev=$0}' " + cutadapt_report + + " | awk '{ if ($1 == 40) {status = 1}} END " + + "{if (status) {print status} else {print 0}}'") + intact_upper = pm.checkprint(cmd) + + if degraded_lower: + dl = int(degraded_lower) + if dl == 1: + dl = 10 + else: + cmd = ("awk 'NR>2 {print prev} {prev=$0}' " + + cutadapt_report + " | awk 'NR==1 {print $1}'") + degraded_lower = pm.checkprint(cmd) + dl = int(degraded_lower) if degraded_lower else 1 + + if degraded_upper: + du = int(degraded_upper) + if du == 1: + du = 20 + else: + du = int(degraded_lower) + 9 + + if intact_upper: + iu = int(intact_upper) + if iu == 1: + iu = 40 + else: + cmd = ("awk 'NR>2 {print prev} {prev=$0}' " + + cutadapt_report + " | awk 'END {print $1}'") + intact_upper = pm.checkprint(cmd) + dl = int(intact_upper) if intact_upper else 40 + + if intact_lower: + il = int(intact_lower) + if il == 1: + 
il = 30 + else: + il = int(intact_upper) - 10 + + cmd = ("awk '/count/,0' " + cutadapt_report + + " | awk 'NR>2 {print prev} {prev=$0}'" + + " | awk '{if ($3/$2 < 0.01) print $1, $2}'" + + " | awk '{a[NR]=$1; b[NR]=$2; max_len=$1}" + + "{if ($1 > max_len) {max_len=$1}} " + + "END{ for (i in a) print 1+max_len-a[i], b[i]}'" + + " | sort -nk1 | awk '(($1-" + str(umi_len) + ") <= " + + str(du) + " && ($1-" + str(umi_len) + ") >= " + + str(dl) + "){degradedSum += $2}; " + + "(($1-" + str(umi_len) + ") >= " + str(il) + + " && ($1-" + str(umi_len) + ") <= " + str(iu) + + "){intactSum += $2} END {if (intactSum < 1) " + + "{intactSum = 1} print degradedSum/intactSum}'") + degradation_ratio = pm.checkprint(cmd) + if degradation_ratio: + dr = float(degradation_ratio) + pm.report_result("Degradation_ratio", round(dr, 4)) + + pm.clean_add(fastq_folder, conditional=True) ############################################################################ # Map to any requested prealignments # ############################################################################ # We recommend mapping to human_rDNA first for PRO-seq data pm.timestamp("### Prealignments") + to_compress = [] + #if not pm.get_stat("Aligned_reads") or args.new_start: if len(args.prealignments) == 0: print("You may use `--prealignments` to align to references before " "the genome alignment step. 
See docs.") @@ -1688,13 +2236,13 @@ def main(): print("Prealignment assemblies: " + str(args.prealignments)) # Loop through any prealignment references and map to them sequentially for reference in args.prealignments: - if args.complexity and args.umi_len > 0: + if not args.complexity and args.umi_len > 0: if args.no_fifo: unmap_fq1, unmap_fq2 = _align_with_bt2( args, tools, args.paired_end, False, unmap_fq1, unmap_fq2, reference, assembly_bt2=os.path.join( - res.rgc.get_asset(reference, BT2_IDX_KEY), reference), + rgc.get_asset(reference, BT2_IDX_KEY), reference), outfolder=param.outfolder, aligndir="prealignments") @@ -1702,7 +2250,7 @@ def main(): args, tools, args.paired_end, False, unmap_fq1_dups, unmap_fq2_dups, reference, assembly_bt2=os.path.join( - res.rgc.get_asset(reference, BT2_IDX_KEY), reference), + rgc.get_asset(reference, BT2_IDX_KEY), reference), outfolder=param.outfolder, aligndir="prealignments", dups=True) @@ -1712,7 +2260,7 @@ def main(): args, tools, args.paired_end, True, unmap_fq1, unmap_fq2, reference, assembly_bt2=os.path.join( - res.rgc.get_asset(reference, BT2_IDX_KEY), reference), + rgc.get_asset(reference, BT2_IDX_KEY), reference), outfolder=param.outfolder, aligndir="prealignments") @@ -1720,7 +2268,7 @@ def main(): args, tools, args.paired_end, True, unmap_fq1_dups, unmap_fq2_dups, reference, assembly_bt2=os.path.join( - res.rgc.get_asset(reference, BT2_IDX_KEY), reference), + rgc.get_asset(reference, BT2_IDX_KEY), reference), outfolder=param.outfolder, aligndir="prealignments", dups=True) @@ -1738,7 +2286,7 @@ def main(): args, tools, args.paired_end, False, unmap_fq1, unmap_fq2, reference, assembly_bt2=os.path.join( - res.rgc.get_asset(reference, BT2_IDX_KEY), reference), + rgc.get_asset(reference, BT2_IDX_KEY), reference), outfolder=param.outfolder, aligndir="prealignments") else: @@ -1746,7 +2294,7 @@ def main(): args, tools, args.paired_end, True, unmap_fq1, unmap_fq2, reference, assembly_bt2=os.path.join( - 
res.rgc.get_asset(reference, BT2_IDX_KEY), reference), + rgc.get_asset(reference, BT2_IDX_KEY), reference), outfolder=param.outfolder, aligndir="prealignments") if args.paired_end: @@ -1759,6 +2307,8 @@ def main(): # Map to primary genome # ############################################################################ pm.timestamp("### Map to genome") + + # Set up named files and options map_genome_folder = os.path.join( param.outfolder, "aligned_" + args.genome_assembly) ngstk.make_dir(map_genome_folder) @@ -1772,15 +2322,19 @@ def main(): unmap_genome_bam = os.path.join( map_genome_folder, args.sample_name + "_unmap.bam") - if args.complexity and args.umi_len > 0: - mapping_genome_bam_dups = os.path.join( - map_genome_folder, args.sample_name + "_sort_dups.bam") - mapping_genome_bam_temp_dups = os.path.join( - map_genome_folder, args.sample_name + "_temp_dups.bam") - failQC_genome_bam_dups = os.path.join( - map_genome_folder, args.sample_name + "_fail_qc_dups.bam") - unmap_genome_bam_dups = os.path.join( - map_genome_folder, args.sample_name + "_unmap_dups.bam") + mapping_genome_bam_dups = os.path.join( + map_genome_folder, args.sample_name + "_sort_dups.bam") + mapping_genome_bam_temp_dups = os.path.join( + map_genome_folder, args.sample_name + "_temp_dups.bam") + failQC_genome_bam_dups = os.path.join( + map_genome_folder, args.sample_name + "_fail_qc_dups.bam") + unmap_genome_bam_dups = os.path.join( + map_genome_folder, args.sample_name + "_unmap_dups.bam") + + temp_mapping_index = os.path.join(mapping_genome_bam_temp + ".bai") + temp_mapping_index_dups = os.path.join(mapping_genome_bam_temp_dups + ".bai") + + mito_name = ["chrM", "chrMT", "M", "MT", "rCRSd", "rCRSd_3k"] bt2_options = " --very-sensitive" bt2_options += " -X 2000" @@ -1791,20 +2345,24 @@ def main(): pm.clean_add(tempdir) # check input for zipped or not - if pypiper.is_gzipped_fastq(unmap_fq1): - cmd = (ngstk.ziptool + " -d " + (unmap_fq1 + ".gz")) + unmap_fq1_gz = unmap_fq1 + ".gz" + unmap_fq2_gz 
= unmap_fq2 + ".gz" + if _itsa_file(unmap_fq1_gz) and not _itsa_file(unmap_fq1): + cmd = (ngstk.ziptool + " -d " + unmap_fq1_gz) pm.run(cmd, mapping_genome_bam) + to_compress.append(unmap_fq1) if args.paired_end: - if pypiper.is_gzipped_fastq(unmap_fq2): - cmd = (ngstk.ziptool + " -d " + (unmap_fq2 + ".gz")) + if _itsa_file(unmap_fq2_gz) and not _itsa_file(unmap_fq2): + cmd = (ngstk.ziptool + " -d " + unmap_fq2_gz) pm.run(cmd, mapping_genome_bam) + to_compress.append(unmap_fq2) cmd = tools.bowtie2 + " -p " + str(pm.cores) cmd += bt2_options cmd += " --rg-id " + args.sample_name cmd += " -x " + os.path.join( - res.rgc.get_asset(args.genome_assembly, BT2_IDX_KEY), - args.genome_assembly) + rgc.get_asset(args.genome_assembly, BT2_IDX_KEY), + args.genome_assembly) if args.paired_end: cmd += " --rf -1 " + unmap_fq1 + " -2 " + unmap_fq2 else: @@ -1814,7 +2372,7 @@ def main(): cmd += " -T " + tempdir cmd += " -o " + mapping_genome_bam_temp - if args.complexity and args.umi_len > 0: + if not args.complexity and args.umi_len > 0: # check input for zipped or not if pypiper.is_gzipped_fastq(unmap_fq1_dups): cmd = (ngstk.ziptool + " -d " + (unmap_fq1_dups + ".gz")) @@ -1828,8 +2386,8 @@ def main(): cmd_dups += bt2_options cmd_dups += " --rg-id " + args.sample_name cmd_dups += " -x " + os.path.join( - res.rgc.get_asset(args.genome_assembly, BT2_IDX_KEY), - args.genome_assembly) + rgc.get_asset(args.genome_assembly, BT2_IDX_KEY), + args.genome_assembly) if args.paired_end: cmd_dups += " --rf -1 " + unmap_fq1_dups + " -2 " + unmap_fq2_dups else: @@ -1845,39 +2403,46 @@ def main(): # -q 10: skip alignments with MAPQ less than 10 cmd2 = (tools.samtools + " view -q 10 -b -@ " + str(pm.cores) + " -U " + failQC_genome_bam + " ") - #if args.paired_end: - # add a step to accept only reads mapped in proper pair - # ?not appropriate with reverse complemented proseq reads? 
- #cmd2 += "-f 2 " - cmd2 += mapping_genome_bam_temp + " > " + mapping_genome_bam - if args.complexity and args.umi_len > 0: + if not args.complexity and args.umi_len > 0: cmd2_dups = (tools.samtools + " view -q 10 -b -@ " + str(pm.cores) + " -U " + failQC_genome_bam_dups + " ") - #if args.paired_end: - # add a step to accept only reads mapped in proper pair - #cmd2_dups += "-f 2 " - cmd2_dups += mapping_genome_bam_temp_dups + " > " + mapping_genome_bam_dups pm.clean_add(failQC_genome_bam_dups) def check_alignment_genome(temp_bam, bam): mr = ngstk.count_mapped_reads(temp_bam, args.paired_end) ar = ngstk.count_mapped_reads(bam, args.paired_end) + + if float(ar) < 1: + err_msg = "No aligned reads. Check alignment settings." + pm.fail_pipeline(RuntimeError(err_msg)) if args.paired_end: ar = float(ar)/2 - rr = float(pm.get_stat("Raw_reads")) - tr = float(pm.get_stat("Trimmed_reads")) - if os.path.exists(res.pre_mRNA_annotation): + + tmp = pm.get_stat("Raw_reads") + if tmp: + rr = float(tmp) + else: + rr = 0 + + tmp = pm.get_stat("Trimmed_reads") + if tmp: + tr = float(tmp) + else: + tr = 0 + + if os.path.exists(res.refgene_pre_mRNA): cmd = (tools.samtools + " depth -b " + - res.pre_mRNA_annotation + " " + bam + + res.refgene_pre_mRNA + " " + bam + " | awk '{counter++;sum+=$3}END{print sum/counter}'") rd = pm.checkprint(cmd) else: cmd = (tools.samtools + " depth " + bam + " | awk '{counter++;sum+=$3}END{print sum/counter}'") rd = pm.checkprint(cmd) + pm.report_result("Mapped_reads", mr) pm.report_result("QC_filtered_reads", round(float(mr)) - round(float(ar))) @@ -1891,15 +2456,13 @@ def check_alignment_genome(temp_bam, bam): pm.run([cmd, cmd2], mapping_genome_bam, follow=lambda: check_alignment_genome(mapping_genome_bam_temp, - mapping_genome_bam), - container=pm.container) + mapping_genome_bam)) - if args.complexity and args.umi_len > 0: - pm.run([cmd_dups, cmd2_dups], mapping_genome_bam_dups, - container=pm.container) + if not args.complexity and args.umi_len > 0: + 
pm.run([cmd_dups, cmd2_dups], mapping_genome_bam_dups) pm.timestamp("### Compress all unmapped read files") - for unmapped_fq in to_compress: + for unmapped_fq in list(set(to_compress)): # Compress unmapped fastq reads if not pypiper.is_gzipped_fastq(unmapped_fq) and not unmapped_fq == '': if 'unmap_dups' in unmapped_fq: @@ -1909,66 +2472,54 @@ def check_alignment_genome(temp_bam, bam): unmapped_fq = unmapped_fq + ".gz" pm.run(cmd, unmapped_fq) - - temp_mapping_index = os.path.join(mapping_genome_bam_temp + ".bai") - temp_mapping_index_dups = os.path.join(mapping_genome_bam_temp_dups + ".bai") - if not args.prealignments and os.path.exists(mapping_genome_bam_temp): # Index the temporary bam file cmd = tools.samtools + " index " + mapping_genome_bam_temp pm.run(cmd, temp_mapping_index) pm.clean_add(temp_mapping_index) - if args.complexity and args.umi_len > 0: + if not args.complexity and args.umi_len > 0: cmd_dups = tools.samtools + " index " + mapping_genome_bam_temp_dups pm.run(cmd_dups, temp_mapping_index_dups) pm.clean_add(temp_mapping_index_dups) pm.clean_add(mapping_genome_bam_temp_dups) - # Determine mitochondrial read counts - # TODO: instead do this to rDNA? 
- mito_name = ["chrM", "chrMT", "M", "MT", "rCRSd", "rCRSd_3k"] - if os.path.exists(mapping_genome_bam_temp): - if not os.path.exists(temp_mapping_index): - cmd = tools.samtools + " index " + mapping_genome_bam_temp - pm.run(cmd, temp_mapping_index) - pm.clean_add(temp_mapping_index) - - cmd = (tools.samtools + " idxstats " + mapping_genome_bam_temp + - " | grep") - for name in mito_name: - cmd += " -we '" + name + "'" - cmd += "| cut -f 3" - mr = pm.checkprint(cmd) - - # If there are mitochondrial reads, report and remove them - if mr and mr.strip(): - pm.report_result("Mitochondrial_reads", round(float(mr))) - # Index the sort'ed BAM file first - mapping_genome_index = os.path.join(mapping_genome_bam + ".bai") - noMT_mapping_genome_bam = os.path.join( - map_genome_folder, args.sample_name + "_noMT.bam") - - cmd1 = tools.samtools + " index " + mapping_genome_bam - cmd2 = (tools.samtools + " idxstats " + mapping_genome_bam + - " | cut -f 1 | grep") + if not pm.get_stat("Mitochondrial_reads") or args.new_start: + # Determine mitochondrial read counts + if os.path.exists(mapping_genome_bam_temp): + if not os.path.exists(temp_mapping_index): + cmd = tools.samtools + " index " + mapping_genome_bam_temp + pm.run(cmd, temp_mapping_index) + pm.clean_add(temp_mapping_index) + + cmd = (tools.samtools + " idxstats " + + mapping_genome_bam_temp + " | grep") for name in mito_name: - cmd2 += " -vwe '" + name + "'" - cmd2 += ("| xargs " + tools.samtools + " view -b -@ " + - str(pm.cores) + " " + mapping_genome_bam + " > " + - noMT_mapping_genome_bam) - cmd3 = ("mv " + noMT_mapping_genome_bam + " " + mapping_genome_bam) - cmd4 = tools.samtools + " index " + mapping_genome_bam - pm.run([cmd1, cmd2, cmd3, cmd4], noMT_mapping_genome_bam) - pm.clean_add(mapping_genome_index) - - # Determine maximum read length - cmd = (tools.samtools + " stats " + mapping_genome_bam + - " | grep '^SN' | cut -f 2- | grep 'maximum length:' | cut -f 2-") - max_len = int(pm.checkprint(cmd)) - - if 
args.max_len != -1: - max_len = args.max_len + cmd += " -we '" + name + "'" + cmd += "| cut -f 3" + mr = pm.checkprint(cmd) + + # If there are mitochondrial reads, report and remove them + if mr and mr.strip(): + pm.report_result("Mitochondrial_reads", round(float(mr))) + # Index the sort'ed BAM file first + mapping_genome_index = os.path.join(mapping_genome_bam + ".bai") + noMT_mapping_genome_bam = os.path.join( + map_genome_folder, args.sample_name + "_noMT.bam") + + cmd1 = tools.samtools + " index " + mapping_genome_bam + cmd2 = (tools.samtools + " idxstats " + mapping_genome_bam + + " | cut -f 1 | grep") + for name in mito_name: + cmd2 += " -vwe '" + name + "'" + cmd2 += ("| xargs " + tools.samtools + " view -b -@ " + + str(pm.cores) + " " + mapping_genome_bam + " > " + + noMT_mapping_genome_bam) + cmd3 = ("mv " + noMT_mapping_genome_bam + + " " + mapping_genome_bam) + cmd4 = tools.samtools + " index " + mapping_genome_bam + pm.run([cmd1, cmd2, cmd3, cmd4], noMT_mapping_genome_bam) + pm.clean_add(mapping_genome_index) # Remove PE2 reads if args.paired_end: @@ -1987,120 +2538,171 @@ def check_alignment_genome(temp_bam, bam): mapping_genome_bam = mapping_pe1_bam ############################################################################ - # Calculate library complexity # + # Determine maximum read length and add seqOutBias resource # ############################################################################ - if args.complexity and args.umi_len > 0: - if os.path.exists(mapping_genome_bam_temp_dups): - if not os.path.exists(temp_mapping_index_dups): - cmd = tools.samtools + " index " + mapping_genome_bam_temp_dups - pm.run(cmd, temp_mapping_index_dups) - pm.clean_add(temp_mapping_index_dups) - - cmd_dups = (tools.samtools + " idxstats " + - mapping_genome_bam_temp_dups + " | grep") - for name in mito_name: - cmd_dups += " -we '" + name + "'" - cmd_dups += "| cut -f 3" - mr_dups = pm.checkprint(cmd_dups) - - if mr_dups and mr_dups.strip(): - # Index the sort'ed 
BAM file first - mapping_genome_index_dups = os.path.join( - mapping_genome_bam_dups + ".bai") - noMT_mapping_genome_bam_dups = os.path.join( - map_genome_folder, args.sample_name + "_noMT_dups.bam") - - cmd1 = tools.samtools + " index " + mapping_genome_bam_dups - cmd2 = (tools.samtools + " idxstats " + - mapping_genome_bam_dups + " | cut -f 1 | grep") - for name in mito_name: - cmd2 += " -vwe '" + name + "'" - cmd2 += ("| xargs " + tools.samtools + " view -b -@ " + - str(pm.cores) + " " + mapping_genome_bam_dups + - " > " + noMT_mapping_genome_bam_dups) - cmd3 = ("mv " + noMT_mapping_genome_bam_dups + " " + - mapping_genome_bam_dups) - cmd4 = tools.samtools + " index " + mapping_genome_bam_dups - pm.run([cmd1, cmd2, cmd3, cmd4], noMT_mapping_genome_bam_dups) - pm.clean_add(mapping_genome_index_dups) - - # Remove PE2 reads - if args.paired_end: - dups_pe1_bam = os.path.join( - map_genome_folder, args.sample_name + "_dups_PE1.bam") - dups_pe2_bam = os.path.join( - map_genome_folder, args.sample_name + "_dups_PE2.bam") - cmd1 = (tools.samtools + " view -b -f 64 " + mapping_genome_bam_dups + - " | " + tools.samtools + " sort - -@ " + str(pm.cores) + - " > " + dups_pe1_bam) - cmd2 = (tools.samtools + " view -b -f 128 " + mapping_genome_bam_dups + - " | " + tools.samtools + " sort - -@ " + str(pm.cores) + - " > " + dups_pe2_bam) - pm.run([cmd1, cmd2], [dups_pe1_bam, dups_pe2_bam]) - mapping_genome_bam_dups = dups_pe1_bam - - pm.timestamp("### Calculate library complexity") - QC_folder = os.path.join(param.outfolder, "QC_" + args.genome_assembly) - ngstk.make_dir(QC_folder) - - preseq_output = os.path.join( - QC_folder, args.sample_name + "_preseq_out.txt") - preseq_yield = os.path.join( - QC_folder, args.sample_name + "_preseq_yield.txt") - # preseq_mr = os.path.join( - # QC_folder, args.sample_name + "_preseq.mr") - # preseq_cov = os.path.join( - # QC_folder, args.sample_name + "_preseq_coverage.txt") - preseq_counts = os.path.join( - QC_folder, args.sample_name + 
"_preseq_counts.txt") - preseq_plot = os.path.join( - QC_folder, args.sample_name + "_preseq_plot") - preseq_pdf = os.path.join( - QC_folder, args.sample_name + "_preseq_plot.pdf") - preseq_png = os.path.join( - QC_folder, args.sample_name + "_preseq_plot.png") - - cmd1 = (tools.preseq + " c_curve -v -o " + preseq_output + - " -B " + mapping_genome_bam_dups) - pm.run(cmd1, preseq_output) - - cmd2 = (tools.preseq + " lc_extrap -v -o " + preseq_yield + - " -B " + mapping_genome_bam_dups) - pm.run(cmd2, preseq_yield, nofail=True) - - if os.path.exists(preseq_yield): - # cmd3 = ("bam2mr " + mapping_genome_bam_dups + - # " > " + preseq_mr) - # cmd4 = (tools.preseq + " gc_extrap -v -o " + preseq_cov + - # " " + preseq_mr) - cmd5 = ("echo '" + preseq_yield + - " '$(" + tools.samtools + " view -c -F 4 " + - mapping_genome_bam_dups + ")" + "' '" + - "$(" + tools.samtools + " view -c -F 4 " + - mapping_genome_bam + ") > " + preseq_counts) - - # pm.run([cmd3, cmd4, cmd5], - # [preseq_mr, preseq_cov, preseq_counts]) - pm.run(cmd5, preseq_counts) - #pm.clean_add(preseq_mr) - pm.clean_add(mapping_genome_bam_dups) - pm.clean_add(mapping_genome_bam_temp_dups) - - cmd = ("awk '{sum+=$2} END {printf \"%.0f\", sum}' " + res.chrom_sizes) - genome_size = int(pm.checkprint(cmd)) - - cmd = (tools.Rscript + " " + tool_path("PEPPRO.R") + - " preseq " + "-i " + preseq_yield) - if args.coverage: - cmd += (" -c " + str(genome_size) + " -l " + max_len) - cmd += (" -r " + preseq_counts + " -o " + preseq_plot) - pm.run(cmd, [preseq_pdf, preseq_png]) + if not pm.get_stat("Maximum_read_length") or args.new_start: + if int(args.max_len) > 0: + max_len = int(args.max_len) + elif _itsa_file(mapping_genome_bam): + cmd = (tools.samtools + " stats " + mapping_genome_bam + + " | grep '^SN' | cut -f 2- | grep 'maximum length:' | cut -f 2-") + max_len = int(pm.checkprint(cmd)) + else: + max_len = int(DEFAULT_MAX_LEN) + pm.report_result("Maximum_read_length", max_len) + else: + max_len = 
int(pm.get_stat("Maximum_read_length")) - pm.report_object("Library complexity", preseq_pdf, - anchor_image=preseq_png) + # At this point we can check for seqOutBias required indicies. + # Can't do it earlier because we haven't determined the read_length of + # interest for mappability purposes. + if args.sob: + pm.debug("max_len: {}".format(max_len)) # DEBUG + if max_len == DEFAULT_MAX_LEN: + search_asset = [{"asset_name":"tallymer_index", + "seek_key":"search_file", + "tag_name":"default", + "arg":"search_file", + "user_arg":"search-file", + "required":True}] else: - print("Unable to calculate library complexity.") + search_asset = [{"asset_name":"tallymer_index", + "seek_key":"search_file", + "tag_name":max_len, + "arg":"search_file", + "user_arg":"search-file", + "required":True}] + res, rgc = _add_resources(args, res, search_asset) + + # Calculate size of genome + if not pm.get_stat("Genome_size") or args.new_start: + genome_size = int(pm.checkprint( + ("awk '{sum+=$2} END {printf \"%.0f\", sum}' " + + res.chrom_sizes))) + pm.report_result("Genome_size", genome_size) + else: + genome_size = int(pm.get_stat("Genome_size")) + + ############################################################################ + # Calculate library complexity # + ############################################################################ + QC_folder = os.path.join(param.outfolder, "QC_" + args.genome_assembly) + ngstk.make_dir(QC_folder) + preseq_output = os.path.join( + QC_folder, args.sample_name + "_preseq_out.txt") + preseq_yield = os.path.join( + QC_folder, args.sample_name + "_preseq_yield.txt") + preseq_counts = os.path.join( + QC_folder, args.sample_name + "_preseq_counts.txt") + preseq_plot = os.path.join( + QC_folder, args.sample_name + "_preseq_plot") + preseq_pdf = os.path.join( + QC_folder, args.sample_name + "_preseq_plot.pdf") + preseq_png = os.path.join( + QC_folder, args.sample_name + "_preseq_plot.png") + + if not _itsa_file(preseq_plot) or args.new_start: + if not 
args.complexity and args.umi_len > 0: + if os.path.exists(mapping_genome_bam_temp_dups): + if not os.path.exists(temp_mapping_index_dups): + cmd = tools.samtools + " index " + mapping_genome_bam_temp_dups + pm.run(cmd, temp_mapping_index_dups) + pm.clean_add(temp_mapping_index_dups) + + cmd_dups = (tools.samtools + " idxstats " + + mapping_genome_bam_temp_dups + " | grep") + for name in mito_name: + cmd_dups += " -we '" + name + "'" + cmd_dups += "| cut -f 3" + mr_dups = pm.checkprint(cmd_dups) + + if mr_dups and mr_dups.strip(): + # Index the sort'ed BAM file first + mapping_genome_index_dups = os.path.join( + mapping_genome_bam_dups + ".bai") + noMT_mapping_genome_bam_dups = os.path.join( + map_genome_folder, args.sample_name + "_noMT_dups.bam") + + cmd1 = tools.samtools + " index " + mapping_genome_bam_dups + cmd2 = (tools.samtools + " idxstats " + + mapping_genome_bam_dups + " | cut -f 1 | grep") + for name in mito_name: + cmd2 += " -vwe '" + name + "'" + cmd2 += ("| xargs " + tools.samtools + " view -b -@ " + + str(pm.cores) + " " + mapping_genome_bam_dups + + " > " + noMT_mapping_genome_bam_dups) + cmd3 = ("mv " + noMT_mapping_genome_bam_dups + " " + + mapping_genome_bam_dups) + cmd4 = tools.samtools + " index " + mapping_genome_bam_dups + pm.run([cmd1, cmd2, cmd3, cmd4], mapping_genome_bam_dups) + pm.clean_add(mapping_genome_index_dups) + + # Remove PE2 reads + if args.paired_end: + dups_pe1_bam = os.path.join( + map_genome_folder, args.sample_name + "_dups_PE1.bam") + dups_pe2_bam = os.path.join( + map_genome_folder, args.sample_name + "_dups_PE2.bam") + cmd1 = (tools.samtools + " view -b -f 64 " + + mapping_genome_bam_dups + " | " + tools.samtools + + " sort - -@ " + str(pm.cores) + " > " + dups_pe1_bam) + cmd2 = (tools.samtools + " view -b -f 128 " + + mapping_genome_bam_dups + " | " + tools.samtools + + " sort - -@ " + str(pm.cores) + " > " + dups_pe2_bam) + pm.run([cmd1, cmd2], [dups_pe1_bam, dups_pe2_bam]) + mapping_genome_bam_dups = dups_pe1_bam + + 
pm.timestamp("### Calculate library complexity") + + cmd1 = (tools.preseq + " c_curve -v -o " + preseq_output + + " -B " + mapping_genome_bam_dups) + pm.run(cmd1, preseq_output) + + cmd2 = (tools.preseq + " lc_extrap -v -o " + preseq_yield + + " -B " + mapping_genome_bam_dups) + pm.run(cmd2, preseq_yield, nofail=True) + + if os.path.exists(preseq_yield): + # cmd3 = ("bam2mr " + mapping_genome_bam_dups + + # " > " + preseq_mr) + # cmd4 = (tools.preseq + " gc_extrap -v -o " + preseq_cov + + # " " + preseq_mr) + cmd5 = ("echo '" + preseq_yield + + " '$(" + tools.samtools + " view -c -F 4 " + + mapping_genome_bam_dups + ")" + "' '" + + "$(" + tools.samtools + " view -c -F 4 " + + mapping_genome_bam + ") > " + preseq_counts) + + # pm.run([cmd3, cmd4, cmd5], + # [preseq_mr, preseq_cov, preseq_counts]) + pm.run(cmd5, preseq_counts) + #pm.clean_add(preseq_mr) + pm.clean_add(mapping_genome_bam_dups) + pm.clean_add(mapping_genome_bam_temp_dups) + + cmd = (tools.Rscript + " " + tool_path("PEPPRO.R") + + " preseq " + "-i " + preseq_yield) + if args.coverage: + cmd += (" -c " + str(genome_size) + " -l " + max_len) + cmd += (" -r " + preseq_counts + " -o " + preseq_plot) + + pm.run(cmd, [preseq_pdf, preseq_png]) + + pm.report_object("Library complexity", preseq_pdf, + anchor_image=preseq_png) + + if not pm.get_stat('Frac_exp_unique_at_10M') or args.new_start: + # Report the expected unique at 10M reads + cmd = ("grep -w '10000000' " + preseq_yield + + " | awk '{print $2}'") + expected_unique = pm.checkprint(cmd) + if expected_unique: + fraction_unique = float(expected_unique)/float(10000000) + pm.report_result("Frac_exp_unique_at_10M", + round(fraction_unique, 4)) + else: + print("Unable to calculate library complexity.") ############################################################################ # Calculate quality control metrics for the alignment file # @@ -2114,6 +2716,7 @@ def check_alignment_genome(temp_bam, bam): bamQC = os.path.join(QC_folder, args.sample_name + 
"_bamQC.tsv") cmd = tool_path("bamQC.py") + cmd += " --silent" cmd += " -i " + mapping_genome_bam cmd += " -c " + str(pm.cores) cmd += " -o " + bamQC @@ -2143,8 +2746,7 @@ def report_bam_qc(bamqc_log): pm.report_result("PBC1", round(float(pbc1),2)) pm.report_result("PBC2", round(float(pbc2), 2)) - pm.run(cmd, bamQC, follow=lambda: report_bam_qc(bamQC), - container=pm.container) + pm.run(cmd, bamQC, follow=lambda: report_bam_qc(bamQC)) ############################################################################ # Produce unmapped reads file # @@ -2165,8 +2767,7 @@ def count_unmapped_reads(): unmap_cmd += " -f 4 " unmap_cmd += " " + mapping_genome_bam_temp + " > " + unmap_genome_bam - pm.run(unmap_cmd, unmap_genome_bam, follow=count_unmapped_reads, - container=pm.container) + pm.run(unmap_cmd, unmap_genome_bam, follow=count_unmapped_reads) # Remove temporary bam file from unmapped file production pm.clean_add(mapping_genome_bam_temp) @@ -2188,88 +2789,91 @@ def count_unmapped_reads(): mapping_genome_bam, (">", plus_bam) ]) + cmd2 = build_command([ tools.samtools, "view", "-bh", - ("-f", 0x10), + ("-f", 16), mapping_genome_bam, (">", minus_bam) ]) - pm.run([cmd1,cmd2], minus_bam) + pm.run([cmd1, cmd2], [plus_bam, minus_bam]) ############################################################################ # TSS enrichment # ############################################################################ - if not os.path.exists(res.tss_annotation): + if not os.path.exists(res.refgene_tss): print("Skipping TSS -- TSS enrichment requires TSS annotation file: {}" - .format(res.tss_annotation)) + .format(res.refgene_tss)) else: pm.timestamp("### Calculate TSS enrichment") - # Split TSS file - plus_TSS = os.path.join(QC_folder, "plus_TSS.tsv") - minus_TSS = os.path.join(QC_folder, "minus_TSS.tsv") - cmd = ("sed -n -e '/[[:space:]]+/w " + - plus_TSS + "' -e '/[[:space:]]-/w " + - minus_TSS + "' " + res.tss_annotation) - pm.run(cmd, [plus_TSS, minus_TSS]) - - # pyTssEnrichment 
requires indexed bam - if not os.path.exists(mapping_genome_index): - cmd = build_command([ - tools.samtools, - "index", - mapping_genome_bam - ]) - pm.run(cmd, mapping_genome_index) - - # Plus TSS enrichment Tss_plus = os.path.join(QC_folder, args.sample_name + "_plus_TssEnrichment.txt") - cmd = tool_path("pyTssEnrichment.py") - cmd += " -a " + mapping_genome_bam + " -b " + plus_TSS + " -p ends" - cmd += " -c " + str(pm.cores) - cmd += " -z -v -s 4 -o " + Tss_plus - pm.run(cmd, Tss_plus, nofail=True) - pm.clean_add(plus_TSS) - pm.clean_add(Tss_plus) - - with open(Tss_plus) as f: - floats = list(map(float, f)) - try: - # If the TSS enrichment is 0, don't report - Tss_score = ( - (sum(floats[int(floats.index(max(floats))-49): - int(floats.index(max(floats))+51)]) / 100) / - (sum(floats[1:int(len(floats)*0.05)]) / int(len(floats)*0.05))) - pm.report_result("TSS_Plus_Score", round(Tss_score, 1)) - except ZeroDivisionError: - pass - - # Minus TSS enrichment Tss_minus = os.path.join(QC_folder, args.sample_name + "_minus_TssEnrichment.txt") - cmd = tool_path("pyTssEnrichment.py") - cmd += " -a " + mapping_genome_bam + " -b " + minus_TSS + " -p ends" - cmd += " -c " + str(pm.cores) - cmd += " -z -v -s 4 -o " + Tss_minus - pm.run(cmd, Tss_minus, nofail=True) - pm.clean_add(minus_TSS) - pm.clean_add(Tss_minus) - - with open(Tss_minus) as f: - floats = list(map(float, f)) - try: - # If the TSS enrichment is 0, don't report - Tss_score = ( - (sum(floats[int(floats.index(max(floats))-49): - int(floats.index(max(floats))+51)]) / 100) / - (sum(floats[1:int(len(floats)*0.05)]) / int(len(floats)*0.05))) - pm.report_result("TSS_Minus_Score", round(Tss_score, 1)) - except ZeroDivisionError: - pass + + if not pm.get_stat("TSS_Minus_Score") or args.new_start: + # Split TSS file + plus_TSS = os.path.join(QC_folder, "plus_TSS.tsv") + minus_TSS = os.path.join(QC_folder, "minus_TSS.tsv") + cmd = ("sed -n -e '/[[:space:]]+/w " + + plus_TSS + "' -e '/[[:space:]]-/w " + + minus_TSS + "' " + 
res.refgene_tss) + pm.run(cmd, [plus_TSS, minus_TSS]) + + # pyTssEnrichment requires indexed bam + if not os.path.exists(mapping_genome_index): + cmd = build_command([ + tools.samtools, + "index", + mapping_genome_bam + ]) + pm.run(cmd, mapping_genome_index) + + # Plus TSS enrichment + cmd = tool_path("pyTssEnrichment.py") + cmd += " -a " + mapping_genome_bam + " -b " + plus_TSS + " -p ends" + cmd += " -c " + str(pm.cores) + cmd += " -z -v -s 6 -o " + Tss_plus + pm.run(cmd, Tss_plus, nofail=True) + pm.clean_add(plus_TSS) + pm.clean_add(Tss_plus) + + with open(Tss_plus) as f: + floats = list(map(float, f)) + try: + # If the TSS enrichment is 0, don't report + Tss_score = ( + (sum(floats[int(floats.index(max(floats))-49): + int(floats.index(max(floats))+51)]) / 100) / + (sum(floats[1:int(len(floats)*0.05)]) / int(len(floats)*0.05))) + pm.report_result("TSS_Plus_Score", round(Tss_score, 1)) + except ZeroDivisionError: + pass + + # Minus TSS enrichment + cmd = tool_path("pyTssEnrichment.py") + cmd += " -a " + mapping_genome_bam + " -b " + minus_TSS + " -p ends" + cmd += " -c " + str(pm.cores) + cmd += " -z -v -s 6 -o " + Tss_minus + pm.run(cmd, Tss_minus, nofail=True) + pm.clean_add(minus_TSS) + pm.clean_add(Tss_minus) + + with open(Tss_minus) as f: + floats = list(map(float, f)) + try: + # If the TSS enrichment is 0, don't report + Tss_score = ( + (sum(floats[int(floats.index(max(floats))-49): + int(floats.index(max(floats))+51)]) / 100) / + (sum(floats[1:int(len(floats)*0.05)]) / int(len(floats)*0.05))) + pm.report_result("TSS_Minus_Score", round(Tss_score, 1)) + except ZeroDivisionError: + pass # Call Rscript to plot TSS Enrichment TSS_pdf = os.path.join(QC_folder, args.sample_name + @@ -2309,74 +2913,74 @@ def count_unmapped_reads(): pm.clean_add(chr_order) pm.clean_add(chr_keep) - if not os.path.exists(res.pi_tss): - if not os.path.exists(res.pi_body): + if not os.path.exists(res.ensembl_tss): + if not os.path.exists(res.ensembl_gene_body): print("Skipping PI -- 
Pause index requires 'TSS' and 'gene body' annotation files: {} and {}" - .format(res.pi_tss, res.pi_body)) + .format(res.ensembl_tss, res.ensembl_gene_body)) else: print("Skipping PI -- Pause index requires 'TSS' annotation file: {}" - .format(res.pi_tss)) - elif not os.path.exists(res.pi_body): + .format(res.ensembl_tss)) + elif not os.path.exists(res.ensembl_gene_body): print("Skipping PI -- Pause index requires 'gene body' annotation file: {}" - .format(res.pi_body)) + .format(res.ensembl_gene_body)) else: pm.timestamp("### Calculate Pause Index (PI)") - - # Remove missing chr from PI annotations - tss_local = os.path.join(QC_folder, - args.genome_assembly + "_PI_TSS.bed") - body_local = os.path.join(QC_folder, - args.genome_assembly + "_PI_body.bed") - cmd1 = ("grep -wf " + chr_keep + " " + res.pi_tss + " | " + - tools.bedtools + " sort -i stdin -faidx " + chr_order + - " > " + tss_local) - cmd2 = ("grep -wf " + chr_keep + " " + res.pi_body + " | " + - tools.bedtools + " sort -i stdin -faidx " + chr_order + - " > " + body_local) - pm.run([cmd1,cmd2], [tss_local, body_local], nofail=True) - pm.clean_add(tss_local) - pm.clean_add(body_local) - - # Determine coverage of highest scoring TSS - TSS_density = os.path.join(QC_folder, args.sample_name + - "_TSS_density.bed") - cmd = (tools.bedtools + " coverage -sorted -counts -s -a " + - tss_local + " -b " + mapping_genome_bam + - " -g " + chr_order + " | awk '$7>0' | " + - "sort -k4,4 -k7,7nr | " + - "sort -k4,4 -u > " + TSS_density) - pm.run(cmd, TSS_density, nofail=True) - pm.clean_add(TSS_density) - - # Determine coverage of gene body - body_density = os.path.join(QC_folder, args.sample_name + - "_gene_body_density.bed") - cmd = (tools.bedtools + " coverage -sorted -counts -s -a " + - body_local + " -b " + mapping_genome_bam + - " -g " + chr_order + " | awk '$7>0' | " + - "sort -k4 > " + body_density) - pm.run(cmd, body_density, nofail=True) - pm.clean_add(body_density) - - # Determine pause index pause_index = 
os.path.join(QC_folder, args.sample_name + - "_pause_index.txt") - cmd = ("join -j4 -o 1.1 1.2 1.3 1.4 1.6 1.7 2.2 2.3 2.7 " + - TSS_density + " " + body_density + - " | awk '{print ($6/($3-$2))/($9/($8-$7))}' > " + - pause_index) - pm.run(cmd, pause_index, nofail=True) - pm.clean_add(pause_index) - - # Number of reads / window length - cmd = ("sort -n " + pause_index + - " | awk ' { a[i++]=$1; } END " + - "{ x=int((i+1)/2); if (x < (i+1)/2) " + - "print (a[x-1]+a[x])/2; else print a[x-1]; }'") - val = pm.checkprint(cmd) - if val and val.strip(): - pi = float(val) - pm.report_result("Pause index", round(pi, 2)) + "_pause_index.bed") + if not pm.get_stat("Pause_index") or args.new_start: + # Remove missing chr from PI annotations + tss_local = os.path.join(QC_folder, + args.genome_assembly + "_ensembl_tss.bed") + body_local = os.path.join(QC_folder, + args.genome_assembly + "_ensembl_gene_body.bed") + cmd1 = ("grep -wf " + chr_keep + " " + res.ensembl_tss + " | " + + tools.bedtools + " sort -i stdin -faidx " + chr_order + + " > " + tss_local) + cmd2 = ("grep -wf " + chr_keep + " " + res.ensembl_gene_body + " | " + + tools.bedtools + " sort -i stdin -faidx " + chr_order + + " > " + body_local) + pm.run([cmd1,cmd2], [tss_local, body_local], nofail=True) + pm.clean_add(tss_local) + pm.clean_add(body_local) + + # Determine coverage of highest scoring TSS + TSS_density = os.path.join(QC_folder, args.sample_name + + "_TSS_density.bed") + cmd = (tools.bedtools + " coverage -sorted -counts -s -a " + + tss_local + " -b " + mapping_genome_bam + + " -g " + chr_order + " | awk '$7>0' | " + + "sort -k4,4 -k7,7nr | " + + "sort -k4,4 -u > " + TSS_density) + pm.run(cmd, TSS_density, nofail=True) + pm.clean_add(TSS_density) + + # Determine coverage of gene body + body_density = os.path.join(QC_folder, args.sample_name + + "_gene_body_density.bed") + cmd = (tools.bedtools + " coverage -sorted -counts -s -a " + + body_local + " -b " + mapping_genome_bam + + " -g " + chr_order + " | 
awk '$7>0' | " + + "sort -k4 > " + body_density) + pm.run(cmd, body_density, nofail=True) + pm.clean_add(body_density) + + # Determine pause index + cmd = ("join --nocheck-order -j4 -o 1.1 1.2 1.3 1.4 1.6 1.7 2.2 2.3 2.7 " + + TSS_density + " " + body_density + + " | awk -v OFS='\t' '{print $1, $2, $3, $4, ($6/($3-$2))" + + "/($9/($8-$7)), $5}' | env LC_COLLATE=C sort -k1,1 -k2,2n > " + + pause_index) + pm.run(cmd, pause_index, nofail=True) + + # Median pause index + cmd = ("sort -k5,5n " + pause_index + + " | awk ' { a[i++]=$5; } END " + + "{ x=int((i+1)/2); if (x < (i+1)/2) " + + "print (a[x-1]+a[x])/2; else print a[x-1]; }'") + val = pm.checkprint(cmd) + if val and val.strip(): + pi = float(val) + pm.report_result("Pause_index", round(pi, 2)) # Plot pause index distribution pi_pdf = os.path.join(QC_folder, args.sample_name + @@ -2384,79 +2988,91 @@ def count_unmapped_reads(): pi_png = os.path.join(QC_folder, args.sample_name + "_pause_index.png") cmd = (tools.Rscript + " " + tool_path("PEPPRO.R") + - " pi -i " + pause_index) + " pi --annotate -i " + pause_index) pm.run(cmd, pi_pdf, nofail=True) pm.report_object("Pause index", pi_pdf, anchor_image=pi_png) + if not is_gzipped(pause_index): + cmd = (ngstk.ziptool + " -f " + pause_index) + pause_index = pause_index + ".gz" + pm.run(cmd, pause_index) + ############################################################################ # Calculate Fraction of Reads in Pre-mRNA (FRiP) # ############################################################################ - if not os.path.exists(res.pre_mRNA_annotation): - print("Skipping FRiP -- Fraction of reads in pre-mRNA requires " + signal_folder = os.path.join( + param.outfolder, "signal_" + args.genome_assembly) + ngstk.make_dir(signal_folder) + + if not os.path.exists(res.refgene_pre_mRNA): + print("Skipping FRiP and gene coverage calculation which require the " "pre-mRNA annotation file: {}" - .format(res.pre_mRNA_annotation)) + .format(res.refgene_pre_mRNA)) else: - 
pm.timestamp("### Calculate FRiP") - # Plus - plus_frip = calc_frip(plus_bam, res.pre_mRNA_annotation, - frip_func=ngstk.simple_frip, - pipeline_manager=pm) - pm.report_result("Plus FRiP", round(plus_frip, 2)) - # Minus - minus_frip = calc_frip(minus_bam, res.pre_mRNA_annotation, - frip_func=ngstk.simple_frip, - pipeline_manager=pm) - pm.report_result("Minus FRiP", round(minus_frip, 2)) + pm.timestamp("### Calculate Fraction of Reads in pre-mature mRNA") + if not pm.get_stat('Plus_FRiP') or args.new_start: + # Plus + plus_frip = calc_frip(plus_bam, res.refgene_pre_mRNA, + frip_func=ngstk.simple_frip, + pipeline_manager=pm) + pm.report_result("Plus_FRiP", round(plus_frip, 2)) + + if not pm.get_stat('Minus_FRiP') or args.new_start: + # Minus + minus_frip = calc_frip(minus_bam, res.refgene_pre_mRNA, + frip_func=ngstk.simple_frip, + pipeline_manager=pm) + pm.report_result("Minus_FRiP", round(minus_frip, 2)) + + # Calculate gene coverage + gene_cov = os.path.join(signal_folder, + args.sample_name + "_gene_coverage.bed") + gene_sort = os.path.join(QC_folder, args.genome_assembly + + "_gene_sort.bed") + cmd1 = ("grep -wf " + chr_keep + " " + res.refgene_pre_mRNA + + " | " + tools.bedtools + " sort -i stdin -faidx " + + chr_order + " > " + gene_sort) + cmd2 = (tools.bedtools + " coverage -sorted -counts -s -a " + + gene_sort + " -b " + mapping_genome_bam + + " -g " + chr_order + " > " + gene_cov) + pm.run([cmd1, cmd2], [gene_sort, gene_cov]) + pm.clean_add(gene_sort) ############################################################################ - # Plot fragment distribution (for SE data) # + # Plot fragment distribution (for PE data) # ############################################################################ - if args.paired_end: - pm.timestamp("### Plot fragment distribution") - frag_len = os.path.join(QC_folder, args.sample_name + "_fragLen.txt") - frag_dist_tool = tool_path("fragment_length_dist.pl") - cmd = build_command([tools.perl, frag_dist_tool, - 
mapping_genome_bam, frag_len]) - - fragL_counts_file = args.sample_name + "_fragCount.txt" - fragL_count = os.path.join(QC_folder, fragL_counts_file) - cmd1 = "sort -n " + frag_len + " | uniq -c > " + fragL_count - - fragL_dis1 = os.path.join(QC_folder, args.sample_name + - "_fragLenDistribution.pdf") - fragL_dis2 = os.path.join(QC_folder, args.sample_name + - "_fragLenDistribution.txt") - - cmd2 = (tools.Rscript + " " + tool_path("PEPPRO.R")) - cmd2 += " frag -l " + frag_len + " -c " + fragL_count - cmd2 += " -p " + fragL_dis1 + " -t " + fragL_dis2 - - pm.run([cmd, cmd1, cmd2], fragL_dis1, nofail=True) - pm.clean_add(frag_len) - pm.clean_add(fragL_count) - - fragL_png = os.path.join(QC_folder, args.sample_name + - "_fragLenDistribution.png") - pm.report_object("Fragment distribution", fragL_dis1, - anchor_image=fragL_png) - else: - pm.timestamp("### Plot adapter insertion distribution") - if not args.adapter == "cutadapt": - print("Skipping sample degradation plotting...") - print("This requires the use of 'cutadapt' for adapter clipping.") - elif not os.path.exists(adapter_report): - print("Skipping sample degradation plotting...") - print("Could not find {}.`".format(adapter_report)) - else: - degradation_pdf = os.path.join(QC_folder, - args.sample_name + "_adapter_insertion_distribution.pdf") - degradation_png = os.path.join(QC_folder, - args.sample_name + "_adapter_insertion_distribution.pdf") - cmd = (tools.Rscript + " " + tool_path("PEPPRO.R") + - " cutadapt -i " + adapter_report + " -o " + QC_folder) - pm.run(cmd, degradation_pdf, nofail=True) - pm.report_object("Adapter insertion distribution", degradation_pdf, - anchor_image=degradation_png) + # DEPRECATED + # if args.paired_end: + # pm.timestamp("### Plot fragment distribution") + # frag_len = os.path.join(QC_folder, args.sample_name + "_fragLen.txt") + # frag_dist_tool = tool_path("fragment_length_dist.pl") + # cmd = build_command([tools.perl, frag_dist_tool, + # mapping_genome_bam, frag_len]) + + # 
fragL_counts_file = args.sample_name + "_fragCount.txt" + # fragL_count = os.path.join(QC_folder, fragL_counts_file) + # cmd1 = "sort -n " + frag_len + " | uniq -c > " + fragL_count + + # fragL_dis1 = os.path.join(QC_folder, args.sample_name + + # "_fragLenDistribution.pdf") + # fragL_dis2 = os.path.join(QC_folder, args.sample_name + + # "_fragLenDistribution.txt") + + # cmd2 = (tools.Rscript + " " + tool_path("PEPPRO.R")) + # cmd2 += " frag -l " + frag_len + " -c " + fragL_count + # cmd2 += " -p " + fragL_dis1 + " -t " + fragL_dis2 + + # pm.run([cmd, cmd1, cmd2], fragL_dis1, nofail=True) + # pm.clean_add(frag_len) + # pm.clean_add(fragL_count) + + # fragL_png = os.path.join(QC_folder, args.sample_name + + # "_fragLenDistribution.png") + # pm.report_object("Fragment distribution", fragL_dis1, + # anchor_image=fragL_png) + # else: + # pass + # # Used to plot adapter distribution here, but moved to cutadapt. ############################################################################ # Extract genomic features # @@ -2488,214 +3104,406 @@ def count_unmapped_reads(): .format(args.genome_assembly)) print("Could not find {}.`" .format(str(os.path.dirname(res.feat_annotation)))) + elif os.path.exists(anno_zip) or args.new_start: + cmd = (ngstk.ziptool + " -d -c " + anno_zip + + " > " + anno_local) + pm.run(cmd, anno_local) + pm.clean_add(anno_local) ############################################################################ # Determine genomic feature coverage # ############################################################################ - pm.timestamp("### Calculate fraction of reads in features (FRiF)") - - frif_plus_PDF = os.path.join(QC_folder, args.sample_name + "_plus_frif.pdf") - frif_plus_PNG = os.path.join(QC_folder, args.sample_name + "_plus_frif.png") - frif_minus_PDF = os.path.join(QC_folder, - args.sample_name + "_minus_frif.pdf") - frif_minus_PNG = os.path.join(QC_folder, - args.sample_name + "_minus_frif.png") - - if not os.path.exists(frif_plus_PDF) or 
args.new_start: + pm.timestamp("### Calculate fraction and proportion of reads in features (FRiF/PRiF)") + + frif_PDF = os.path.join(QC_folder, args.sample_name + "_frif.pdf") + frif_PNG = os.path.join(QC_folder, args.sample_name + "_frif.png") + # frif_plus_PDF = os.path.join(QC_folder, args.sample_name + "_plus_frif.pdf") + # frif_plus_PNG = os.path.join(QC_folder, args.sample_name + "_plus_frif.png") + # frif_minus_PDF = os.path.join(QC_folder, + # args.sample_name + "_minus_frif.pdf") + # frif_minus_PNG = os.path.join(QC_folder, + # args.sample_name + "_minus_frif.png") + + # Proportion of Reads in Feature (PRiF) + prif_PDF = os.path.join(QC_folder, args.sample_name + "_prif.pdf") + prif_PNG = os.path.join(QC_folder, args.sample_name + "_prif.png") + # prif_plus_PDF = os.path.join(QC_folder, args.sample_name + "_plus_prif.pdf") + # prif_plus_PNG = os.path.join(QC_folder, args.sample_name + "_plus_prif.png") + # prif_minus_PDF = os.path.join(QC_folder, + # args.sample_name + "_minus_prif.pdf") + # prif_minus_PNG = os.path.join(QC_folder, + # args.sample_name + "_minus_prif.png") + + if not os.path.exists(frif_PDF) or args.new_start: + anno_files = list() anno_list_plus = list() anno_list_minus = list() if os.path.isfile(anno_local): # Get list of features - cmd1 = ("cut -f 4 " + anno_local + " | sort -u") + if args.prioritize: + cmd1 = ("cut -f 4 " + anno_local + " | uniq") + else: + cmd1 = ("cut -f 4 " + anno_local + " | sort -u") ft_list = pm.checkprint(cmd1, shell=True) ft_list = ft_list.splitlines() # Split annotation file on features cmd2 = ("awk -F'\t' '{print>\"" + QC_folder + "/\"$4}' " + anno_local) - if len(ft_list) >= 1: - for pos, anno in enumerate(ft_list): - # working files - anno_file = os.path.join(QC_folder, str(anno)) - valid_name = str(re.sub('[^\w_.)( -]', '', anno).strip().replace(' ', '_')) - file_name = os.path.join(QC_folder, valid_name) - anno_sort = os.path.join(QC_folder, - valid_name + "_sort.bed") - anno_cov_plus = 
os.path.join(QC_folder, - args.sample_name + "_" + - valid_name + "_plus_coverage.bed") - anno_cov_minus = os.path.join(QC_folder, - args.sample_name + "_" + - valid_name + - "_minus_coverage.bed") - - # Extract feature files - pm.run(cmd2, anno_file) - - # Rename files to valid file_names - # Avoid 'mv' "are the same file" error - if not os.path.exists(file_name): - cmd = 'mv "{old}" "{new}"'.format(old=anno_file, - new=file_name) - pm.run(cmd, file_name) - - # Sort files (ensure only aligned chromosomes are kept) - cmd3 = ("cut -f 1 " + chr_order + " | grep -wf - " + - file_name + " | cut -f 1-3 | " + - "bedtools sort -i stdin -faidx " + - chr_order + " > " + anno_sort) - pm.run(cmd3, anno_sort) - - anno_list_plus.append(anno_cov_plus) - anno_list_minus.append(anno_cov_minus) - cmd4 = (tools.bedtools + " coverage -sorted -counts -a " + - anno_sort + " -b " + plus_bam + - " -g " + chr_order + " > " + - anno_cov_plus) - cmd5 = (tools.bedtools + " coverage -sorted -counts -a " + - anno_sort + " -b " + minus_bam + - " -g " + chr_order + " > " + - anno_cov_minus) - pm.run(cmd4, anno_cov_plus) - pm.run(cmd5, anno_cov_minus) - - pm.clean_add(file_name) - pm.clean_add(anno_sort) - pm.clean_add(anno_cov_plus) - pm.clean_add(anno_cov_minus) + + if args.prioritize: + if len(ft_list) >= 1: + for pos, anno in enumerate(ft_list): + # working files + anno_file = os.path.join(QC_folder, str(anno)) + valid_name = str(re.sub('[^\w_.)( -]', '', anno).strip().replace(' ', '_')) + file_name = os.path.join(QC_folder, valid_name) + anno_sort = os.path.join(QC_folder, + valid_name + "_sort.bed") + anno_cov_plus = os.path.join(QC_folder, + args.sample_name + "_" + + valid_name + + "_plus_coverage.bed") + anno_cov_minus = os.path.join(QC_folder, + args.sample_name + "_" + + valid_name + + "_minus_coverage.bed") + + # Extract feature files + pm.run(cmd2, anno_file) + + # Rename files to valid file_names + # Avoid 'mv' "are the same file" error + if not os.path.exists(file_name): + cmd = 
'mv "{old}" "{new}"'.format(old=anno_file, + new=file_name) + pm.run(cmd, file_name) + + # Sort files (ensure only aligned chromosomes are kept) + # Need to cut -f 1-6 if you want strand information + # Not all features are stranded + # TODO: check for strandedness (*only works on some features) + cmd3 = ("cut -f 1 " + chr_order + " | grep -wf - " + + file_name + " | cut -f 1-3 | " + + "bedtools sort -i stdin -faidx " + + chr_order + " | bedtools merge -i stdin > " + + anno_sort) + # for future stranded possibilities include for merge + # "-c 4,5,6 -o collapse,collapse,collapse > " + + pm.run(cmd3, anno_sort) + + anno_files.append(anno_sort) + anno_list_plus.append(anno_cov_plus) + anno_list_minus.append(anno_cov_minus) + + pm.clean_add(file_name) + pm.clean_add(anno_sort) + pm.clean_add(anno_cov_plus) + pm.clean_add(anno_cov_minus) + + # Iteratively prioritize annotations by order presented + anno_files.reverse() + if len(anno_files) >= 1: + idx = list(range(0,len(anno_files))) + #idx.reverse() + file_count = 1 + for annotation in anno_files: + del idx[0] + if file_count < len(anno_files): + file_count += 1 + for i in idx: + if annotation is not anno_files[i]: + os.path.join(QC_folder) + temp = tempfile.NamedTemporaryFile(dir=QC_folder, delete=False) + #os.chmod(temp.name, 0o771) + cmd1 = ("bedtools subtract -a " + + annotation + " -b " + + anno_files[i] + " > " + + temp.name) + cmd2 = ("mv " + temp.name + + " " + annotation) + pm.run([cmd1, cmd2], frif_PDF) + temp.close() + + anno_list_plus.reverse() + anno_list_minus.reverse() + if len(anno_files) >= 1: + for idx, annotation in enumerate(anno_files): + # Identifies unstranded coverage + # Would need to use '-s' flag to be stranded + if _itsa_file(annotation): + cmd4 = (tools.bedtools + + " coverage -sorted -a " + + annotation + " -b " + plus_bam + + " -g " + chr_order + " > " + + anno_list_plus[idx]) + pm.run(cmd4, frif_PDF) + if _itsa_file(annotation): + cmd5 = (tools.bedtools + + " coverage -sorted -a " + + 
annotation + " -b " + minus_bam + + " -g " + chr_order + " > " + + anno_list_minus[idx]) + pm.run(cmd5, frif_PDF) + else: + if len(ft_list) >= 1: + for pos, anno in enumerate(ft_list): + # working files + anno_file = os.path.join(QC_folder, str(anno)) + valid_name = str(re.sub('[^\w_.)( -]', '', anno).strip().replace(' ', '_')) + file_name = os.path.join(QC_folder, valid_name) + anno_sort = os.path.join(QC_folder, + valid_name + "_sort.bed") + anno_cov_plus = os.path.join(QC_folder, + args.sample_name + "_" + + valid_name + + "_plus_coverage.bed") + anno_cov_minus = os.path.join(QC_folder, + args.sample_name + "_" + + valid_name + + "_minus_coverage.bed") + + # Extract feature files + pm.run(cmd2, anno_file) + + # Rename files to valid file_names + # Avoid 'mv' "are the same file" error + if not os.path.exists(file_name): + cmd = 'mv "{old}" "{new}"'.format(old=anno_file, + new=file_name) + pm.run(cmd, file_name) + + # Sort files (ensure only aligned chromosomes are kept) + # Need to cut -f 1-6 if you want strand information + # Not all features are stranded + # TODO: check for strandedness + cmd3 = ("cut -f 1 " + chr_order + " | grep -wf - " + + file_name + " | cut -f 1-3 | " + + "bedtools sort -i stdin -faidx " + + chr_order + " > " + anno_sort) + pm.run(cmd3, anno_sort) + + anno_list_plus.append(anno_cov_plus) + anno_list_minus.append(anno_cov_minus) + # Identifies unstranded coverage + # Would need to use '-s' flag to be stranded + cmd4 = (tools.bedtools + " coverage -sorted " + + " -a " + anno_sort + " -b " + plus_bam + + " -g " + chr_order + " > " + + anno_cov_plus) + cmd5 = (tools.bedtools + " coverage -sorted " + + " -a " + anno_sort + " -b " + minus_bam + + " -g " + chr_order + " > " + + anno_cov_minus) + pm.run(cmd4, anno_cov_plus) + pm.run(cmd5, anno_cov_minus) + + pm.clean_add(file_name) + pm.clean_add(anno_sort) + pm.clean_add(anno_cov_plus) + pm.clean_add(anno_cov_minus) ############################################################################ # 
Plot FRiF # ############################################################################ - pm.timestamp("### Plot FRiF") + pm.timestamp("### Plot FRiF/PRiF") + # Plus - if not os.path.exists(frif_plus_PDF) or args.new_start: - count_cmd = (tools.samtools + " view -@ " + str(pm.cores) + " " + - param.samtools.params + " -c -F4 " + plus_bam) + if not os.path.exists(frif_PDF) or args.new_start: + if args.prioritize: + # Count bases, not reads + # return to original priority ranked order + anno_list_plus.reverse() + anno_list_minus.reverse() + count_cmd = (tools.bedtools + " genomecov -ibam " + plus_bam + + " -bg | awk '{sum+=($3-$2)}END{print sum}'") + else: + # Count reads + count_cmd = (tools.samtools + " view -@ " + str(pm.cores) + " " + + param.samtools.params + " -c -F4 " + plus_bam) + plus_read_count = pm.checkprint(count_cmd) plus_read_count = str(plus_read_count).rstrip() frif_cmd = [tools.Rscript, tool_path("PEPPRO.R"), "frif", - "-n", args.sample_name, "-r", plus_read_count, - "-o", frif_plus_PDF, "--bed"] - if anno_list_plus: - for cov in anno_list_plus: - frif_cmd.append(cov) - cmd = build_command(frif_cmd) - pm.run(cmd, frif_plus_PDF, nofail=False) - pm.report_object("Plus FRiF", frif_plus_PDF, - anchor_image=frif_plus_PNG) + "-s", args.sample_name, "-z", str(genome_size).rstrip(), + "-n", plus_read_count, "-y", "frif"] - # Minus - if not os.path.exists(frif_minus_PDF) or args.new_start: - count_cmd = (tools.samtools + " view -@ " + str(pm.cores) + " " + - param.samtools.params + " -c -F4 " + minus_bam) - minus_read_count = pm.checkprint(count_cmd) - minus_read_count = str(minus_read_count).rstrip() + prif_cmd = [tools.Rscript, tool_path("PEPPRO.R"), "frif", + "-s", args.sample_name, "-z", str(genome_size).rstrip(), + "-n", plus_read_count, "-y", "prif"] - frif_cmd = [tools.Rscript, tool_path("PEPPRO.R"), "frif", - "-n", args.sample_name, "-r", minus_read_count, - "-o", frif_minus_PDF, "--bed"] - if anno_list_minus: - for cov in anno_list_minus: - 
frif_cmd.append(cov) + if not args.prioritize: + # Use reads for calculation + frif_cmd.append("--reads") + prif_cmd.append("--reads") + + frif_cmd.append("-o") + frif_cmd.append(frif_PDF) + frif_cmd.append("--bed") + + prif_cmd.append("-o") + prif_cmd.append(prif_PDF) + prif_cmd.append("--bed") + + if anno_list_plus: + for cov in anno_list_plus: + if _itsa_file(cov): + frif_cmd.append(cov) + prif_cmd.append(cov) cmd = build_command(frif_cmd) - pm.run(cmd, frif_minus_PDF, nofail=False) - pm.report_object("Minus FRiF", frif_minus_PDF, - anchor_image=frif_minus_PNG) + pm.run(cmd, frif_PDF, nofail=False) + # pm.report_object("Plus FRiF", frif_plus_PDF, + # anchor_image=frif_plus_PNG) + pm.report_object("FRiF", frif_PDF, anchor_image=frif_PNG) + + cmd = build_command(prif_cmd) + pm.run(cmd, prif_PDF, nofail=False) + # pm.report_object("Plus PRiF", prif_plus_PDF, + # anchor_image=prif_plus_PNG) + pm.report_object("PRiF", prif_PDF, anchor_image=prif_PNG) + + # Minus (unused as we currently use unstranded feature coverage calculation) + # if not os.path.exists(frif_minus_PDF) or args.new_start: + # count_cmd = (tools.samtools + " view -@ " + str(pm.cores) + " " + + # param.samtools.params + " -c -F4 " + minus_bam) + # minus_read_count = pm.checkprint(count_cmd) + # minus_read_count = str(minus_read_count).rstrip() + + # frif_cmd = [tools.Rscript, tool_path("PEPPRO.R"), "frif", + # "-n", args.sample_name, "-s", str(genome_size).rstrip(), + # "-r", minus_read_count, "-y", "frif", + # "-o", frif_minus_PDF, "--bed"] + + # prif_cmd = [tools.Rscript, tool_path("PEPPRO.R"), "frif", + # "-n", args.sample_name, "-s", str(genome_size).rstrip(), + # "-r", minus_read_count, "-y", "prif", + # "-o", prif_minus_PDF, "--bed"] + + # if anno_list_minus: + # for cov in anno_list_minus: + # frif_cmd.append(cov) + # prif_cmd.append(cov) + # cmd = build_command(frif_cmd) + # pm.run(cmd, frif_minus_PDF, nofail=False) + # pm.report_object("Minus FRiF", frif_minus_PDF, + # 
anchor_image=frif_minus_PNG) + # cmd = build_command(prif_cmd) + # pm.run(cmd, prif_minus_PDF, nofail=False) + # pm.report_object("Minus PRiF", prif_minus_PDF, + # anchor_image=prif_minus_PNG) ############################################################################ # Report mRNA contamination # ############################################################################ - if (os.path.exists(res.exon_annotation) and - os.path.exists(res.intron_annotation)): + if (os.path.exists(res.refgene_exon) and + os.path.exists(res.refgene_intron)): pm.timestamp("### Calculate mRNA contamination") - - # Sort exons and introns - exons_sort = os.path.join(QC_folder, args.genome_assembly + - "_exons_sort.bed") - introns_sort = os.path.join(QC_folder, args.genome_assembly + - "_introns_sort.bed") - cmd1 = ("grep -wf " + chr_keep + " " + res.exon_annotation + - " | " + tools.bedtools + " sort -i stdin -faidx " + - chr_order + " > " + exons_sort) - # a single sort fails to sort a 1 bp different start position intron - cmd2 = ("grep -wf " + chr_keep + " " + res.intron_annotation + - " | " + tools.bedtools + " sort -i stdin -faidx " + - chr_order + " | " + tools.bedtools + " sort -i stdin -faidx " + - chr_order + " > " + introns_sort) - pm.run([cmd1, cmd2], [exons_sort, introns_sort], nofail=True) - pm.clean_add(exons_sort) - pm.clean_add(introns_sort) - - # Determine coverage of exons/introns - exons_cov = os.path.join(QC_folder, args.sample_name + - "_exons_coverage.bed") - introns_cov = os.path.join(QC_folder, args.sample_name + - "_introns_coverage.bed") - cmd1 = (tools.bedtools + " coverage -sorted -counts -s -a " + - exons_sort + " -b " + mapping_genome_bam + - " -g " + chr_order + " > " + exons_cov) - cmd2 = (tools.bedtools + " coverage -sorted -counts -s -a " + - introns_sort + " -b " + mapping_genome_bam + - " -g " + chr_order + " > " + introns_cov) - pm.run([cmd1, cmd2], [exons_cov, introns_cov], nofail=True) - pm.clean_add(exons_cov) - pm.clean_add(introns_cov) - - # 
need Total Reads divided by 1M - ar = float(pm.get_stat("Aligned_reads")) - scaling_factor = float(ar/1000000) - - exons_rpkm = os.path.join(QC_folder, args.sample_name + - "_exons_rpkm.tsv") - introns_rpkm = os.path.join(QC_folder, args.sample_name + - "_introns_rpkm.tsv") - - # determine exonic RPKM for individual genes - if os.path.exists(exons_cov): - cmd = ("awk -v OFS='\t' '{readCount[$4] += $7; " + - "exonCount[$4] += 1; geneSizeKB[$4] += " + - "(sqrt(($3-$2+0.00000001)^2)/1000); " + - "gene[$4] = $4} END { for (a in readCount) " + - "{ print gene[a], (readCount[a]/" + - str(scaling_factor) + ")/geneSizeKB[a]}}' " + exons_cov + - " | awk '$2>0' | sort -k1 > " + exons_rpkm) - pm.run(cmd, exons_rpkm, nofail=True) - pm.clean_add(exons_rpkm) - - # determine intronic RPKM for individual genes - if os.path.exists(introns_cov): - cmd = ("awk -v OFS='\t' '{readCount[$4] += $7; " + - "exonCount[$4] += 1; geneSizeKB[$4] += " + - "(sqrt(($3-$2+0.00000001)^2)/1000); " + - "gene[$4] = $4} END { for (a in readCount) " + - "{ print gene[a], (readCount[a]/" + - str(scaling_factor) + ")/geneSizeKB[a]}}' " + introns_cov + - " | awk '$2>0' | sort -k1 > " + introns_rpkm) - pm.run(cmd, introns_rpkm, nofail=True) - pm.clean_add(introns_rpkm) - - # join intron, exon RPKM on gene name intron_exon = os.path.join(QC_folder, args.sample_name + - "_intron_exon.tsv") - if os.path.exists(exons_rpkm) and os.path.exists(introns_rpkm): - cmd = ("join -a1 -a2 -j1 -e0 -o 0 1.2 2.2 " + - introns_rpkm + " " + exons_rpkm + " > " + intron_exon) - pm.run(cmd, intron_exon, nofail=True) - pm.clean_add(intron_exon) - - # compare intron to exon RPKM and report result - if os.path.exists(intron_exon): - cmd = ("awk '$2>0' " + intron_exon + - " | awk '{print $3/$2}' | sort -n | awk ' { a[i++]=$1; }" + - " END { x=int((i+1)/2);" + - " if (x < (i+1)/2) print (a[x-1]+a[x])/2;" + - " else print a[x-1]; }'") - mrna_con = float(pm.checkprint(cmd)) - pm.report_result("mRNA contamination", round(mrna_con, 2)) + 
"_exon_intron_ratios.bed") + + if not pm.get_stat("mRNA_contamination") or args.new_start: + # Sort exons and introns + exons_sort = os.path.join(QC_folder, args.genome_assembly + + "_exons_sort.bed") + introns_sort = os.path.join(QC_folder, args.genome_assembly + + "_introns_sort.bed") + cmd1 = ("grep -wf " + chr_keep + " " + res.refgene_exon + + " | " + tools.bedtools + " sort -i stdin -faidx " + + chr_order + " > " + exons_sort) + # a single sort fails to sort a 1 bp different start position intron + cmd2 = ("grep -wf " + chr_keep + " " + res.refgene_intron + + " | " + tools.bedtools + " sort -i stdin -faidx " + + chr_order + " | " + tools.bedtools + " sort -i stdin -faidx " + + chr_order + " > " + introns_sort) + pm.run([cmd1, cmd2], [exons_sort, introns_sort], nofail=True) + pm.clean_add(exons_sort) + pm.clean_add(introns_sort) + + # Determine coverage of exons/introns + exons_cov = os.path.join(QC_folder, args.sample_name + + "_exons_coverage.bed") + introns_cov = os.path.join(QC_folder, args.sample_name + + "_introns_coverage.bed") + cmd1 = (tools.bedtools + " coverage -sorted -counts -s -a " + + exons_sort + " -b " + mapping_genome_bam + + " -g " + chr_order + " > " + exons_cov) + cmd2 = (tools.bedtools + " coverage -sorted -counts -s -a " + + introns_sort + " -b " + mapping_genome_bam + + " -g " + chr_order + " > " + introns_cov) + pm.run([cmd1, cmd2], [exons_cov, introns_cov], nofail=True) + pm.clean_add(exons_cov) + pm.clean_add(introns_cov) + + # need Total Reads divided by 1M + ar = float(pm.get_stat("Aligned_reads")) + scaling_factor = float(ar/1000000) + + exons_rpkm = os.path.join(QC_folder, args.sample_name + + "_exons_rpkm.bed") + introns_rpkm = os.path.join(QC_folder, args.sample_name + + "_introns_rpkm.bed") + + # determine exonic RPKM for individual genes + if os.path.exists(exons_cov): + cmd = ("awk -v OFS='\t' '{chrom[$4] = $1; " + + "if($4!=prev4) {chromStart[$4] = $2} " + + "strand[$4] = $6; " + + "readCount[$4] += $7; " + + "exonCount[$4] 
+= 1; " + + "geneSizeKB[$4] += (sqrt(($3-$2+0.00000001)^2)/1000); " + + "gene[$4] = $4; " + + "chromEnd[$4]=$3; " + + "prev4=$4} END " + + "{ for (a in readCount) " + + "{ print chrom[a], chromStart[a], chromEnd[a], gene[a], " + + "(readCount[a]/" + str(scaling_factor) + + ")/geneSizeKB[a], strand[a]}}' " + + exons_cov + " | awk '$5>0' | sort -k4 > " + + exons_rpkm) + pm.run(cmd, exons_rpkm, nofail=True) + pm.clean_add(exons_rpkm) + + # determine intronic RPKM for individual genes + if os.path.exists(introns_cov): + cmd = ("awk -v OFS='\t' '{chrom[$4] = $1; " + + "if($4!=prev4) {chromStart[$4] = $2} " + + "strand[$4] = $6; " + + "readCount[$4] += $7; " + + "exonCount[$4] += 1; " + + "geneSizeKB[$4] += (sqrt(($3-$2+0.00000001)^2)/1000); " + + "gene[$4] = $4; " + + "chromEnd[$4]=$3; " + + "prev4=$4} END " + + "{ for (a in readCount) " + + "{ print chrom[a], chromStart[a], chromEnd[a], gene[a], " + + "(readCount[a]/" + str(scaling_factor) + + ")/geneSizeKB[a], strand[a]}}' " + + introns_cov + " | awk '$5>0' | sort -k4 > " + + introns_rpkm) + pm.run(cmd, introns_rpkm, nofail=True) + pm.clean_add(introns_rpkm) + + # join intron, exon RPKM on gene name and calculate ratio + if os.path.exists(exons_rpkm) and os.path.exists(introns_rpkm): + cmd = ("join --nocheck-order -a1 -a2 -j4 " + + introns_rpkm + " " + exons_rpkm + " | " + + "awk -v OFS='\t' " + + "'NF==11 {print $7, $8, $9, $1, ($10/$5), $11}'" + + " | sort -k1,1 -k2,2n > " + intron_exon) + pm.run(cmd, intron_exon, nofail=True) + + # report median ratio + if os.path.exists(intron_exon): + cmd = ("awk '{print $5}' " + intron_exon + + " | sort -n | awk ' { a[i++]=$1; }" + + " END { x=int((i+1)/2);" + + " if (x < (i+1)/2) print (a[x-1]+a[x])/2;" + + " else print a[x-1]; }'") + mrna_con = float(pm.checkprint(cmd)) + pm.report_result("mRNA_contamination", round(mrna_con, 2)) # plot mRNA contamination distribution mRNApdf = os.path.join(QC_folder, @@ -2703,20 +3511,22 @@ def count_unmapped_reads(): mRNApng = 
os.path.join(QC_folder, args.sample_name + "_mRNA_contamination.png") mRNAplot = [tools.Rscript, tool_path("PEPPRO.R"), "mrna", - "-i", intron_exon, "--raw"] + "-i", intron_exon, "--raw", "--annotate"] cmd = build_command(mRNAplot) pm.run(cmd, mRNApdf, nofail=False) pm.report_object("mRNA contamination", mRNApdf, anchor_image=mRNApng) + if not is_gzipped(intron_exon): + cmd = (ngstk.ziptool + " -f " + intron_exon) + intron_exon = intron_exon + ".gz" + pm.run(cmd, intron_exon) + ############################################################################ - # Shift and produce BigWig's # + # Produce BigWigs # ############################################################################ - genome_fq = os.path.join(res.rgc.genome_folder, - args.genome_assembly, - (args.genome_assembly + ".fa")) - signal_folder = os.path.join( - param.outfolder, "signal_" + args.genome_assembly) - ngstk.make_dir(signal_folder) + genome_fq = rgc.get_asset(args.genome_assembly, + asset_name="fasta", + seek_key="fasta") plus_bw = os.path.join( signal_folder, args.sample_name + "_plus_body_0-mer.bw") minus_bw = os.path.join( @@ -2738,7 +3548,7 @@ def count_unmapped_reads(): cmd2 += " -o " + plus_bw # DEBUG formerly smoothed " -w " + plus_bw cmd2 += " -p " + str(int(max(1, int(pm.cores) * 2/3))) cmd2 += " --variable-step" - if args.runon.lower() == "pro": + if args.protocol.lower() in RUNON_SOURCE_PRO: cmd2 += " --tail-edge" pm.run([cmd1, cmd2], plus_bw) @@ -2749,114 +3559,35 @@ def count_unmapped_reads(): cmd4 += " -o " + minus_bw # DEBUG formerly smoothed " -w " + minus_bw cmd4 += " -p " + str(int(max(1, int(pm.cores) * 2/3))) cmd4 += " --variable-step" - if args.runon.lower() == "pro": + if args.protocol.lower() in RUNON_SOURCE_PRO: cmd4 += " --tail-edge" pm.run([cmd3, cmd4], minus_bw) else: print("Skipping signal track production -- Could not call \'wigToBigWig\'.") print("Check that you have the required UCSC tools in your PATH.") else: - # Need to run seqOutBias tallymer separately - # 
Do that in the $GENOMES folder, in a subfolder called "mappability" - # Only need to do that once for each read-size of interest - # default would be read-size 30 (args.max_len) - mappability_folder = os.path.join(res.rgc.genome_folder, - args.genome_assembly, - "mappability") - ngstk.make_dir(mappability_folder) - - # Link fasta file - genome_fq_ln = os.path.join(mappability_folder, - (args.genome_assembly + ".fa")) - if not os.path.isfile(genome_fq_ln): - cmd = "ln -sf " + genome_fq + " " + genome_fq_ln - pm.run(cmd, genome_fq_ln) - - if args.max_len != -1: - max_len = args.max_len - - suffix_index = os.path.join(mappability_folder, - (args.genome_assembly + ".sft")) - suffix_check = os.path.join(mappability_folder, - (args.genome_assembly + ".sft.suf")) - tally_index = os.path.join(mappability_folder, - (args.genome_assembly + ".tal_" + - str(max_len))) - tally_check = os.path.join(mappability_folder, - (args.genome_assembly + ".tal_" + - str(max_len) + ".mer")) - search_file = os.path.join(mappability_folder, - (args.genome_assembly + ".tal_" + - str(max_len) + ".gtTxt")) - - map_files = [suffix_check, tally_check, search_file] - already_mapped = False - - for file in map_files: - if os.path.isfile(file) and os.stat(file).st_size > 0: - already_mapped = True - else: - already_mapped = False - - if not already_mapped: - pm.timestamp("### Compute mappability information") - - suffix_cmd_chunks = [ - ("gt", "suffixerator"), - "-dna", - "-pl", - "-tis", - "-suf", - "-lcp", - "-v", - ("-parts", args.parts), - ("-db", genome_fq_ln), - ("-indexname", suffix_index) - ] - suffix_cmd = build_command(suffix_cmd_chunks) - pm.run(suffix_cmd, suffix_index) - - tally_cmd_chunks = [ - ("gt", "tallymer"), - "mkindex", - ("-mersize", max_len), - ("-minocc", 2), - ("-indexname", tally_index), - "-counts", - "-pl", - ("-esa", suffix_index) - ] - tally_cmd = build_command(tally_cmd_chunks) - pm.run(tally_cmd, tally_index) - - search_cmd_chunks = [ - ("gt", "tallymer"), - "search", 
- "-output", - ("qseqnum", "qpos"), - ("-strand", "fp"), - ("-tyr", tally_index), - ("-q", genome_fq_ln), - (">", search_file) - ] - search_cmd = build_command(search_cmd_chunks) - pm.run(search_cmd, search_file) + pm.debug("The max read length is {}".format(str(max_len))) + + # seqOutBias needs a working directory, we'll make that temporary + tempdir = tempfile.mkdtemp(dir=signal_folder) + os.chmod(tempdir, 0o771) + pm.clean_add(tempdir) pm.timestamp("### Use seqOutBias to produce bigWig files") - seqtable = os.path.join(res.genomes, args.genome_assembly, - mappability_folder, (args.genome_assembly + ".tbl")) + seqtable = os.path.join(signal_folder, (args.genome_assembly + ".tbl")) seqtable_cmd = build_command([ (tools.seqoutbias, "seqtable"), - genome_fq_ln, - str("--tallymer=" + search_file), - str("--gt-workdir=" + mappability_folder), # TODO - str("--read-size=" + max_len), + res.fasta, + str("--tallymer=" + res.search_file), + str("--gt-workdir=" + tempdir), + str("--read-size=" + str(max_len)), str("--out=" + seqtable) ]) pm.run(seqtable_cmd, seqtable) + pm.clean_add(seqtable) plus_table = os.path.join( signal_folder, (args.genome_assembly + "_plus_tbl.txt")) @@ -2890,7 +3621,7 @@ def count_unmapped_reads(): "--skip-bed", str("--bw=" + plus_bw) ] - if args.runon.lower() == "pro": + if args.protocol.lower() in RUNON_SOURCE_PRO: scale_plus_chunks.extend([("--tail-edge")]) scale_plus_cmd = build_command(scale_plus_chunks) @@ -2901,7 +3632,7 @@ def count_unmapped_reads(): "--skip-bed", str("--bw=" + minus_bw), ] - if args.runon.lower() == "pro": + if args.protocol.lower() in RUNON_SOURCE_PRO: scale_minus_chunks.extend([("--tail-edge")]) scale_minus_cmd = build_command(scale_minus_chunks) else: @@ -2913,7 +3644,7 @@ def count_unmapped_reads(): "--skip-bed", str("--bw=" + plus_bw) ] - if args.runon.lower() == "pro": + if args.protocol.lower() in RUNON_SOURCE_PRO: scale_plus_chunks.extend([("--tail-edge")]) scale_plus_cmd = build_command(scale_plus_chunks) @@ 
-2925,12 +3656,20 @@ def count_unmapped_reads(): "--skip-bed", str("--bw=" + minus_bw), ] - if args.runon.lower() == "pro": + if args.protocol.lower() in RUNON_SOURCE_PRO: scale_minus_chunks.extend([("--tail-edge")]) scale_minus_cmd = build_command(scale_minus_chunks) pm.run([scale_plus_cmd, scale_minus_cmd], minus_bw) + # Remove potentially empty folders + if os.path.exists(raw_folder) and os.path.isdir(raw_folder): + if not os.listdir(raw_folder): + pm.clean_add(raw_folder) + if os.path.exists(fastqc_folder) and os.path.isdir(fastqc_folder): + if not os.listdir(fastqc_folder): + pm.clean_add(fastqc_folder) + ############################################################################ # PIPELINE COMPLETE! # ############################################################################ diff --git a/pipelines/peppro.yaml b/pipelines/peppro.yaml index 5b65dc9..965416f 100644 --- a/pipelines/peppro.yaml +++ b/pipelines/peppro.yaml @@ -8,8 +8,10 @@ tools: # absolute paths to required tools seqkit: seqkit fastp: fastp cutadapt: cutadapt + flash: flash seqtk: seqtk fastqpair: fastq_pair + picard: /apps/software/standard/core/picard/2.18.5/picard.jar # UCSC tools bigWigCat: bigWigCat wigToBigWig: wigToBigWig diff --git a/requirements.txt b/requirements.txt index ad7b76d..d8b2383 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ cutadapt loopercli numpy -pararead pandas +pararead piper refgenconf refgenie diff --git a/tools/PEPPRO.R b/tools/PEPPRO.R index 3a4f6fa..3630b31 100755 --- a/tools/PEPPRO.R +++ b/tools/PEPPRO.R @@ -3,7 +3,7 @@ # PEPPRO R parser ############################################################################### -version <- 0.5 +version <- 0.6 ##### Load dependencies ##### required_libraries <- c("PEPPROr") @@ -46,10 +46,11 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { "Command: preseq\t\t plot preseq complexity curves\n", "\t frif\t\t plot fraction of reads in features\n", "\t tss\t\t plot TSS enrichment\n", - "\t frag\t\t plot 
fragment length distribution\n", + "\t frag\t\t plot PE fragment length distribution\n", "\t mrna\t\t plot mRNA contamination distribution\n", "\t pi\t\t plot pause indicies distribution\n", - "\t cutadapt\t plot adapter insertion distribution\n" + "\t cutadapt\t plot cutadapt-based adapter insertion distribution\n", + "\t adapt\t plot generalized adapter insertion distribution\n" ) message(usage) } else if (!is.na(subcmd) && tolower(subcmd) == "preseq") { @@ -165,15 +166,30 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { ignore_unique = ign_unique, x_min = x_min, x_max = x_max) - # now save the plot - pdf(file = paste0(tools::file_path_sans_ext(output_name), ".pdf"), - width= 10, height = 7, useDingbats=F) - print(fig) - invisible(dev.off()) - png(filename = paste0(tools::file_path_sans_ext(output_name), ".png"), - width = 686, height = 480) - print(fig) - invisible(dev.off()) + + if (length(input) == 1) { + # now save the plot + pdf(file = paste0(tools::file_path_sans_ext(output_name), ".pdf"), + height = 4, width = 4.25, useDingbats=F) + suppressWarnings(print(p)) + invisible(dev.off()) + + png(filename = paste0(tools::file_path_sans_ext(output_name), ".png"), + height = 275, width = 300) + suppressWarnings(print(p)) + invisible(dev.off()) + } else { + # now save the plot + pdf(file = paste0(tools::file_path_sans_ext(output_name), ".pdf"), + height = 5, width = 6, useDingbats=F) + suppressWarnings(print(p)) + invisible(dev.off()) + + png(filename = paste0(tools::file_path_sans_ext(output_name), ".png"), + height = 315, width = 425) + suppressWarnings(print(p)) + invisible(dev.off()) + } if (exists("p")) { write("Library complexity plot completed!\n", stdout()) @@ -187,8 +203,10 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { "Usage: PEPPRO.R [command] {args}\n", "Version: ", version, "\n\n", "Command: frif \t plot fraction of reads in features\n\n", - " -n, --sample_name\t Sample name.\n", - " -r, --reads\t\t Number of mapped reads.\n", + " -s, --sample_name\t 
Sample name.\n", + " -n, --num_reads\t\t Number of mapped reads.\n", + " -z, --size\t\t Size of genome (bp).\n", + " -y, --type\t Choose plot type: FRiF, PRiF, or Both.\n", " -o, --output_name\t Output file name.\n", " -b, --bed\t\t Coverage file(s).\n" ) @@ -204,35 +222,60 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { message(usage) quit() } else { - sample_name <- opt_get(name = c("sample_name", "n"), required=TRUE, + sample_name <- opt_get(name = c("sample_name", "s"), required=TRUE, description="Sample name.") - reads <- opt_get(name = c("reads", "r"), required=TRUE, - description="Number of mapped reads.") + num_reads <- opt_get(name = c("num_reads", "n"), required=TRUE, + description="Number of mapped reads (or bases).") + genome_size <- opt_get(name = c("size", "z"), required=TRUE, + description="Size of genome (bp).") + type <- opt_get(name = c("type", "y"), required=FALSE, default="frif", + description="Choose plot type: FRiF, PRiF, or Both (Default = frif).") + reads <- opt_get(name = c("reads", "r"), required=FALSE, default=FALSE, + description="Calculate using reads (TRUE) or bases (FALSE) (Default = FALSE).") output_name <- opt_get(name = c("output_name", "o"), required=TRUE, description="Output file name.") numArgs <- length(opt_get_args()) + #message(numArgs) + argGap <- ifelse(reads, 13, 12) + #message(argGap) bed <- opt_get(name = c("bed", "b"), required=TRUE, - n=(numArgs - 8), + n=(numArgs - argGap), description="Coverage file(s).") + #message(paste0("\nbed: ", bed)) p <- plotFRiF(sample_name = sample_name, - num_reads = reads, + num_reads = as.numeric(num_reads), + genome_size = as.numeric(genome_size), + type = tolower(type), + reads = reads, output_name = output_name, bedFile = bed) - pdf(file = paste0(tools::file_path_sans_ext(output_name), ".pdf"), - width= 7, height = 7, useDingbats=F) - print(p) - invisible(dev.off()) - png(filename = paste0(tools::file_path_sans_ext(output_name), ".png"), - width = 480, height = 480) - print(p) - 
invisible(dev.off()) + if (tolower(type) == "both") { + pdf(file = paste0(tools::file_path_sans_ext(output_name), ".pdf"), + height = 5.45, width = 8.39, useDingbats=F) + suppressWarnings(print(p)) + invisible(dev.off()) + png(filename = paste0(tools::file_path_sans_ext(output_name), ".png"), + height = 550, width=850) + suppressWarnings(print(p)) + invisible(dev.off()) + } else { + pdf(file = paste0(tools::file_path_sans_ext(output_name), ".pdf"), + height = 4, width = 4, useDingbats=F) + suppressWarnings(print(p)) + invisible(dev.off()) + png(filename = paste0(tools::file_path_sans_ext(output_name), ".png"), + height = 275, width=275) + suppressWarnings(print(p)) + invisible(dev.off()) + } + if (exists("p")) { - write("Cumulative FRiF plot completed!\n", stdout()) + write(paste0("Cumulative ", type, " plot completed!\n"), stdout()) } else { - write("Unable to produce FRiF plot!\n", stdout()) + write(paste0("Unable to produce ", type, " plot!\n"), stdout()) } } } else if (!is.na(subcmd) && tolower(subcmd) == "tss") { @@ -282,13 +325,13 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { sample_name <- sampleName(TSSfile[1]) png(filename = paste0(sample_name, "_TSSenrichment.png"), - width = 480, height = 480) - print(p) + width = 275, height = 275) + suppressWarnings(print(p)) invisible(dev.off()) pdf(file = paste0(sample_name, "_TSSenrichment.pdf"), - width= 7, height = 7, useDingbats=F) - print(p) + width = 4, height = 4, useDingbats=F) + suppressWarnings(print(p)) invisible(dev.off()) if (exists("p")) { @@ -334,14 +377,14 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { fragL_txt = fragL_txt) # Save plot to pdf file - pdf(file=fragL_name, width= 7, height = 7, useDingbats=F) - print(p) + pdf(file=fragL_name, width = 4, height = 4, useDingbats=F) + suppressWarnings(print(p)) invisible(dev.off()) # Save plot to png file outfile_png <- gsub('pdf', 'png', fragL_name) - png(filename=outfile_png, width = 480, height = 480) - print(p) + png(filename=outfile_png, width = 275, 
height = 275) + suppressWarnings(print(p)) invisible(dev.off()) if (exists("p")) { @@ -357,7 +400,9 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { "Version: ", version, "\n\n", "Command: mrna \t plot mRNA contamination distribution\n\n", " -i, --rpkm\t Three column TSV of intron and exon RPKM by gene.\n", - " -w, --raw\t Plot raw exon/intron ratios instead of log10.\n" + " -w, --raw\t Plot raw exon/intron ratios instead of log10.\n", + " -y, --type\t Choose plot type from: histogram, boxplot, or violin.\n", + " -a, --annotate\t Display raw and log10-transformed median values on plot.\n" ) help <- opt_get(name = c("help", "?", "h"), required=FALSE, @@ -375,21 +420,27 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { description="Three column TSV containing gene exon and intron RPKMs.") raw <- opt_get(name = c("raw", "w"), required=FALSE, default=FALSE, description="Plot raw ratios (Default = FALSE).") + type <- opt_get(name = c("type", "y"), required=FALSE, default="histogram", + description="Choose plot type from: histogram, boxplot, or violin (Default = histogram).") + annotate <- opt_get(name = c("annotate", "a"), required=FALSE, default=FALSE, + description="Display raw and log10-transformed median values on plot.") - suppressWarnings(p <- mRNAcontamination(rpkm=rpkm, raw=raw)) - - sample_name <- sampleName(rpkm) + sample_name <- sampleName(rpkm, 3) + name <- basename(sample_name) + suppressWarnings(p <- mRNAcontamination(rpkm=rpkm, name=name, raw=raw, + type=tolower(type), + annotate=annotate)) # Save plot to pdf file pdf(file=paste0(sample_name, "_mRNA_contamination.pdf"), - width= 7, height = 7, useDingbats=F) - print(q) + width = 4, height = 4, useDingbats=F) + suppressMessages(suppressWarnings(print(p))) invisible(dev.off()) # Save plot to png file png(filename = paste0(sample_name, "_mRNA_contamination.png"), - width = 480, height = 480) - print(q) + width = 275, height = 275) + suppressMessages(suppressWarnings(print(p))) invisible(dev.off()) if 
(exists("p")) { @@ -404,7 +455,9 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { "Usage: PEPPRO.R [command] {args}\n", "Version: ", version, "\n\n", "Command: pi \t plot pause indicies distribution\n\n", - " -i, --input\t Pause density/gene body density ratios.\n" + " -i, --input\t Pause density/gene body density ratios.\n", + " -y, --type\t Choose plot type from: histogram, boxplot, or violin.\n", + " -a, --annotate\t Display median and mean values on plot.\n" ) help <- opt_get(name = c("help", "?", "h"), required=FALSE, @@ -420,21 +473,26 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { } else { input <- opt_get(name = c("input", "i"), required=TRUE, description="Pause density/gene body density ratios.") + type <- opt_get(name = c("type", "y"), required=FALSE, default="histogram", + description="Choose plot type from: histogram, boxplot, or violin (Default = histogram).") + annotate <- opt_get(name = c("annotate", "a"), required=FALSE, default=FALSE, + description="Display median and mean values on plot.") - suppressWarnings(p <- plotPI(pi=input)) - - sample_name <- sampleName(input) + sample_name <- sampleName(input) + name <- basename(sample_name) + suppressWarnings(p <- plotPI(pi=input, name=name, + type=tolower(type), annotate=annotate)) # Save plot to pdf file pdf(file=paste0(sample_name, "_pause_index.pdf"), - width= 7, height = 7, useDingbats=F) - print(p) + width = 4, height = 4, useDingbats=F) + suppressWarnings(print(p)) invisible(dev.off()) # Save plot to png file png(filename = paste0(sample_name, "_pause_index.png"), - width = 480, height = 480) - print(p) + width = 275, height = 275) + suppressWarnings(print(p)) invisible(dev.off()) if (exists("p")) { @@ -448,9 +506,11 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { "\n", "Usage: PEPPRO.R [command] {args}\n", "Version: ", version, "\n\n", - "Command: cutadapt \t plot adapter insertion distribution\n\n", + "Command: cutadapt \t plot cutadapt-based adapter insertion distribution\n\n", " -i, --input\t 
cutadapt report.\n", - " -o, --output\t output directory.\n" + " -o, --output\t output directory.\n", + " -u, --umi_len\t UMI length (Default 0).\n", + " -f, --factor\t Factor to divide read count (Default 1M).\n" ) help <- opt_get(name = c("help", "?", "h"), required=FALSE, @@ -464,26 +524,93 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { message(usage) quit() } else { - input <- opt_get(name = c("input", "i"), required=TRUE, - description="cutadapt report.") - output <- opt_get(name = c("output", "o"), required=TRUE, - description="output destination directory.") + input <- opt_get(name = c("input", "i"), required=TRUE, + description="cutadapt report.") + output <- opt_get(name = c("output", "o"), required=TRUE, + description="output destination directory.") + umi_len <- opt_get(name = c("umi_len", "u"), required=FALSE, + default = 0, + description="UMI length (Default 0).") + factor <- opt_get(name = c("factor", "f"), required=FALSE, + default = 1000000, + description="Factor to divide read count (Default 1M).") + + name <- basename(sampleName(input, num_fields=1)) + #message(name) + suppressWarnings(p <- plotCutadapt(input=input, name=name, + umi_len = umi_len, + count_factor=factor)) + sample_name <- paste(output, name, sep="/") + #message(sample_name) - name <- basename(sampleName(input)) - suppressWarnings(p <- plotCutadapt(input=input, name=name)) + # Save plot to pdf file + pdf(file=paste0(sample_name, "_adapter_insertion_distribution.pdf"), + width = 4, height = 4, useDingbats=F) + suppressWarnings(print(p)) + invisible(dev.off()) + + # Save plot to png file + png(filename = paste0(sample_name, + "_adapter_insertion_distribution.png"), + width = 275, height = 275) + suppressWarnings(print(p)) + invisible(dev.off()) + + if (exists("p")) { + write("Adapter insertion distribution plot completed!\n", stdout()) + } else { + write("Unable to produce adapter insertion distribution plot!\n", + stdout()) + } + } +} else if (!is.na(subcmd) && tolower(subcmd) == 
"adapt") { + usage <- paste0( + "\n", + "Usage: PEPPRO.R [command] {args}\n", + "Version: ", version, "\n\n", + "Command: adapt \t plot generalized adapter insertion distribution\n\n", + " -i, --input\t flash histogram output.\n", + " -o, --output\t output directory.\n", + " -u, --umi_len\t UMI length (Default 0).\n" + ) + + help <- opt_get(name = c("help", "?", "h"), required=FALSE, + default=FALSE, n=0) + if (!help) { + help <- suppressWarnings( + if(length(opt_get_args()) == 1) {TRUE} else {FALSE} + ) + } + if (help) { + message(usage) + quit() + } else { + input <- opt_get(name = c("input", "i"), required=TRUE, + description="flash histogram output.") + output <- opt_get(name = c("output", "o"), required=TRUE, + description="output destination directory.") + umi_len <- opt_get(name = c("umi_len", "u"), required=FALSE, + default = 0, + description="UMI length (Default 0).") + + name <- basename(sampleName(input, num_fields=0)) + #message(name) + suppressWarnings(p <- plotAdapt(input=input, name=name, + umi_len = umi_len)) sample_name <- paste(output, name, sep="/") + #message(sample_name) # Save plot to pdf file pdf(file=paste0(sample_name, "_adapter_insertion_distribution.pdf"), - width= 7, height = 7, useDingbats=F) - print(p) + width = 4, height = 4, useDingbats=F) + suppressWarnings(print(p)) invisible(dev.off()) # Save plot to png file png(filename = paste0(sample_name, "_adapter_insertion_distribution.png"), - width = 480, height = 480) - print(p) + width = 275, height = 275) + suppressWarnings(print(p)) invisible(dev.off()) if (exists("p")) { @@ -501,10 +628,11 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { "Command: preseq\t\t plot preseq complexity curves\n", "\t frif\t\t plot fraction of reads in features\n", "\t tss\t\t plot TSS enrichment\n", - "\t frag\t\t plot fragment length distribution\n", + "\t frag\t\t plot PE fragment length distribution\n", "\t mrna\t\t plot mRNA contamination distribution\n", "\t pi\t\t plot pause indicies distribution\n", 
- "\t cutadapt\t plot adapter insertion distribution\n" + "\t cutadapt\t plot cutadapt-based adapter insertion distribution\n", + "\t adapt\t plot generalized adapter insertion distribution\n" ) message(usage) quit() diff --git a/tools/PEPPRO_summarizer.R b/tools/PEPPRO_complexity_curves.R similarity index 52% rename from tools/PEPPRO_summarizer.R rename to tools/PEPPRO_complexity_curves.R index b16cd38..39f9873 100755 --- a/tools/PEPPRO_summarizer.R +++ b/tools/PEPPRO_complexity_curves.R @@ -1,20 +1,20 @@ #! /usr/bin/env Rscript ############################################################################### -#6/10/19 +#1/20/2020 #Author: Jason Smith -#PEPPRO_summarizer.R +#PEPPRO_complexity_curves.R # #This program is meant to plot multiple library complexity curves on the #same plot when called by looper summarize # #NOTES: -#usage: Rscript tools/PEPPRO_summarizer.R +#usage: Rscript tools/PEPPRO_complexity_curves.R # /path/to/project_config.yaml # #requirements: PEPPROr # ############################################################################### -version <- 0.1 +version <- 0.2 ##### Load dependencies ##### required_libraries <- c("PEPPROr") @@ -54,24 +54,46 @@ dir.create( showWarnings = FALSE) # Plot combined library complexity curves for all samples in project -cc <- paste(config(prj)$metadata$output_dir, +cc <- paste(suppressMessages(config(prj)$metadata$output_dir), "results_pipeline", - samples(prj)$sample_name, - paste0("QC_", samples(prj)$genome), - paste0(samples(prj)$sample_name, "_preseq_yield.txt"), + suppressMessages(samples(prj)$sample_name), + paste0("QC_", suppressMessages(samples(prj)$genome)), + paste0(suppressMessages(samples(prj)$sample_name), + "_preseq_yield.txt"), sep="/") -rc <- paste(config(prj)$metadata$output_dir, +rc <- paste(suppressMessages(config(prj)$metadata$output_dir), "results_pipeline", - samples(prj)$sample_name, - paste0("QC_", samples(prj)$genome), - paste0(samples(prj)$sample_name, "_preseq_counts.txt"), + 
suppressMessages(samples(prj)$sample_name), + paste0("QC_", suppressMessages(samples(prj)$genome)), + paste0(suppressMessages(samples(prj)$sample_name), + "_preseq_counts.txt"), sep="/") -plotComplexityCurves(ccurves = cc, - coverage = 0, read_length = 0, - real_counts_path = rc, ignore_unique = FALSE, - output_name = paste(config(prj)$metadata$output_dir, - "summary", - paste0(config(prj)$name, "_libComplexity"), - sep="/")) + +hasBoth <- file.exists(cc) & file.exists(rc) + +ccSub <- cc[hasBoth] +rcSub <- rc[hasBoth] +#message(ccSub, rcSub) +message(paste0(length(ccSub), " of ", length(cc), " files available")) + +output_name <- paste(config(prj)$metadata$output_dir, "summary", + paste0(config(prj)$name, "_libComplexity"), sep="/") + +if (sum(hasBoth) > 0){ + + p <- plotComplexityCurves(ccurves = ccSub, coverage = 0, read_length = 0, + real_counts_path = rcSub, ignore_unique = FALSE) + + pdf(file = paste0(tools::file_path_sans_ext(output_name), ".pdf"), + width= 10, height = 7, useDingbats=F) + print(p) + invisible(dev.off()) + png(filename = paste0(tools::file_path_sans_ext(output_name), ".png"), + width = 686, height = 480) + print(p) + invisible(dev.off()) +} else { + message("No samples have available library complexity files.") +} ############################################################################### diff --git a/tools/PEPPRO_counts.R b/tools/PEPPRO_counts.R new file mode 100755 index 0000000..5e06a13 --- /dev/null +++ b/tools/PEPPRO_counts.R @@ -0,0 +1,147 @@ +#! 
/usr/bin/env Rscript +############################################################################### +#1/20/2020 +#Author: Jason Smith +#PEPPRO_counts.R +# +#For each sample in a project, generate a counts object of gene counts +#for each gene for each sample in the +#format [gene],[sample1],[sample...],[sampleN] +# +#NOTES: +#usage: Rscript tools/PEPPRO_counts.R +# /path/to/project_config.yaml +# +#requirements: PEPPROr +# +############################################################################### +version <- 0.1 +##### Load dependencies ##### + +required_libraries <- c("PEPPROr") +for (i in required_libraries) { + loadLibrary <- tryCatch ( + { + suppressPackageStartupMessages( + suppressWarnings(library(i, character.only=TRUE))) + }, + error=function(e) { + message("Error: Install the \"", i, + "\" R package before proceeding.") + message('i.e. devtools::install_github("databio/peppro",', + ' subdir="PEPPROr")') + return(NULL) + }, + warning=function(e) { + message(e) + return(1) + } + ) + if (length(loadLibrary)!=0) { + suppressWarnings(library(i, character.only=TRUE)) + } else { + quit() + } +} + +############################################################################### + +readPepproGeneCounts = function(project) { + cwd <- getwd() + project_dir <- pepr::config(project)$metadata$output_dir + sample_names <- pepr::samples(project)$sample_name + genomes <- as.list(pepr::samples(project)$genome) + names(genomes) <- sample_names + paths <- vector("list", length(sample_names)) + names(paths) <- sample_names + + for (sample in sample_names) { + paths[[sample]] <- paste(project_dir, 'results_pipeline', sample, + paste0('signal_', genomes[[sample]]), + paste0(sample, "_gene_coverage.bed"), sep="/") + } + + result <- lapply(paths, function(x){ + if (file.exists(x)) { + df <- fread(x) + colnames(df) <- c('chr', 'start', 'end', 'geneName', + 'score', 'strand', 'count') + gr <- GenomicRanges::GRanges(df) + } else { + gr <- GenomicRanges::GRanges() + } + }) + + 
setwd(cwd) + return(GenomicRanges::GRangesList(Filter(length, result))) +} + +############################################################################### + +configFile <- opt_get_verb() +prj <- suppressWarnings(Project(configFile)) + +message("Creating counts table...") + +# Produce output directory (if needed) +dir.create( + suppressMessages( + file.path(pepr::config(prj)$metadata$output_dir, "summary")), + showWarnings = FALSE) + +# Load gene counts files +totalSamples <- length(suppressMessages(pepr::samples(prj)$sample_name)) +countsGR <- readPepproGeneCounts(prj) +actualSamples <- length(countsGR) + +message(paste0(actualSamples, " of ", totalSamples, " files available")) + +# Generate output file name +output_name <- paste(pepr::config(prj)$metadata$output_dir, "summary", + paste0(pepr::config(prj)$name, "_countData.csv"), sep="/") + +if (length(countsGR) > 0) { + # Create gene name data table + i <- 1 + while (length(countsGR[[i]]) == 0) { + i <- i + 1 + } + count_dt <- data.table(geneName = countsGR[[i]]$geneName, + seqnames=as.character(seqnames(countsGR[[i]])), + start=start(countsGR[[i]]), + end=end(countsGR[[i]]), + width=width(countsGR[[i]]), + strand=as.character(strand(countsGR[[i]]))) + + # Populate count table + while (i < length(names(countsGR))) { + dt1 <- as.data.table(countsGR[[names(countsGR)[i]]]) + name1 <- paste0(".",names(countsGR)[i]) + i <- i + 1 + dt2 <- as.data.table(countsGR[[names(countsGR)[i]]]) + name2 <- paste0(".",names(countsGR)[i]) + + dt <- merge(dt1[,-"score"], dt2[,-"score"], + by=c("seqnames", "start", "end", + "width", "strand", "geneName"), + sort=TRUE, suffix=c(name1, name2)) + + count_dt <- merge(count_dt, dt, sort=TRUE, + by=c("geneName", "seqnames", "start", "end", + "width", "strand")) + i <- i + 1 + } + + colnames(count_dt) <- sub(".*\\.","",colnames(count_dt)) + counts <- count_dt[,-c("seqnames", "start", "end", + "width", "strand")] + #rownames(counts) <- count_dt$geneName + # Export as csv with rownames as 
the first column + # Will require modification upon loading into R to convert to matrix + fwrite(counts, file=output_name) + message(paste0("Counts table: ", file.path(output_name), "\n")) +} else { + message("No samples have available gene count data.") +} + +############################################################################### diff --git a/tools/PRO-seq_adapter.fa b/tools/PRO-seq_adapter.fa deleted file mode 100644 index a4868ec..0000000 --- a/tools/PRO-seq_adapter.fa +++ /dev/null @@ -1,2 +0,0 @@ ->5prime -TGGAATTCTCGGGTGCCAAGG \ No newline at end of file diff --git a/tools/adapter.fa b/tools/adapter.fa new file mode 100644 index 0000000..4e4308e --- /dev/null +++ b/tools/adapter.fa @@ -0,0 +1,4 @@ +>5prime +TGGAATTCTCGGGTGCCAAGG +>3prime +GATCGTCGGACTGTAGAACTCTGAAC \ No newline at end of file diff --git a/tools/bamQC.py b/tools/bamQC.py index 7dc75cf..2eaadf9 100755 --- a/tools/bamQC.py +++ b/tools/bamQC.py @@ -13,9 +13,8 @@ import sys import pararead -#from pararead.processor import _LOGGER -from pararead import add_logging_options, ParaReadProcessor -from pararead import logger_via_cli +import logmuse +from pararead import ParaReadProcessor import pandas as _pd import numpy as np @@ -231,7 +230,7 @@ def parse_args(cmdl): parser.add_argument('-c', '--cores', dest='cores', default=20, type=int, help="Number of processors to use. 
Default=20") - parser = add_logging_options(parser) + parser = logmuse.add_logging_options(parser) return parser.parse_args(cmdl) @@ -239,7 +238,7 @@ def parse_args(cmdl): if __name__ == "__main__": args = parse_args(sys.argv[1:]) - _LOGGER = logger_via_cli(args) + _LOGGER = logmuse.logger_via_cli(args) qc = bamQC(reads_filename=args.infile, out_filename=args.outfile, diff --git a/tools/bamSitesToWig.py b/tools/bamSitesToWig.py index b4320f5..481cd40 100755 --- a/tools/bamSitesToWig.py +++ b/tools/bamSitesToWig.py @@ -13,12 +13,11 @@ import os import subprocess import sys - +import logmuse import pararead import pysam -from pararead import add_logging_options, ParaReadProcessor -from pararead import logger_via_cli +from pararead import ParaReadProcessor MODES = ["dnase", "atac"] @@ -46,8 +45,9 @@ def __init__(self, reads_filename, chrom_sizes_file, temp_parent, nProc, _LOGGER.info("Cutting parallel chroms in half to accommodate two tracks.") nProc = max(int(nProc / 2), 1) - super(CutTracer,self).__init__(reads_filename, nProc, - self.resultAcronym, temp_parent, limit, allow_unaligned=False, + super(CutTracer,self).__init__(path_reads_file=reads_filename, cores=nProc, + action=self.resultAcronym, temp_folder_parent_path=temp_parent, + limit=limit, allow_unaligned=False, retain_temp=retain_temp) self.exactbw = exactbw self.summary_filename = summary_filename @@ -95,7 +95,7 @@ def __call__(self, chrom): chrom_size = self.get_chrom_size(chrom) #self.unbuffered_write("[Name: " + chrom + "; Size: " + str(chrom_size) + "]") - _LOGGER.info("[Name: " + chrom + "; Size: " + str(chrom_size) + "]") + _LOGGER.debug("[Name: " + chrom + "; Size: " + str(chrom_size) + "]") reads = self.fetch_chunk(chrom) chromOutFile = self._tempf(chrom) @@ -283,22 +283,34 @@ def combine(self, good_chromosomes): _LOGGER.info("Merging {} files into output file: '{}'". 
format(len(good_chromosomes), self.exactbw)) temp_files = [self._tempf(chrom) + ".bw" for chrom in good_chromosomes] - cmd = "bigWigCat " + self.exactbw + " " + " ".join(temp_files) + files_exist = [] + for file in temp_files: + if os.path.isfile(file) and os.stat(file).st_size > 0: + files_exist.append(file) + cmd = "bigWigCat " + self.exactbw + " " + " ".join(files_exist) _LOGGER.debug(cmd) - p = subprocess.call(['bigWigCat', self.exactbw] + temp_files) + p = subprocess.call(['bigWigCat', self.exactbw] + files_exist) if self.smoothbw: _LOGGER.info("Merging {} files into output file: '{}'". format(len(good_chromosomes), self.smoothbw)) temp_files = [self._tempf(chrom) + "_smooth.bw" for chrom in good_chromosomes] - cmd = "bigWigCat " + self.smoothbw + " " + " ".join(temp_files) + files_exist = [] + for file in temp_files: + if os.path.isfile(file) and os.stat(file).st_size > 0: + files_exist.append(file) + cmd = "bigWigCat " + self.smoothbw + " " + " ".join(files_exist) _LOGGER.debug(cmd) - p = subprocess.call(['bigWigCat', self.smoothbw] + temp_files) + p = subprocess.call(['bigWigCat', self.smoothbw] + files_exist) if self.bedout: # root, ext = os.path.splitext(self.exactbw) temp_files = [self._tempf(chrom) + ".bed" for chrom in good_chromosomes] - cmd = "cat " + " ".join(temp_files) + " > " + self.bedout + files_exist = [] + for file in temp_files: + if os.path.isfile(file) and os.stat(file).st_size > 0: + files_exist.append(file) + cmd = "cat " + " ".join(files_exist) + " > " + self.bedout _LOGGER.debug(cmd) p = subprocess.call(cmd, shell=True) @@ -339,15 +351,16 @@ def parse_args(cmdl): parser.add_argument('--retain-temp', action='store_true', default=False, help="Retain temporary files? 
Default: False") - parser = add_logging_options(parser) - return parser.parse_args(cmdl) + parser = logmuse.add_logging_options(parser) + args = parser.parse_args(cmdl) + if not (args.exactbw or args.smoothbw): + parser.error('No output requested, use --exactbw and/or --smoothbw') + return args if __name__ == "__main__": args = parse_args(sys.argv[1:]) - if not (args.exactbw or args.smoothbw): - parser.error('No output requested, use --exactbw and/or --smoothbw') - _LOGGER = logger_via_cli(args) + _LOGGER = logmuse.logger_via_cli(args, make_root=True) if args.mode == "dnase": shift_factor = {"+":1, "-":0} # DNase @@ -355,7 +368,6 @@ def parse_args(cmdl): shift_factor = {"+":4, "-":-5} # ATAC else: shift_factor = {"+":0, "-":0} - ct = CutTracer( reads_filename=args.infile, chrom_sizes_file=args.chrom_sizes_file, summary_filename=args.summary_file, diff --git a/tools/packageReport.py b/tools/packageReport.py new file mode 100755 index 0000000..5550dd4 --- /dev/null +++ b/tools/packageReport.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python +""" +Package PEPPRO reports +""" + +__author__ = ["Jason Smith"] +__email__ = "jasonsmith@virginia.edu" +__version__ = "0.0.1" + + +from argparse import ArgumentParser +import os +import sys +import tempfile +import tarfile +import shutil +import glob +import peppy + +################################################################################ + +def parse_arguments(): + """ + Parse command-line arguments passed to the pipeline. 
+ """ + + # Argument Parsing from yaml file + parser = ArgumentParser(description='Package PEPPRO reports version ' + __version__) + + # Pipeline-specific arguments + parser.add_argument("-p", "--project", dest="project", + help="PEP configuration file (*.yaml).") + + parser.add_argument("-v", "--version", action="version", + version="%(prog)s {v}".format(v=__version__)) + + args = parser.parse_args() + + if not args.project: + parser.print_help() + raise SystemExit + + return args + +################################################################################ + +def main(): + """ + Main pipeline process. + """ + + args = parse_arguments() + + project = peppy.Project(args.project) + + # convenience alias + projectName = project.name + sampleNames = [] + for sample in project.samples: + sampleNames.append(sample.name) + projectDir = project.metadata.output_dir + outputName = os.path.join(projectDir, projectName + ".tar.gz") + + tempdir = tempfile.mkdtemp(dir=projectDir) + os.chmod(tempdir, 0o771) + tempName = os.path.basename(tempdir) + exclude = set([tempdir, os.path.join(tempName, tempName)]) + + print("-- Copy project directory structure --") + for (path, dirs, files) in os.walk(projectDir, topdown=True): + if path not in exclude: + newDir = path.replace(projectDir, tempdir) + if os.path.join(tempName, tempName) not in newDir: + os.makedirs(newDir, exist_ok=True) + + print("-- Copy project reports --") + projectReports = ['PEPPRO_stats_summary.tsv', 'PEPPRO_summary.html', + 'PEPPRO_objs_summary.tsv', 'reports/*.html', + 'summary/*.p[dn][fg]'] + for item in projectReports: + dirName = os.path.dirname(item) + path = os.path.join(tempdir, dirName) + report = os.path.join(projectDir, item.strip('.')) + for file in glob.glob(report): + if os.path.exists(file): + shutil.copy2(file, path.strip('.')) + + print("-- Copy sample reports --") + sampleReports = ['objects.tsv', 'stats.tsv', 'PEPPRO_log.md', + 'PEPPRO_commands.sh', 'PEPPRO_profile.tsv', + 'fastqc/*.html', 
'QC_hg38/*.p[dn][fg]', + 'cutadapt/*.p[dn][fg]', 'fastp/*.p[dn][fg]', + 'fastp/*.html'] + for item in sampleReports: + dirName = os.path.dirname(item) + for sample in sampleNames: + sampleDir = os.path.join(projectDir, "results_pipeline", sample) + if os.path.isdir(sampleDir): + report = os.path.join(sampleDir, item.strip('.')) + path = os.path.join(tempdir, "results_pipeline", sample, dirName) + for file in glob.glob(report): + if os.path.exists(file): + shutil.copy2(file, path) + + print("-- Archive report --") + with tarfile.open(outputName, "w:gz") as tar: + tar.add(tempdir, arcname='.') + + print("-- Clean up workspace --") + shutil.rmtree(tempdir) + + print("-- Package project report complete! --") + print("Report pacakge: {}".format(outputName)) + +if __name__ == '__main__': + try: + sys.exit(main()) + except KeyboardInterrupt: + print("PEPPRO report packager aborted.") + sys.exit(1) \ No newline at end of file diff --git a/usage.txt b/usage.txt index fe4251f..37aaedc 100644 --- a/usage.txt +++ b/usage.txt @@ -1,18 +1,23 @@ -usage: peppro.py [-h] [-R] [-N] [-D] [-F] [-C CONFIG_FILE] -O - PARENT_OUTPUT_FOLDER [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] - -S SAMPLE_NAME -I INPUT_FILES [INPUT_FILES ...] +usage: peppro.py [-h] [-R] [-N] [-D] [-F] [-T] [--verbosity V] [--silent] + [--logdev] [-C CONFIG_FILE] -O PARENT_OUTPUT_FOLDER + [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] -S SAMPLE_NAME -I + INPUT_FILES [INPUT_FILES ...] 
[-I2 [INPUT_FILES2 [INPUT_FILES2 ...]]] -G GENOME_ASSEMBLY - [-Q SINGLE_OR_PAIRED] [--runon {pro,gro}] - [--adapter {fastp,cutadapt}] [--dedup {seqkit,fqdedup}] - [--trimmer {seqtk,fastx}] [--umi] [--umi_len UMI_LEN] - [--max_len MAX_LEN] [--sob] [--scale] [--parts PARTS] + [-Q SINGLE_OR_PAIRED] + [--protocol {PRO,pro,PRO-SEQ,PRO-seq,proseq,PROSEQ,GRO,gro,groseq,GROSEQ,GRO-SEQ,GRO-seq}] + [--adapter-tool {cutadapt,fastp}] + [--dedup-tool {seqkit,fqdedup}] + [--trimmer-tool {seqtk,fastx}] [--umi-len UMI_LEN] + [--max-len MAX_LEN] [--sob] [--scale] [--prealignments PREALIGNMENTS [PREALIGNMENTS ...]] - [--TSS-name TSS_NAME] [--pi-tss PI_TSS] [--pi-body PI_BODY] - [--pre-name PRE_NAME] [--anno-name ANNO_NAME] - [--exon-name EXON_NAME] [--intron-name INTRON_NAME] - [--coverage] [--keep] [--noFIFO] [--complexity] [-V] + [--TSS-name TSS_NAME] [--pi-tss ENSEMBL_TSS] + [--pi-body ENSEMBL_GENE_BODY] [--pre-name PRE_NAME] + [--anno-name ANNO_NAME] [--exon-name EXON_NAME] + [--intron-name INTRON_NAME] [--search-file SEARCH_FILE] + [--coverage] [--keep] [--noFIFO] [--no-complexity] + [--prioritize] [-V] -PEPPRO version 0.8.0 +PEPPRO version 0.8.6 optional arguments: -h, --help show this help message and exit @@ -20,6 +25,10 @@ optional arguments: -N, --new-start Overwrite all results to start a fresh run -D, --dirty Don't auto-delete intermediate files -F, --force-follow Always run 'follow' commands + -T, --testmode Only print commands, don't run + --verbosity V Set logging level (1-5 or logging module level name) + --silent Silence logging. Overrides verbosity. + --logdev Expand content of logging message format. -C CONFIG_FILE, --config CONFIG_FILE Pipeline configuration file (YAML). Relative paths are with respect to the pipeline script. @@ -33,30 +42,29 @@ optional arguments: Secondary input files, such as read2 -Q SINGLE_OR_PAIRED, --single-or-paired SINGLE_OR_PAIRED Single- or paired-end sequencing protocol - --runon {pro,gro} Run on sequencing type. 
- --adapter {fastp,cutadapt} - Name of adapter removal program - --dedup {seqkit,fqdedup} - Name of program that removes duplicate reads - --trimmer {seqtk,fastx} - Name of read trimming program - --umi Remove umi with fastp - --umi_len UMI_LEN Specify the length of the UMI.If your data does not - utilize UMIs, set to 0. - --max_len MAX_LEN Trim reads to maximum length. Set to -1 to disable - length trimming. + --protocol {PRO,pro,PRO-SEQ,PRO-seq,proseq,PROSEQ,GRO,gro,groseq,GROSEQ,GRO-SEQ,GRO-seq} + Run on sequencing type. + --adapter-tool {cutadapt,fastp} + Name of adapter removal program. Default: cutadapt + --dedup-tool {seqkit,fqdedup} + Program to use to duplicate reads. Default: seqkit + --trimmer-tool {seqtk,fastx} + Name of read trimming program. Default: seqtk + --umi-len UMI_LEN Specify the length of the UMI.If your data does not + utilize UMIs, set to 0. Default: 0 + --max-len MAX_LEN Trim reads to maximum length. Set to -1 to disable + length trimming. Default: 30 --sob Use seqOutBias to produce signal tracks and incorporate mappability information. --scale Scale output with seqOutBias when producing signal tracks. - --parts PARTS Split suffix tree generation into parts. Increase - this value to lower memory use. --prealignments PREALIGNMENTS [PREALIGNMENTS ...] Space-delimited list of reference genomes to align to before primary alignment. --TSS-name TSS_NAME file_name of TSS annotation file. - --pi-tss PI_TSS file_name of pause index TSS annotation file. - --pi-body PI_BODY file_name of pause index gene body annotation file. + --pi-tss ENSEMBL_TSS file_name of pause index TSS annotation file. + --pi-body ENSEMBL_GENE_BODY + file_name of pause index gene body annotation file. --pre-name PRE_NAME file_name of pre-mRNA annotation file. --anno-name ANNO_NAME file_name of genomic annotation file. @@ -64,11 +72,17 @@ optional arguments: file_name of exon annotation file. --intron-name INTRON_NAME file_name of intron annotation file. 
+ --search-file SEARCH_FILE + file_name of read length matched gt tallymer index + search file --coverage Report library complexity using coverage: reads / (bases in genome / read length) --keep Keep prealignment BAM files --noFIFO Do NOT use named pipes during prealignments. - --complexity Disable library complexity calculation (faster). + --no-complexity Disable library complexity calculation (faster). + --prioritize Plot FRiF/PRiF using mutually exclusive priority + ranked features based on the order of feature + appearance in the feature annotation asset. -V, --version show program's version number and exit required named arguments: