diff --git a/@eaDir/.editorconfig@SynoEAStream b/@eaDir/.editorconfig@SynoEAStream new file mode 100755 index 0000000..7826b29 Binary files /dev/null and b/@eaDir/.editorconfig@SynoEAStream differ diff --git a/bin/@eaDir/sra_test.csv@SynoEAStream b/bin/@eaDir/sra_test.csv@SynoEAStream new file mode 100755 index 0000000..3f622b2 Binary files /dev/null and b/bin/@eaDir/sra_test.csv@SynoEAStream differ diff --git a/bin/@eaDir/test.csv@SynoEAStream b/bin/@eaDir/test.csv@SynoEAStream new file mode 100755 index 0000000..d0d19da Binary files /dev/null and b/bin/@eaDir/test.csv@SynoEAStream differ diff --git a/bin/combine_lanes.pl b/bin/combine_lanes.pl index bf0b6b6..a511463 100755 --- a/bin/combine_lanes.pl +++ b/bin/combine_lanes.pl @@ -18,31 +18,31 @@ my $help; sub usage { - my $usage = "\ncombine_lanes.pl\nv0.1.0\n - PURPOSE: Reads and parses a CSV sample sheet from create_sample_sheet.pl to combine lanes with system calls to 'cat', - Then creates a new CSV sample sheet with the fields , , + my $usage = "\ncombine_lanes.pl\nv0.1.0\n + PURPOSE: Reads and parses a CSV sample sheet from create_sample_sheet.pl to combine lanes with system calls to 'cat', + Then creates a new CSV sample sheet with the fields , , - USAGE: combine_lanes.pl -i -o + USAGE: combine_lanes.pl -i -o - ARGUMENTS: - -i | --input DIR (Required). Input sample sheet, expecting csv format. - -o | --output STR (Optional). Output sample sheet file name. If no argument given, prints to STDOUT. - -v | --version Print version number and exit. - -h | --help Print this help message and exit. + ARGUMENTS: + -i | --input DIR (Required). Input sample sheet, expecting csv format. + -o | --output STR (Optional). Output sample sheet file name. If no argument given, prints to STDOUT. + -v | --version Print version number and exit. + -h | --help Print this help message and exit. - \n"; - print $usage; + \n"; + print $usage; } -GetOptions( 'input|i=s' => \$input, - 'out|o=s' => \$output, - 'version|v' => \$version, - 'help|h' => \$help, +GetOptions( 'input|i=s' => \$input, + 'out|o=s' => \$output, + 'version|v' => \$version, + 'help|h' => \$help, ) or die usage(); # Print the version number or the help message and exit if -v or -h is activated -if ( $version ) { die "combine_lanes.pl version 0.1.0\n"; } -if ( $help ) { die usage(); } +if ( $version ) { die "combine_lanes.pl version 0.1.0\n"; } +if ( $help ) { die usage(); } ################################################################################ # Read the sample sheet, expecting a csv format @@ -53,37 +53,37 @@ sub usage { # Set output filehandles my $succout = open( OUT, ">", "$output" ) if $output ne "--"; my $fhout; -if ( $succout ) { $fhout = *OUT; } -else { $fhout = *STDOUT; } +if ( $succout ) { $fhout = *OUT; } +else { $fhout = *STDOUT; } # Print sample sheet header line print $fhout join(",", "sample", "fastq_1", "fastq_2" ), "\n"; -while ( <$fh> ) { - chomp $_; - my @line = split(",", $_); - my $seqid = shift @line; - my $ext = "fastq.gz"; # Defaulting to .gz extension - - system("mkdir $seqid"); - - if ( scalar @line > 3 ) { - $ext = ( $line[1] =~ /\.gz$/ )? "fastq.gz" : "fastq"; - for ( my $i=1; $i < scalar @line; $i++ ) { - my $reads = ( $i % 2 == 0 )? "R2" : "R1"; - system("cat $line[$i] >> $seqid/$seqid\_$reads.$ext"); - } - } - else { - $ext = ( $line[2] =~ /\.gz$/ )? "fastq.gz" : "fastq"; - system("ln -s $line[2] $seqid/$seqid\_R1.$ext"); - system("ln -s $line[2] $seqid/$seqid\_R2.$ext"); - } - - # Print entry out to the sample sheet with the qualified full path - my $realpathR1 = `realpath $seqid/$seqid\_R1.$ext`; chomp $realpathR1; - my $realpathR2 = `realpath $seqid/$seqid\_R2.$ext`; chomp $realpathR2; - print $fhout join(",", $seqid, $realpathR1, $realpathR2), "\n"; +while ( <$fh> ) { + chomp $_; + my @line = split(",", $_); + my $seqid = shift @line; + my $ext = "fastq.gz"; # Defaulting to .gz extension + + system("mkdir $seqid"); + + if ( scalar @line > 3 ) { + $ext = ( $line[1] =~ /\.gz$/ )? "fastq.gz" : "fastq"; + for ( my $i=1; $i < scalar @line; $i++ ) { + my $reads = ( $i % 2 == 0 )? "R2" : "R1"; + system("cat $line[$i] >> $seqid/$seqid\_$reads.$ext"); + } + } + else { + $ext = ( $line[2] =~ /\.gz$/ )? "fastq.gz" : "fastq"; + system("ln -s $line[2] $seqid/$seqid\_R1.$ext"); + system("ln -s $line[2] $seqid/$seqid\_R2.$ext"); + } + + # Print entry out to the sample sheet with the qualified full path + my $realpathR1 = `realpath $seqid/$seqid\_R1.$ext`; chomp $realpathR1; + my $realpathR2 = `realpath $seqid/$seqid\_R2.$ext`; chomp $realpathR2; + print $fhout join(",", $seqid, $realpathR1, $realpathR2), "\n"; } close $fh if ( $succin ); close $fhout if ( $succout ); diff --git a/bin/create_samplesheet.pl b/bin/create_samplesheet.pl index 6eabd6c..24b1ccf 100755 --- a/bin/create_samplesheet.pl +++ b/bin/create_samplesheet.pl @@ -22,105 +22,105 @@ my $help; sub usage { - my $usage = "\ncreate_samplesheet.pl\nv0.1.0\n - PURPOSE: Reads the contents of a given directory (expected to contain sequencing read data files in FASTQ format, optionally gzip-compressed), - then uses standard file naming conventions to match up samples with their read mate information (including potentially multiple sequencing lanes), - and finally, formats and outputs a csv sample sheet. - - USAGE: create_samplesheet.pl -i -o - - ARGUMENTS: - -i | --input DIR (Required). Input directory name. - -o | --output STR (Optional). Output sample sheet file name. If no argument given, prints to STDOUT. - -p | --prefix STR (Optional). Prepend a STR prefix to sequence names (to set directories not on current system). - -f | --fullpath (Optional flag). Call realpath and set the fully qualified path in the sequence ID fields. - -s | --sanitize (Optional flag). Find and replace spaces in the sequence ID field. - -v | --version Print version number and exit. - -h | --help Print this help message and exit. - - \n"; - print $usage; + my $usage = "\ncreate_samplesheet.pl\nv0.1.0\n + PURPOSE: Reads the contents of a given directory (expected to contain sequencing read data files in FASTQ format, optionally gzip-compressed), + then uses standard file naming conventions to match up samples with their read mate information (including potentially multiple sequencing lanes), + and finally, formats and outputs a csv sample sheet. + + USAGE: create_samplesheet.pl -i -o + + ARGUMENTS: + -i | --input DIR (Required). Input directory name. + -o | --output STR (Optional). Output sample sheet file name. If no argument given, prints to STDOUT. + -p | --prefix STR (Optional). Prepend a STR prefix to sequence names (to set directories not on current system). + -f | --fullpath (Optional flag). Call realpath and set the fully qualified path in the sequence ID fields. + -s | --sanitize (Optional flag). Find and replace spaces in the sequence ID field. + -v | --version Print version number and exit. + -h | --help Print this help message and exit. + + \n"; + print $usage; } -GetOptions( 'input|i=s' => \$input, - 'out|o=s' => \$output, - 'prefix|p=s' => \$prefix, - 'sanitize|s' => \$sanitize, - 'fullpath|f' => \$fullpath, - 'version|v' => \$version, - 'help|h' => \$help, +GetOptions( 'input|i=s' => \$input, + 'out|o=s' => \$output, + 'prefix|p=s' => \$prefix, + 'sanitize|s' => \$sanitize, + 'fullpath|f' => \$fullpath, + 'version|v' => \$version, + 'help|h' => \$help, ) or die usage(); # Print the version number or the help message and exit if -v or -h is activated -if ( $version ) { die "create_samplesheet.pl version 0.1.0\n"; } -if ( $help ) { die usage(); } +if ( $version ) { die "create_samplesheet.pl version 0.1.0\n"; } +if ( $help ) { die usage(); } ################################################################################ # Read the input directory, capturing all files that end in .fastq (optionally .gz) or some flavor of this extension opendir( my $dh, $input ) or die "create_samplesheet::ERROR --> Cannot read the given input directory. $!\n"; my @fastqs; -while ( readdir $dh ) { - chomp $_; +while ( readdir $dh ) { + chomp $_; - # Keep this file if it is a FASTQ file - # WARNING: A curse upon ye who put .fastq somewhere in the filename other than the extension - # This *should* work even with full paths that might contain 'fastq' in the path, e.g. path/to/data/fastq/file1 - push @fastqs, $_ if ( $_ =~ /\.fq/ || $_ =~ /\.fastq/ ); + # Keep this file if it is a FASTQ file + # WARNING: A curse upon ye who put .fastq somewhere in the filename other than the extension + # This *should* work even with full paths that might contain 'fastq' in the path, e.g. path/to/data/fastq/file1 + push @fastqs, $_ if ( $_ =~ /\.fq/ || $_ =~ /\.fastq/ ); } # For each FASTQ file: # 1) deconstruct the string to get the sample/file name, lane info, and read information. # 2) Then, populate a lookup table to store matching information per sample. my %Samples = (); -foreach my $file ( @fastqs ) { - - # Deconstruct the sample name -- here I'm doing it with regex capture groups - my $filetmp = $file; - $filetmp =~ s/\s/_/g; - my ( $id, $fr_read ) = $filetmp =~ /(.*)_[Rr]?(1|2)/; - my ( $lane ) = $id =~ s/_L(\d{3})?//; - $lane = ( $1 eq $id )? "001" : $1; # Default to 001 if there is no lane information - - # Add the file to the lookup table - # Hierarchy is SAMPLENAME ==> LANE ==> READ (1|2) = FILENAME - $Samples{$id}->{$lane}->{$fr_read} = $file; +foreach my $file ( @fastqs ) { + + # Deconstruct the sample name -- here I'm doing it with regex capture groups + my $filetmp = $file; + $filetmp =~ s/\s/_/g; + my ( $id, $fr_read ) = $filetmp =~ /(.*)_[Rr]?(1|2)/; + my ( $lane ) = $id =~ s/_L(\d{3})?//; + $lane = ( $1 eq $id )? "001" : $1; # Default to 001 if there is no lane information + + # Add the file to the lookup table + # Hierarchy is SAMPLENAME ==> LANE ==> READ (1|2) = FILENAME + $Samples{$id}->{$lane}->{$fr_read} = $file; } # Set output filehandles my $fileout = open( OUT, ">", "$output" ) if $output ne "--"; my $fhout; -if ( $fileout ) { $fhout = *OUT; } -else { $fhout = *STDOUT; } +if ( $fileout ) { $fhout = *OUT; } +else { $fhout = *STDOUT; } # Print out a csv sample sheet with format , , , , , ... , -foreach my $sample ( keys %Samples ) { - - # Prepend the prefix if one is set by the user. - # Also do a quick sanity check to remove any double // that might be introduced in various ways. - my @line_out = ( $sample ); - $line_out[0] =~ s/\/\//\//; # Well that's an ugly subst regex... - $line_out[0] =~ s/\s/_/g if ( $sanitize ); - - # Loop to get the lanes/read 1 or 2 information - foreach my $lane ( sort {$a <=> $b} keys %{$Samples{$sample}} ) { - foreach my $read ( sort {$a <=> $b} keys %{$Samples{$sample}->{$lane}} ) { - if ( $fullpath ) { - my $filetmp = $Samples{$sample}->{$lane}->{$read}; - my $realpath = `realpath "$input/$filetmp"`; - chomp $realpath; - push @line_out, $realpath; - } - else { - if ( $prefix ) { - push @line_out, "$prefix/$Samples{$sample}->{$lane}->{$read}"; # May need to add $input between prefix/INPUT/name... - } - else { - push @line_out, "$input/$Samples{$sample}->{$lane}->{$read}"; - } - } - } - } - print $fhout join(",", @line_out), "\n"; +foreach my $sample ( keys %Samples ) { + + # Prepend the prefix if one is set by the user. + # Also do a quick sanity check to remove any double // that might be introduced in various ways. + my @line_out = ( $sample ); + $line_out[0] =~ s/\/\//\//; # Well that's an ugly subst regex... + $line_out[0] =~ s/\s/_/g if ( $sanitize ); + + # Loop to get the lanes/read 1 or 2 information + foreach my $lane ( sort {$a <=> $b} keys %{$Samples{$sample}} ) { + foreach my $read ( sort {$a <=> $b} keys %{$Samples{$sample}->{$lane}} ) { + if ( $fullpath ) { + my $filetmp = $Samples{$sample}->{$lane}->{$read}; + my $realpath = `realpath "$input/$filetmp"`; + chomp $realpath; + push @line_out, $realpath; + } + else { + if ( $prefix ) { + push @line_out, "$prefix/$Samples{$sample}->{$lane}->{$read}"; # May need to add $input between prefix/INPUT/name... + } + else { + push @line_out, "$input/$Samples{$sample}->{$lane}->{$read}"; + } + } + } + } + print $fhout join(",", @line_out), "\n"; } close $fhout if ( $fileout ); diff --git a/bin/sra_test.csv b/bin/sra_test.csv new file mode 100644 index 0000000..b0c2e04 --- /dev/null +++ b/bin/sra_test.csv @@ -0,0 +1,2 @@ +SRR18682785 +SRR15196412 diff --git a/bin/sra_test_full.csv b/bin/sra_test_full.csv new file mode 100755 index 0000000..20404d5 --- /dev/null +++ b/bin/sra_test_full.csv @@ -0,0 +1,3 @@ +SRR18682785 +SRR15196412 +SRR14242343 diff --git a/conf/test.config b/conf/test.config index fa4e2b7..ea454ea 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,5 +22,5 @@ params { max_time = '6.h' // Input data - add_sra_file = "${projectDir}/data/sra_test.csv" + add_sra_file = "${projectDir}/bin/sra_test.csv" } diff --git a/conf/test_full.config b/conf/test_full.config index b9861a7..6b12a7a 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -15,5 +15,5 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - add_sra_file = "${projectDir}/data/sra_test_full.csv" + add_sra_file = "${projectDir}/bin/sra_test_full.csv" }