Modified formatting of Perl scripts and moved test CSV files to bin

UPHL-BioNGS · Nov 29, 2023 · 12bd4a0 · 12bd4a0
1 parent e12a9d0
commit 12bd4a0
Show file tree

Hide file tree

Showing 9 changed files with 126 additions and 121 deletions.
diff --git a/@eaDir/.editorconfig@SynoEAStream b/@eaDir/.editorconfig@SynoEAStream
diff --git a/bin/@eaDir/sra_test.csv@SynoEAStream b/bin/@eaDir/sra_test.csv@SynoEAStream
diff --git a/bin/@eaDir/test.csv@SynoEAStream b/bin/@eaDir/test.csv@SynoEAStream
diff --git a/bin/combine_lanes.pl b/bin/combine_lanes.pl
@@ -18,31 +18,31 @@
 my $help;
 
 sub usage {
-	my $usage = "\ncombine_lanes.pl\nv0.1.0\n
-	PURPOSE: Reads and parses a CSV sample sheet from create_sample_sheet.pl to combine lanes with system calls to 'cat',
-		 Then creates a new CSV sample sheet with the fields <name>, <r1>, <r2>
+    my $usage = "\ncombine_lanes.pl\nv0.1.0\n
+        PURPOSE: Reads and parses a CSV sample sheet from create_sample_sheet.pl to combine lanes with system calls to 'cat',
+                 Then creates a new CSV sample sheet with the fields <name>, <r1>, <r2>
 
-	USAGE:	combine_lanes.pl -i <input sample sheet> -o <output sample sheet>
+        USAGE:   combine_lanes.pl -i <input sample sheet> -o <output sample sheet>
 
-	ARGUMENTS:
-	-i | --input		DIR (Required).  Input sample sheet, expecting csv format.
-	-o | --output		STR (Optional).  Output sample sheet file name.  If no argument given, prints to STDOUT.
-	-v | --version	 	Print version number and exit.
-	-h | --help		Print this help message and exit.
+        ARGUMENTS:
+        -i | --input        DIR (Required).  Input sample sheet, expecting csv format.
+        -o | --output       STR (Optional).  Output sample sheet file name.  If no argument given, prints to STDOUT.
+        -v | --version      Print version number and exit.
+        -h | --help         Print this help message and exit.
 
-	\n";
-	print $usage;
+    \n";
+    print $usage;
 }
 
-GetOptions(	'input|i=s'  => \$input,
-			'out|o=s'    => \$output,
-			'version|v'  => \$version,
-			'help|h'     => \$help,
+GetOptions( 'input|i=s'  => \$input,
+            'out|o=s'    => \$output,
+            'version|v'  => \$version,
+            'help|h'     => \$help,
 ) or die usage();
 
 # Print the version number or the help message and exit if -v or -h is activated
-if ( $version ) 	{ die "combine_lanes.pl version 0.1.0\n"; }
-if ( $help    )     { die usage();											}
+if ( $version ) { die "combine_lanes.pl version 0.1.0\n"; }
+if ( $help    ) { die usage();                                           }
 
 ################################################################################
 # Read the sample sheet, expecting a csv format
@@ -53,37 +53,37 @@ sub usage {
 # Set output filehandles
 my $succout = open( OUT, ">", "$output" ) if $output ne "--";
 my $fhout;
-if ( $succout )		{	$fhout = *OUT;			}
-else				{	$fhout = *STDOUT;		}
+if ( $succout )     {   $fhout = *OUT;           }
+else                {   $fhout = *STDOUT;        }
 
 # Print sample sheet header line
 print $fhout join(",", "sample", "fastq_1", "fastq_2" ), "\n";
 
-while ( <$fh> )		{
-	chomp $_;
-	my @line = split(",", $_);
-	my $seqid = shift @line;
-	my $ext = "fastq.gz"; 			# Defaulting to .gz extension
-
-	system("mkdir $seqid");
-
-	if ( scalar @line > 3 )	{
-		$ext = ( $line[1] =~ /\.gz$/ )? "fastq.gz" : "fastq";
-		for ( my $i=1; $i < scalar @line; $i++ )	{
-			my $reads = ( $i % 2 == 0 )? "R2" : "R1";
-			system("cat $line[$i] >> $seqid/$seqid\_$reads.$ext");
-		}
-	}
-	else 	{
-		$ext = ( $line[2] =~ /\.gz$/ )? "fastq.gz" : "fastq";
-		system("ln -s $line[2] $seqid/$seqid\_R1.$ext");
-		system("ln -s $line[2] $seqid/$seqid\_R2.$ext");
-	}
-
-	# Print entry out to the sample sheet with the qualified full path
-	my $realpathR1 = `realpath $seqid/$seqid\_R1.$ext`; chomp $realpathR1;
-	my $realpathR2 = `realpath $seqid/$seqid\_R2.$ext`; chomp $realpathR2;
-	print $fhout join(",", $seqid, $realpathR1, $realpathR2), "\n";
+while ( <$fh> ) {
+    chomp $_;
+    my @line = split(",", $_);
+    my $seqid = shift @line;
+    my $ext = "fastq.gz";           # Defaulting to .gz extension
+
+    system("mkdir $seqid");
+
+    if ( scalar @line > 3 ) {
+        $ext = ( $line[1] =~ /\.gz$/ )? "fastq.gz" : "fastq";
+        for ( my $i=1; $i < scalar @line; $i++ ) {
+            my $reads = ( $i % 2 == 0 )? "R2" : "R1";
+            system("cat $line[$i] >> $seqid/$seqid\_$reads.$ext");
+        }
+    }
+    else    {
+        $ext = ( $line[2] =~ /\.gz$/ )? "fastq.gz" : "fastq";
+        system("ln -s $line[2] $seqid/$seqid\_R1.$ext");
+        system("ln -s $line[2] $seqid/$seqid\_R2.$ext");
+    }
+
+    # Print entry out to the sample sheet with the qualified full path
+    my $realpathR1 = `realpath $seqid/$seqid\_R1.$ext`; chomp $realpathR1;
+    my $realpathR2 = `realpath $seqid/$seqid\_R2.$ext`; chomp $realpathR2;
+    print $fhout join(",", $seqid, $realpathR1, $realpathR2), "\n";
 }
 close $fh if ( $succin );
 close $fhout if ( $succout );

diff --git a/bin/create_samplesheet.pl b/bin/create_samplesheet.pl
@@ -22,105 +22,105 @@
 my $help;
 
 sub usage {
-	my $usage = "\ncreate_samplesheet.pl\nv0.1.0\n
-	PURPOSE: Reads the contents of a given directory (expected to contain sequencing read data files in FASTQ format, optionally gzip-compressed),
-		 then uses standard file naming conventions to match up samples with their read mate information (including potentially multiple sequencing lanes),
-		 and finally, formats and outputs a csv sample sheet.
-
-	USAGE:	create_samplesheet.pl -i <input directory> -o <output>
-
-	ARGUMENTS:
-	-i | --input		DIR (Required).  Input directory name.
-	-o | --output		STR (Optional).  Output sample sheet file name.  If no argument given, prints to STDOUT.
-	-p | --prefix		STR (Optional).  Prepend a STR prefix to sequence names (to set directories not on current system).
-	-f | --fullpath 	(Optional flag). Call realpath and set the fully qualified path in the sequence ID fields.
-	-s | --sanitize		(Optional flag). Find and replace spaces in the sequence ID field.
-	-v | --version	 	Print version number and exit.
-	-h | --help		Print this help message and exit.
-
-	\n";
-	print $usage;
+    my $usage = "\ncreate_samplesheet.pl\nv0.1.0\n
+        PURPOSE: Reads the contents of a given directory (expected to contain sequencing read data files in FASTQ format, optionally gzip-compressed),
+                 then uses standard file naming conventions to match up samples with their read mate information (including potentially multiple sequencing lanes),
+                 and finally, formats and outputs a csv sample sheet.
+
+        USAGE:   create_samplesheet.pl -i <input directory> -o <output>
+
+        ARGUMENTS:
+        -i | --input        DIR (Required).  Input directory name.
+        -o | --output       STR (Optional).  Output sample sheet file name.  If no argument given, prints to STDOUT.
+        -p | --prefix       STR (Optional).  Prepend a STR prefix to sequence names (to set directories not on current system).
+        -f | --fullpath     (Optional flag). Call realpath and set the fully qualified path in the sequence ID fields.
+        -s | --sanitize     (Optional flag). Find and replace spaces in the sequence ID field.
+        -v | --version      Print version number and exit.
+        -h | --help         Print this help message and exit.
+
+    \n";
+    print $usage;
 }
 
-GetOptions(	'input|i=s'  => \$input,
-			'out|o=s'    => \$output,
-			'prefix|p=s' => \$prefix,
-			'sanitize|s' => \$sanitize,
-			'fullpath|f' => \$fullpath,
-			'version|v'  => \$version,
-			'help|h'     => \$help,
+GetOptions( 'input|i=s'  => \$input,
+            'out|o=s'    => \$output,
+            'prefix|p=s' => \$prefix,
+            'sanitize|s' => \$sanitize,
+            'fullpath|f' => \$fullpath,
+            'version|v'  => \$version,
+            'help|h'     => \$help,
 ) or die usage();
 
 # Print the version number or the help message and exit if -v or -h is activated
-if ( $version ) 	{ die "create_samplesheet.pl version 0.1.0\n"; }
-if ( $help    )     { die usage();											}
+if ( $version ) { die "create_samplesheet.pl version 0.1.0\n"; }
+if ( $help    ) { die usage();                                           }
 
 ################################################################################
 # Read the input directory, capturing all files that end in .fastq (optionally .gz) or some flavor of this extension
 opendir( my $dh, $input ) or die "create_samplesheet::ERROR --> Cannot read the given input directory.  $!\n";
 my @fastqs;
-while ( readdir $dh )	{
-	chomp $_;
+while ( readdir $dh )   {
+    chomp $_;
 
-	# Keep this file if it is a FASTQ file
-	# WARNING: A curse upon ye who put .fastq somewhere in the filename other than the extension
-	# This *should* work even with full paths that might contain 'fastq' in the path, e.g. path/to/data/fastq/file1
-	push @fastqs, $_ if ( $_ =~ /\.fq/ || $_ =~ /\.fastq/ );
+    # Keep this file if it is a FASTQ file
+    # WARNING: A curse upon ye who put .fastq somewhere in the filename other than the extension
+    # This *should* work even with full paths that might contain 'fastq' in the path, e.g. path/to/data/fastq/file1
+    push @fastqs, $_ if ( $_ =~ /\.fq/ || $_ =~ /\.fastq/ );
 }
 
 # For each FASTQ file:
 # 1) deconstruct the string to get the sample/file name, lane info, and read information.
 # 2) Then, populate a lookup table to store matching information per sample.
 my %Samples = ();
-foreach my $file ( @fastqs )	{
-
-	# Deconstruct the sample name -- here I'm doing it with regex capture groups
-	my $filetmp = $file;
-	$filetmp =~ s/\s/_/g;
-	my ( $id, $fr_read ) = $filetmp =~ /(.*)_[Rr]?(1|2)/;
-	my ( $lane ) = $id =~ s/_L(\d{3})?//;
-	$lane = ( $1 eq $id )? "001" : $1;			# Default to 001 if there is no lane information
-
-	# Add the file to the lookup table
-	# Hierarchy is SAMPLENAME ==> LANE ==> READ (1|2) = FILENAME
-	$Samples{$id}->{$lane}->{$fr_read} = $file;
+foreach my $file ( @fastqs )   {
+
+    # Deconstruct the sample name -- here I'm doing it with regex capture groups
+    my $filetmp = $file;
+    $filetmp =~ s/\s/_/g;
+    my ( $id, $fr_read ) = $filetmp =~ /(.*)_[Rr]?(1|2)/;
+    my ( $lane ) = $id =~ s/_L(\d{3})?//;
+    $lane = ( $1 eq $id )? "001" : $1;          # Default to 001 if there is no lane information
+
+    # Add the file to the lookup table
+    # Hierarchy is SAMPLENAME ==> LANE ==> READ (1|2) = FILENAME
+    $Samples{$id}->{$lane}->{$fr_read} = $file;
 }
 
 # Set output filehandles
 my $fileout = open( OUT, ">", "$output" ) if $output ne "--";
 my $fhout;
-if ( $fileout )		{	$fhout = *OUT;			}
-else				{	$fhout = *STDOUT;		}
+if ( $fileout )     {   $fhout = *OUT;           }
+else                {   $fhout = *STDOUT;        }
 
 # Print out a csv sample sheet with format <name>, <r1>, <r2>, <r1b>, <r2b>, ... <r1n>, <r2n>
-foreach my $sample ( keys %Samples )		{
-
-	# Prepend the prefix if one is set by the user.
-	# Also do a quick sanity check to remove any double // that might be introduced in various ways.
-	my @line_out = ( $sample );
-	$line_out[0] =~ s/\/\//\//;			# Well that's an ugly subst regex...
-	$line_out[0] =~ s/\s/_/g if ( $sanitize );
-
-	# Loop to get the lanes/read 1 or 2 information
-	foreach my $lane ( sort {$a <=> $b} keys %{$Samples{$sample}} )	{
-		foreach my $read ( sort {$a <=> $b} keys %{$Samples{$sample}->{$lane}} )	{
-			if ( $fullpath ) 	{
-				my $filetmp = $Samples{$sample}->{$lane}->{$read};
-				my $realpath = `realpath "$input/$filetmp"`;
-				chomp $realpath;
-				push @line_out, $realpath;
-			}
-			else	{
-				if ( $prefix ) 	{
-					push @line_out, "$prefix/$Samples{$sample}->{$lane}->{$read}";		# May need to add $input between prefix/INPUT/name...
-				}
-				else 	{
-					push @line_out, "$input/$Samples{$sample}->{$lane}->{$read}";
-				}
-			}
-		}
-	}
-	print $fhout join(",", @line_out), "\n";
+foreach my $sample ( keys %Samples )   {
+
+    # Prepend the prefix if one is set by the user.
+    # Also do a quick sanity check to remove any double // that might be introduced in various ways.
+    my @line_out = ( $sample );
+    $line_out[0] =~ s/\/\//\//;          # Well that's an ugly subst regex...
+    $line_out[0] =~ s/\s/_/g if ( $sanitize );
+
+    # Loop to get the lanes/read 1 or 2 information
+    foreach my $lane ( sort {$a <=> $b} keys %{$Samples{$sample}} )   {
+        foreach my $read ( sort {$a <=> $b} keys %{$Samples{$sample}->{$lane}} )   {
+            if ( $fullpath )   {
+                my $filetmp = $Samples{$sample}->{$lane}->{$read};
+                my $realpath = `realpath "$input/$filetmp"`;
+                chomp $realpath;
+                push @line_out, $realpath;
+            }
+            else    {
+                if ( $prefix )  {
+                    push @line_out, "$prefix/$Samples{$sample}->{$lane}->{$read}";        # May need to add $input between prefix/INPUT/name...
+                }
+                else    {
+                    push @line_out, "$input/$Samples{$sample}->{$lane}->{$read}";
+                }
+            }
+        }
+    }
+    print $fhout join(",", @line_out), "\n";
 }
 close $fhout if ( $fileout );
 

diff --git a/bin/sra_test.csv b/bin/sra_test.csv
@@ -0,0 +1,2 @@
+SRR18682785
+SRR15196412
diff --git a/bin/sra_test_full.csv b/bin/sra_test_full.csv
@@ -0,0 +1,3 @@
+SRR18682785
+SRR15196412
+SRR14242343
diff --git a/conf/test.config b/conf/test.config
@@ -22,5 +22,5 @@ params {
     max_time   = '6.h'
 
     // Input data
-    add_sra_file = "${projectDir}/data/sra_test.csv"
+    add_sra_file = "${projectDir}/bin/sra_test.csv"
 }
diff --git a/conf/test_full.config b/conf/test_full.config
@@ -15,5 +15,5 @@ params {
     config_profile_description = 'Full test dataset to check pipeline function'
 
     // Input data for full size test
-    add_sra_file = "${projectDir}/data/sra_test_full.csv"
+    add_sra_file = "${projectDir}/bin/sra_test_full.csv"
 }