Skip to content

Commit

Permalink
Modified formatting of Perl scripts and moved test CSV files to bin
Browse files Browse the repository at this point in the history
  • Loading branch information
tives82 committed Nov 29, 2023
1 parent e12a9d0 commit 12bd4a0
Show file tree
Hide file tree
Showing 9 changed files with 126 additions and 121 deletions.
Binary file added @eaDir/.editorconfig@SynoEAStream
Binary file not shown.
Binary file added bin/@eaDir/sra_test.csv@SynoEAStream
Binary file not shown.
Binary file added bin/@eaDir/test.csv@SynoEAStream
Binary file not shown.
88 changes: 44 additions & 44 deletions bin/combine_lanes.pl
Original file line number Diff line number Diff line change
Expand Up @@ -18,31 +18,31 @@
my $help;

sub usage {
my $usage = "\ncombine_lanes.pl\nv0.1.0\n
PURPOSE: Reads and parses a CSV sample sheet from create_sample_sheet.pl to combine lanes with system calls to 'cat',
Then creates a new CSV sample sheet with the fields <name>, <r1>, <r2>
my $usage = "\ncombine_lanes.pl\nv0.1.0\n
PURPOSE: Reads and parses a CSV sample sheet from create_sample_sheet.pl to combine lanes with system calls to 'cat',
Then creates a new CSV sample sheet with the fields <name>, <r1>, <r2>
USAGE: combine_lanes.pl -i <input sample sheet> -o <output sample sheet>
USAGE: combine_lanes.pl -i <input sample sheet> -o <output sample sheet>
ARGUMENTS:
-i | --input DIR (Required). Input sample sheet, expecting csv format.
-o | --output STR (Optional). Output sample sheet file name. If no argument given, prints to STDOUT.
-v | --version Print version number and exit.
-h | --help Print this help message and exit.
ARGUMENTS:
-i | --input DIR (Required). Input sample sheet, expecting csv format.
-o | --output STR (Optional). Output sample sheet file name. If no argument given, prints to STDOUT.
-v | --version Print version number and exit.
-h | --help Print this help message and exit.
\n";
print $usage;
\n";
print $usage;
}

GetOptions( 'input|i=s' => \$input,
'out|o=s' => \$output,
'version|v' => \$version,
'help|h' => \$help,
# Parse the command-line options.
# The output option is registered under the name 'output' (keeping 'out' and 'o'
# as aliases) so that the --output flag shown in the usage text is accepted;
# with only 'out' registered, --output is rejected as an unknown option.
# On a parsing error, show the usage text and exit with a non-zero status
# (usage() prints the text itself, so its return value must not be passed to die).
GetOptions(
    'input|i=s'      => \$input,
    'output|out|o=s' => \$output,
    'version|v'      => \$version,
    'help|h'         => \$help,
) or do { usage(); exit 1; };

# Print the version number or the help message and exit if -v or -h is activated
if ( $version ) { die "combine_lanes.pl version 0.1.0\n"; }
if ( $help ) { die usage(); }
# Asking for the version or the help text is a successful outcome, so the text
# goes to STDOUT and the script exits with status 0. usage() prints the help
# text itself; passing its return value to die would add a stray "1 at ..." line.
if ( $version ) { print "combine_lanes.pl version 0.1.0\n"; exit 0; }
if ( $help )    { usage(); exit 0; }

################################################################################
# Read the sample sheet, expecting a csv format
Expand All @@ -53,37 +53,37 @@ sub usage {
# Set output filehandles
# Try to open the output file unless the output name is the "--" placeholder.
# $succout stays false when no file was opened, in which case the statements
# that follow fall back to STDOUT.
# The declaration is kept separate from the condition because the behaviour of
# `my $x = ... if COND` is undefined in Perl.
my $succout = 0;
if ( defined $output && $output ne "--" ) {
    $succout = open( OUT, ">", $output )
        or warn "combine_lanes::WARNING --> Cannot open '$output' for writing, printing to STDOUT instead. $!\n";
}
my $fhout;
if ( $succout ) { $fhout = *OUT; }
else { $fhout = *STDOUT; }
if ( $succout ) { $fhout = *OUT; }
else { $fhout = *STDOUT; }

# Print sample sheet header line
print $fhout join(",", "sample", "fastq_1", "fastq_2" ), "\n";

while ( <$fh> ) {
chomp $_;
my @line = split(",", $_);
my $seqid = shift @line;
my $ext = "fastq.gz"; # Defaulting to .gz extension

system("mkdir $seqid");

if ( scalar @line > 3 ) {
$ext = ( $line[1] =~ /\.gz$/ )? "fastq.gz" : "fastq";
for ( my $i=1; $i < scalar @line; $i++ ) {
my $reads = ( $i % 2 == 0 )? "R2" : "R1";
system("cat $line[$i] >> $seqid/$seqid\_$reads.$ext");
}
}
else {
$ext = ( $line[2] =~ /\.gz$/ )? "fastq.gz" : "fastq";
system("ln -s $line[2] $seqid/$seqid\_R1.$ext");
system("ln -s $line[2] $seqid/$seqid\_R2.$ext");
}

# Print entry out to the sample sheet with the qualified full path
my $realpathR1 = `realpath $seqid/$seqid\_R1.$ext`; chomp $realpathR1;
my $realpathR2 = `realpath $seqid/$seqid\_R2.$ext`; chomp $realpathR2;
print $fhout join(",", $seqid, $realpathR1, $realpathR2), "\n";
# Process the input sample sheet one row at a time.
# For every row: create a directory named after the sequence ID, build one R1
# and one R2 file inside it (concatenating lanes, or symlinking when there is
# only a single lane), and print "<id>,<R1 path>,<R2 path>" to the output sheet.
# After the sequence ID is shifted off, R1 files are read from the odd indices
# of @line and R2 files from the even indices, starting at index 1.
# NOTE(review): index 0 of @line is never read here -- confirm the column layout
# of the incoming sheet.
while ( my $row = <$fh> ) {
    chomp $row;
    next if $row !~ /\S/;    # Skip blank rows instead of creating an unnamed directory

    my @line  = split( ",", $row );
    my $seqid = shift @line;
    my $ext   = "fastq.gz";  # Defaulting to .gz extension

    # List-form system() passes the name straight to mkdir without a shell
    system( "mkdir", $seqid );

    if ( scalar @line > 3 ) {
        # Several lanes: append each lane's file onto the combined R1 or R2 file
        $ext = ( $line[1] =~ /\.gz$/ ) ? "fastq.gz" : "fastq";
        for my $i ( 1 .. $#line ) {
            my $reads = ( $i % 2 == 0 ) ? "R2" : "R1";
            # The redirection needs a shell; quote the paths so spaces do not split them
            system( qq(cat "$line[$i]" >> "$seqid/${seqid}_$reads.$ext") );
        }
    }
    else {
        # Single lane: nothing to concatenate, so link each mate to its own file.
        # R1 comes from index 1 and R2 from index 2, matching the branch above
        # (previously both links pointed at index 2).
        $ext = ( $line[2] =~ /\.gz$/ ) ? "fastq.gz" : "fastq";
        system( "ln", "-s", $line[1], "$seqid/${seqid}_R1.$ext" );
        system( "ln", "-s", $line[2], "$seqid/${seqid}_R2.$ext" );
    }

    # Print entry out to the sample sheet with the qualified full path
    my $realpathR1 = `realpath "$seqid/${seqid}_R1.$ext"`;
    chomp $realpathR1;
    my $realpathR2 = `realpath "$seqid/${seqid}_R2.$ext"`;
    chomp $realpathR2;
    print {$fhout} join( ",", $seqid, $realpathR1, $realpathR2 ), "\n";
}
close $fh if ( $succin );
close $fhout if ( $succout );
Expand Down
150 changes: 75 additions & 75 deletions bin/create_samplesheet.pl
Original file line number Diff line number Diff line change
Expand Up @@ -22,105 +22,105 @@
my $help;

sub usage {
my $usage = "\ncreate_samplesheet.pl\nv0.1.0\n
PURPOSE: Reads the contents of a given directory (expected to contain sequencing read data files in FASTQ format, optionally gzip-compressed),
then uses standard file naming conventions to match up samples with their read mate information (including potentially multiple sequencing lanes),
and finally, formats and outputs a csv sample sheet.
USAGE: create_samplesheet.pl -i <input directory> -o <output>
ARGUMENTS:
-i | --input DIR (Required). Input directory name.
-o | --output STR (Optional). Output sample sheet file name. If no argument given, prints to STDOUT.
-p | --prefix STR (Optional). Prepend a STR prefix to sequence names (to set directories not on current system).
-f | --fullpath (Optional flag). Call realpath and set the fully qualified path in the sequence ID fields.
-s | --sanitize (Optional flag). Find and replace spaces in the sequence ID field.
-v | --version Print version number and exit.
-h | --help Print this help message and exit.
\n";
print $usage;
my $usage = "\ncreate_samplesheet.pl\nv0.1.0\n
PURPOSE: Reads the contents of a given directory (expected to contain sequencing read data files in FASTQ format, optionally gzip-compressed),
then uses standard file naming conventions to match up samples with their read mate information (including potentially multiple sequencing lanes),
and finally, formats and outputs a csv sample sheet.
USAGE: create_samplesheet.pl -i <input directory> -o <output>
ARGUMENTS:
-i | --input DIR (Required). Input directory name.
-o | --output STR (Optional). Output sample sheet file name. If no argument given, prints to STDOUT.
-p | --prefix STR (Optional). Prepend a STR prefix to sequence names (to set directories not on current system).
-f | --fullpath (Optional flag). Call realpath and set the fully qualified path in the sequence ID fields.
-s | --sanitize (Optional flag). Find and replace spaces in the sequence ID field.
-v | --version Print version number and exit.
-h | --help Print this help message and exit.
\n";
print $usage;
}

GetOptions( 'input|i=s' => \$input,
'out|o=s' => \$output,
'prefix|p=s' => \$prefix,
'sanitize|s' => \$sanitize,
'fullpath|f' => \$fullpath,
'version|v' => \$version,
'help|h' => \$help,
# Parse the command-line options.
# The output option is registered under the name 'output' (keeping 'out' and 'o'
# as aliases) so that the --output flag shown in the usage text is accepted;
# with only 'out' registered, --output is rejected as an unknown option.
# On a parsing error, show the usage text and exit with a non-zero status
# (usage() prints the text itself, so its return value must not be passed to die).
GetOptions(
    'input|i=s'      => \$input,
    'output|out|o=s' => \$output,
    'prefix|p=s'     => \$prefix,
    'sanitize|s'     => \$sanitize,
    'fullpath|f'     => \$fullpath,
    'version|v'      => \$version,
    'help|h'         => \$help,
) or do { usage(); exit 1; };

# Print the version number or the help message and exit if -v or -h is activated
if ( $version ) { die "create_samplesheet.pl version 0.1.0\n"; }
if ( $help ) { die usage(); }
# Asking for the version or the help text is a successful outcome, so the text
# goes to STDOUT and the script exits with status 0. usage() prints the help
# text itself; passing its return value to die would add a stray "1 at ..." line.
if ( $version ) { print "create_samplesheet.pl version 0.1.0\n"; exit 0; }
if ( $help )    { usage(); exit 0; }

################################################################################
# Read the input directory, capturing all files that end in .fastq (optionally .gz) or some flavor of this extension
opendir( my $dh, $input ) or die "create_samplesheet::ERROR --> Cannot read the given input directory. $!\n";
my @fastqs;
while ( readdir $dh ) {
chomp $_;
while ( readdir $dh ) {
chomp $_;

# Keep this file if it is a FASTQ file
# WARNING: A curse upon ye who put .fastq somewhere in the filename other than the extension
# This *should* work even with full paths that might contain 'fastq' in the path, e.g. path/to/data/fastq/file1
push @fastqs, $_ if ( $_ =~ /\.fq/ || $_ =~ /\.fastq/ );
# Keep this file if it is a FASTQ file
# WARNING: A curse upon ye who put .fastq somewhere in the filename other than the extension
# This *should* work even with full paths that might contain 'fastq' in the path, e.g. path/to/data/fastq/file1
push @fastqs, $_ if ( $_ =~ /\.fq/ || $_ =~ /\.fastq/ );
}

# For each FASTQ file:
# 1) deconstruct the string to get the sample/file name, lane info, and read information.
# 2) Then, populate a lookup table to store matching information per sample.
my %Samples = ();
foreach my $file ( @fastqs ) {

# Deconstruct the sample name -- here I'm doing it with regex capture groups
my $filetmp = $file;
$filetmp =~ s/\s/_/g;
my ( $id, $fr_read ) = $filetmp =~ /(.*)_[Rr]?(1|2)/;
my ( $lane ) = $id =~ s/_L(\d{3})?//;
$lane = ( $1 eq $id )? "001" : $1; # Default to 001 if there is no lane information

# Add the file to the lookup table
# Hierarchy is SAMPLENAME ==> LANE ==> READ (1|2) = FILENAME
$Samples{$id}->{$lane}->{$fr_read} = $file;
# For each FASTQ file name, work out the sample ID, the lane and the mate
# (1 or 2), then record the file in the lookup table.
foreach my $file ( @fastqs ) {

    # Work on a copy with whitespace replaced, so the stored file name stays untouched
    my $filetmp = $file;
    $filetmp =~ s/\s/_/g;

    # The greedy capture takes everything before the last _1/_2 (optionally
    # written _R1/_R2 or _r1/_r2) as the ID, and the digit as the mate.
    my ( $id, $fr_read ) = $filetmp =~ /(.*)_[Rr]?(1|2)/;
    if ( !defined $id ) {
        # Without mate information the file cannot be placed in the table
        warn "create_samplesheet::WARNING --> Cannot find read (1|2) information in '$file', skipping.\n";
        next;
    }

    # Remove a lane tag of the form _L### from the ID and keep its digits.
    # The three digits are mandatory and must not be followed by another digit,
    # so names that merely contain "_L" (e.g. "_Liver") are left intact.
    # Default to 001 if there is no lane information.
    my $lane = ( $id =~ s/_L(\d{3})(?!\d)// ) ? $1 : "001";

    # Add the file to the lookup table
    # Hierarchy is SAMPLENAME ==> LANE ==> READ (1|2) = FILENAME
    $Samples{$id}->{$lane}->{$fr_read} = $file;
}

# Set output filehandles
# Try to open the output file unless the output name is the "--" placeholder.
# $fileout stays false when no file was opened, in which case the statements
# that follow fall back to STDOUT.
# The declaration is kept separate from the condition because the behaviour of
# `my $x = ... if COND` is undefined in Perl.
my $fileout = 0;
if ( defined $output && $output ne "--" ) {
    $fileout = open( OUT, ">", $output )
        or warn "create_samplesheet::WARNING --> Cannot open '$output' for writing, printing to STDOUT instead. $!\n";
}
my $fhout;
if ( $fileout ) { $fhout = *OUT; }
else { $fhout = *STDOUT; }
if ( $fileout ) { $fhout = *OUT; }
else { $fhout = *STDOUT; }

# Print out a csv sample sheet with format <name>, <r1>, <r2>, <r1b>, <r2b>, ... <r1n>, <r2n>
foreach my $sample ( keys %Samples ) {

# Prepend the prefix if one is set by the user.
# Also do a quick sanity check to remove any double // that might be introduced in various ways.
my @line_out = ( $sample );
$line_out[0] =~ s/\/\//\//; # Well that's an ugly subst regex...
$line_out[0] =~ s/\s/_/g if ( $sanitize );

# Loop to get the lanes/read 1 or 2 information
foreach my $lane ( sort {$a <=> $b} keys %{$Samples{$sample}} ) {
foreach my $read ( sort {$a <=> $b} keys %{$Samples{$sample}->{$lane}} ) {
if ( $fullpath ) {
my $filetmp = $Samples{$sample}->{$lane}->{$read};
my $realpath = `realpath "$input/$filetmp"`;
chomp $realpath;
push @line_out, $realpath;
}
else {
if ( $prefix ) {
push @line_out, "$prefix/$Samples{$sample}->{$lane}->{$read}"; # May need to add $input between prefix/INPUT/name...
}
else {
push @line_out, "$input/$Samples{$sample}->{$lane}->{$read}";
}
}
}
}
print $fhout join(",", @line_out), "\n";
# Write one CSV row per sample: the sample name followed by its files, ordered
# by lane and then by read (1 before 2).
# Samples are visited in sorted order so that the sheet is identical from run
# to run; the order returned by keys on a hash is not stable between runs.
foreach my $sample ( sort keys %Samples ) {

    # The first field is the sample name. Collapse any doubled slashes in it and,
    # when --sanitize is set, replace whitespace with underscores.
    my @line_out = ( $sample );
    $line_out[0] =~ s{//}{/}g;
    $line_out[0] =~ s/\s/_/g if ( $sanitize );

    # Loop to get the lanes/read 1 or 2 information
    foreach my $lane ( sort { $a <=> $b } keys %{ $Samples{$sample} } ) {
        foreach my $read ( sort { $a <=> $b } keys %{ $Samples{$sample}->{$lane} } ) {
            my $file = $Samples{$sample}->{$lane}->{$read};

            if ( $fullpath ) {
                # Resolve the fully qualified path of the file inside the input directory
                my $realpath = `realpath "$input/$file"`;
                chomp $realpath;
                push @line_out, $realpath;
            }
            elsif ( $prefix ) {
                # The user-supplied prefix replaces the input directory
                push @line_out, "$prefix/$file";    # May need to add $input between prefix/INPUT/name...
            }
            else {
                push @line_out, "$input/$file";
            }
        }
    }
    print {$fhout} join( ",", @line_out ), "\n";
}
close $fhout if ( $fileout );

Expand Down
2 changes: 2 additions & 0 deletions bin/sra_test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
SRR18682785
SRR15196412
3 changes: 3 additions & 0 deletions bin/sra_test_full.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
SRR18682785
SRR15196412
SRR14242343
2 changes: 1 addition & 1 deletion conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@ params {
max_time = '6.h'

// Input data
add_sra_file = "${projectDir}/data/sra_test.csv"
add_sra_file = "${projectDir}/bin/sra_test.csv"
}
2 changes: 1 addition & 1 deletion conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ params {
config_profile_description = 'Full test dataset to check pipeline function'

// Input data for full size test
add_sra_file = "${projectDir}/data/sra_test_full.csv"
add_sra_file = "${projectDir}/bin/sra_test_full.csv"
}

0 comments on commit 12bd4a0

Please sign in to comment.