From 348b156bb15786a2e25ad728f88f14df1e4ecd79 Mon Sep 17 00:00:00 2001 From: Clay Birkett Date: Tue, 15 Oct 2024 14:31:42 -0400 Subject: [PATCH 1/2] add protocol checks for vcf upload using scripts --- bin/load_genotypes_vcf_cxgn_postgres.pl | 58 ++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/bin/load_genotypes_vcf_cxgn_postgres.pl b/bin/load_genotypes_vcf_cxgn_postgres.pl index ebd0294416..46ff758d61 100644 --- a/bin/load_genotypes_vcf_cxgn_postgres.pl +++ b/bin/load_genotypes_vcf_cxgn_postgres.pl @@ -270,7 +270,12 @@ =head1 AUTHOR }); $parser->load_plugin($opt_c); -$parser->parse_with_iterator(); +my $parser_return = $parser->parse_with_iterator(); +if ($parser->get_parse_errors()) { + my $parse_errors = $parser->get_parse_errors(); + print STDERR Dumper $parse_errors; + die("parse errors"); +} my $project_id; my $protocol = $parser->protocol_data(); @@ -278,7 +283,6 @@ =head1 AUTHOR $protocol->{'reference_genome_name'} = $reference_genome_name; $protocol->{'species_name'} = $organism_species; - my $vcf_genotyping_type = $opt_T ? $opt_T : 'vcf_snp_genotyping'; # my $genotyping_type; my $genotype_data_type; @@ -345,6 +349,56 @@ =head1 AUTHOR } } + my @protocol_match_errors; + if ($protocol_id) { + my $new_marker_data = $protocol->{markers}; + my $stored_protocol = CXGN::Genotype::Protocol->new({ + bcs_schema => $schema, + nd_protocol_id => $protocol_id + }); + my $stored_markers = $stored_protocol->markers(); + + my @all_stored_markers = keys %$stored_markers; + my %compare_marker_names = map {$_ => 1} @all_stored_markers; + my @mismatch_marker_names; + while (my ($chrom, $new_marker_data_1) = each %$new_marker_data) { + while (my ($marker_name, $new_marker_details) = each %$new_marker_data_1) { + if (exists($compare_marker_names{$marker_name})) { + while (my ($key, $value) = each %$new_marker_details) { + if ($value ne ($stored_markers->{$marker_name}->{$key})) { + push @protocol_match_errors, "Marker $marker_name in your file has $value for $key, but in the previously stored protocol shows ".$stored_markers->{$marker_name}->{$key}; + } + } + } else { + push @mismatch_marker_names, $marker_name; + } + } + } + + if (scalar(@mismatch_marker_names) > 0){ + my $marker_name_error; + $marker_name_error .= "
"; + foreach my $error ( sort @mismatch_marker_names) { + $marker_name_error .= "$error\n"; + } + print STDERR Dumper $marker_name_error; + print STDERR "These marker names in your file are not in the selected protocol.\n"; + die; + } + + if (scalar(@protocol_match_errors) > 0){ + my $protocol_warning; + foreach my $match_error (@protocol_match_errors) { + $protocol_warning .= $match_error."
"; + } + if (!$opt_A){ + print STDERR Dumper $protocol_warning; + print STDERR "Protocol match error\n"; + die; + } + } + } + $store_genotypes->store_metadata(); my $result = $store_genotypes->store_identifiers(); $protocol_id = $result->{nd_protocol_id}; From 08a18811c90a85b8d8fbc924787fc392e1f0120b Mon Sep 17 00:00:00 2001 From: Clay Birkett Date: Wed, 16 Oct 2024 08:15:35 -0400 Subject: [PATCH 2/2] code cleanup --- bin/load_genotypes_vcf_cxgn_postgres.pl | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/bin/load_genotypes_vcf_cxgn_postgres.pl b/bin/load_genotypes_vcf_cxgn_postgres.pl index 46ff758d61..ca2941425c 100644 --- a/bin/load_genotypes_vcf_cxgn_postgres.pl +++ b/bin/load_genotypes_vcf_cxgn_postgres.pl @@ -68,6 +68,7 @@ =head1 AUTHOR =cut use strict; +use warnings; use Getopt::Std; use Data::Dumper; @@ -171,7 +172,7 @@ =head1 AUTHOR open (my $Fout, ">", $opt_o) || die "Can't open file $opt_o\n"; open (my $F, "<", $file) or die "Can't open file $file \n"; my @outline; - my $lastcol; + my $lastcol = 0; while (<$F>) { if ($_ =~ m/^\##/) { print $Fout $_; @@ -245,7 +246,7 @@ =head1 AUTHOR my $organism_q = "SELECT organism_id FROM organism WHERE species = ?"; my @found_organisms; -my $h = $schema->storage->dbh()->prepare($organism_q); +$h = $schema->storage->dbh()->prepare($organism_q); $h->execute($organism_species); while (my ($organism_id) = $h->fetchrow_array()){ push @found_organisms, $organism_id; @@ -341,7 +342,7 @@ =head1 AUTHOR die; } if (scalar(@{$verified_errors->{warning_messages}}) > 0){ - my $warning_string = join ', ', @{$verified_errors->{warning_messages}}; + my $warning_string = join "\n", @{$verified_errors->{warning_messages}}; if (!$opt_A){ print STDERR Dumper $warning_string; print STDERR "You can accept these warnings and continue with store if you use -A\n"; @@ -376,12 +377,9 @@ =head1 AUTHOR } if (scalar(@mismatch_marker_names) > 0){ - my $marker_name_error; - $marker_name_error .= "
"; foreach my $error ( sort @mismatch_marker_names) { - $marker_name_error .= "$error\n"; - } - print STDERR Dumper $marker_name_error; + print STDERR "$error\n"; + } print STDERR "These marker names in your file are not in the selected protocol.\n"; die; } @@ -389,7 +387,7 @@ =head1 AUTHOR if (scalar(@protocol_match_errors) > 0){ my $protocol_warning; foreach my $match_error (@protocol_match_errors) { - $protocol_warning .= $match_error."
"; + $protocol_warning .= "$match_error\n"; } if (!$opt_A){ print STDERR Dumper $protocol_warning;