Skip to content

Commit

Permalink
Add check for duplicated pool names in the input genotype file
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffersonfparil committed Jun 25, 2024
1 parent 8331afd commit fedd75a
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 20 deletions.
2 changes: 1 addition & 1 deletion src/aldknni.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1013,7 +1013,7 @@ pub fn impute_aldknni(
Ok(x) => x,
Err(_) => return Err(ImputefError{
code: 129,
message: "Error writing the output file using the write_tsv() method within impute_aldknni(): ".to_owned() + &out
message: "Error writing the output file using the write_tsv() method within impute_aldknni(): ".to_owned() + out
})
};
Ok(out)
Expand Down
36 changes: 29 additions & 7 deletions src/geno.rs
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ pub fn load_geno<'a, 'b>(
Err(_) => {
return Err(ImputefError {
code: 316,
message: "Error reading the allele frequency table file: ".to_owned() + &fname,
message: "Error reading the allele frequency table file: ".to_owned() + fname,
})
}
},
Expand All @@ -435,7 +435,7 @@ pub fn load_geno<'a, 'b>(
code: 317,
message: "Please check the format of the allele frequency table text file: "
.to_owned()
+ &fname,
+ fname,
})
}
};
Expand All @@ -460,26 +460,48 @@ pub fn load_geno<'a, 'b>(
true => (),
false => return Err(ImputefError{
code: 318,
message: "Error unable to properly parse the header line. Please make sure the allele frequency table file: ".to_owned() + &fname +" is separated by tabs, commas, or semi-colons."
message: "Error unable to properly parse the header line. Please make sure the allele frequency table file: ".to_owned() + fname +" is separated by tabs, commas, or semi-colons."
})
};
let pool_names: Vec<String> = vec_header[3..vec_header.len()]
.iter()
.map(|&x| x.to_owned())
.collect();
let n = pool_names.len();
// Check for duplicated pool names
let mut unique_pool_names: Vec<String> = vec![];
for name_source in pool_names.iter() {
let mut duplicated = false;
for name_destination in unique_pool_names.iter() {
if name_source == name_destination {
duplicated = true;
break;
}
}
if !duplicated {
unique_pool_names.push(name_source.to_string())
}
}
if n > unique_pool_names.len() {
return Err(ImputefError {
code: 139,
message: "Error: there are duplicated pool names in file: ".to_owned()
+ fname
+ " in load_geno() function.",
});
}
// If a single pool size was supplied then we are assuming the same sizes across all pools
if filter_stats.pool_sizes.len() == 1 {
filter_stats.pool_sizes = vec![filter_stats.pool_sizes[0]; n];
}
match filter_stats.pool_sizes.len() == n {
true => (),
false => return Err(ImputefError {
code: 319,
code: 320,
message:
"Error in the number of pools and the pool sizes do not match in the input file: "
.to_owned()
+ &fname,
+ fname,
}),
};
let file_geno = FileGeno {
Expand All @@ -492,11 +514,11 @@ pub fn load_geno<'a, 'b>(
) {
Ok(x) => x,
Err(_e) => return Err(ImputefError {
code: 320,
code: 321,
message:
"Error parsing the genotype data (extracted from allele frequency table text file: "
.to_owned()
+ &fname
+ fname
+ ") via convert_into_genotypes_and_phenotypes() method within impute().",
}),
};
Expand Down
32 changes: 27 additions & 5 deletions src/sync.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1262,7 +1262,7 @@ pub fn load_sync<'a, 'b>(
Err(_) => {
return Err(ImputefError {
code: 763,
message: "Error reading the input vcf file: ".to_owned() + &fname,
message: "Error reading the input vcf file: ".to_owned() + fname,
})
}
};
Expand All @@ -1273,7 +1273,7 @@ pub fn load_sync<'a, 'b>(
Err(_) => {
return Err(ImputefError {
code: 764,
message: "Error reading the input sync file: ".to_owned() + &fname,
message: "Error reading the input sync file: ".to_owned() + fname,
})
}
};
Expand All @@ -1300,11 +1300,33 @@ pub fn load_sync<'a, 'b>(
return Err(ImputefError {
code: 765,
message: "Error reading the header line of the sync file: ".to_owned()
+ &fname
+ fname
+ ". Please make sure the header line starts with '#chr'.",
})
}
};
// Check for duplicated pool names
let mut unique_pool_names: Vec<String> = vec![];
for name_source in pool_names.iter() {
let mut duplicated = false;
for name_destination in unique_pool_names.iter() {
if name_source == name_destination {
duplicated = true;
break;
}
}
if !duplicated {
unique_pool_names.push(name_source.to_string())
}
}
if n > unique_pool_names.len() {
return Err(ImputefError {
code: 766,
message: "Error: there are duplicated pool names in file: ".to_owned()
+ fname
+ " in load_sync() function.",
});
}
// If a single pool size was supplied then we are assuming the same sizes across all pools
if filter_stats.pool_sizes.len() == 1 {
filter_stats.pool_sizes = vec![filter_stats.pool_sizes[0]; n];
Expand All @@ -1313,11 +1335,11 @@ pub fn load_sync<'a, 'b>(
true => (),
false => {
return Err(ImputefError {
code: 766,
code: 767,
message:
"Error: the number of pools and the pool sizes do not match in the sync file: "
.to_owned()
+ &fname,
+ fname,
})
}
};
Expand Down
36 changes: 29 additions & 7 deletions src/vcf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -626,7 +626,7 @@ pub fn load_vcf<'a, 'b>(
return Err(ImputefError {
code: 823,
message: "Error opening the input vcf file: ".to_owned()
+ &fname
+ fname
+ " in load_vcf() function.",
})
}
Expand All @@ -639,7 +639,7 @@ pub fn load_vcf<'a, 'b>(
return Err(ImputefError {
code: 824,
message: "Error reading the input vcf file: ".to_owned()
+ &fname
+ fname
+ " in load_vcf() function.",
})
}
Expand All @@ -661,6 +661,28 @@ pub fn load_vcf<'a, 'b>(
}
}
let n = pool_names.len();
// Check for duplicated pool names
let mut unique_pool_names: Vec<String> = vec![];
for name_source in pool_names.iter() {
let mut duplicated = false;
for name_destination in unique_pool_names.iter() {
if name_source == name_destination {
duplicated = true;
break;
}
}
if !duplicated {
unique_pool_names.push(name_source.to_string())
}
}
if n > unique_pool_names.len() {
return Err(ImputefError {
code: 825,
message: "Error: there are duplicated pool names in file: ".to_owned()
+ fname
+ " in load_vcf() function.",
});
}
// If a single pool size was supplied then we are assuming the same sizes across all pools
if filter_stats.pool_sizes.len() == 1 {
filter_stats.pool_sizes = vec![filter_stats.pool_sizes[0]; n];
Expand All @@ -669,10 +691,10 @@ pub fn load_vcf<'a, 'b>(
true => (),
false => {
return Err(ImputefError {
code: 825,
code: 826,
message: "Error: the number of pools and the pool sizes do not match in file: "
.to_owned()
+ &fname
+ fname
+ " in load_vcf() function.",
})
}
Expand All @@ -690,7 +712,7 @@ pub fn load_vcf<'a, 'b>(
.read_analyse_write(filter_stats, &fname_sync_out, n_threads, vcf_to_sync) {
Ok(x) => x,
Err(_) => return Err(ImputefError{
code: 826,
code: 827,
message: "Error converting the vcf into sync via read_analyse_write() method within impute().".to_owned()
})
};
Expand All @@ -706,9 +728,9 @@ pub fn load_vcf<'a, 'b>(
.convert_into_genotypes_and_phenotypes(filter_stats, false, n_threads) {
Ok(x) => x,
Err(_e) => return Err(ImputefError{
code: 827,
code: 828,
message: "Error parsing the input genotype (converted from vcf into sync): ".to_owned() +
&fname +
fname +
" and dummy phenotype data via convert_into_genotypes_and_phenotypes() method within impute()."
})
};
Expand Down

0 comments on commit fedd75a

Please sign in to comment.