From 21ac8a894d631e6fe21da333f35f106acf31ee6f Mon Sep 17 00:00:00 2001 From: jelman Date: Thu, 29 Apr 2021 13:57:29 -0700 Subject: [PATCH 1/5] Load dosage such that 2=HOM_ALT instead of 2=UNKNOWN --- software/metax/genotype/CYVCF2Genotype.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/metax/genotype/CYVCF2Genotype.py b/software/metax/genotype/CYVCF2Genotype.py index c6190de..489ffe8 100644 --- a/software/metax/genotype/CYVCF2Genotype.py +++ b/software/metax/genotype/CYVCF2Genotype.py @@ -8,7 +8,7 @@ def vcf_file_geno_lines(path, mode="genotyped", variant_mapping=None, whitelist=None, skip_palindromic=False, liftover_conversion=None): logging.log(9, "Processing vcf %s", path) - vcf_reader = VCF(path) + vcf_reader = VCF(path, gts012=True) is_dict_mapping = variant_mapping is not None and type(variant_mapping) == dict From 969ec55548f572d6d913afbbafa74bd5f6170c47 Mon Sep 17 00:00:00 2001 From: jelman Date: Thu, 29 Apr 2021 13:59:15 -0700 Subject: [PATCH 2/5] alts loaded as list, check length of first element --- software/metax/genotype/CYVCF2Genotype.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/metax/genotype/CYVCF2Genotype.py b/software/metax/genotype/CYVCF2Genotype.py index 489ffe8..a1ab1fc 100644 --- a/software/metax/genotype/CYVCF2Genotype.py +++ b/software/metax/genotype/CYVCF2Genotype.py @@ -44,7 +44,7 @@ def vcf_file_geno_lines(path, mode="genotyped", variant_mapping=None, whitelist= yield (variant_id, chr, pos, ref, alt, f) + tuple(d) elif mode == "imputed": - if len(alts) > 1: + if len(alts[0]) > 1: logging.log("VCF imputed mode doesn't support multiple ALTs, skipping %s", variant_id) continue From d2ad59fc5733bdfca77e4fd81702e9322bfcad7f Mon Sep 17 00:00:00 2001 From: jelman Date: Thu, 29 Apr 2021 14:00:47 -0700 Subject: [PATCH 3/5] Check for multiples in REFs and ALTs --- software/metax/genotype/CYVCF2Genotype.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/software/metax/genotype/CYVCF2Genotype.py b/software/metax/genotype/CYVCF2Genotype.py index a1ab1fc..a30926d 100644 --- a/software/metax/genotype/CYVCF2Genotype.py +++ b/software/metax/genotype/CYVCF2Genotype.py @@ -44,8 +44,8 @@ def vcf_file_geno_lines(path, mode="genotyped", variant_mapping=None, whitelist= yield (variant_id, chr, pos, ref, alt, f) + tuple(d) elif mode == "imputed": - if len(alts[0]) > 1: - logging.log("VCF imputed mode doesn't support multiple ALTs, skipping %s", variant_id) + if (len(ref)) | (len(alts[0])) > 1: + logging.log("VCF imputed mode doesn't support multiple REFs or ALTs, skipping %s", variant_id) continue alt = alts[0] From a88810b4ce003d48d98f246f27ef06e7737fddb1 Mon Sep 17 00:00:00 2001 From: jelman Date: Thu, 29 Apr 2021 14:01:29 -0700 Subject: [PATCH 4/5] logging.log() requires integer level --- software/metax/genotype/CYVCF2Genotype.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/metax/genotype/CYVCF2Genotype.py b/software/metax/genotype/CYVCF2Genotype.py index a30926d..b5e3d00 100644 --- a/software/metax/genotype/CYVCF2Genotype.py +++ b/software/metax/genotype/CYVCF2Genotype.py @@ -45,7 +45,7 @@ def vcf_file_geno_lines(path, mode="genotyped", variant_mapping=None, whitelist= elif mode == "imputed": if (len(ref)) | (len(alts[0])) > 1: - logging.log("VCF imputed mode doesn't support multiple REFs or ALTs, skipping %s", variant_id) + logging.log(8, "VCF imputed mode doesn't support multiple REFs or ALTs, skipping %s", variant_id) continue alt = alts[0] From bb678f31427169fea7129208e11798e701363be8 Mon Sep 17 00:00:00 2001 From: jelman Date: Thu, 29 Apr 2021 14:02:48 -0700 Subject: [PATCH 5/5] Use numpy.nanmean to allow for nans --- software/metax/genotype/CYVCF2Genotype.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/software/metax/genotype/CYVCF2Genotype.py b/software/metax/genotype/CYVCF2Genotype.py index b5e3d00..3d1face 100644 --- a/software/metax/genotype/CYVCF2Genotype.py +++ b/software/metax/genotype/CYVCF2Genotype.py @@ -40,7 +40,7 @@ def vcf_file_geno_lines(path, mode="genotyped", variant_mapping=None, whitelist= for sample in variant.genotypes: d_ = (sample[0] == a+1) + (sample[1] == a+1) d.append(d_) - f = numpy.mean(numpy.array(d,dtype=numpy.int32))/2 + f = numpy.nanmean(numpy.array(d,dtype=numpy.int32))/2 yield (variant_id, chr, pos, ref, alt, f) + tuple(d) elif mode == "imputed": @@ -60,7 +60,7 @@ def vcf_file_geno_lines(path, mode="genotyped", variant_mapping=None, whitelist= try: d = numpy.apply_along_axis(lambda x: x[0], 1, variant.format("DS")) - f = numpy.mean(numpy.array(d)) / 2 + f = numpy.nanmean(numpy.array(d)) / 2 yield (variant_id, chr, pos, ref, alt, f) + tuple(d) except KeyError: yield RuntimeError("Missing DS field when vcf mode is imputed")