Skip to content

Commit

Permalink
fix: empty inSilicoPredictors object in variant index
Browse files Browse the repository at this point in the history
  • Loading branch information
DSuveges committed Oct 2, 2024
1 parent 1c396d2 commit 2e55225
Showing 1 changed file with 16 additions and 9 deletions.
25 changes: 16 additions & 9 deletions src/gentropy/datasource/gnomad/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,16 +131,23 @@ def as_variant_index(self: GnomADVariants) -> VariantIndex:
.drop("locus", "alleles")
.select_globals()
.to_spark(flatten=False)
.withColumn(
"variantId",
VariantIndex.hash_long_variant_ids(
f.col("variantId"),
f.col("chromosome"),
f.col("position"),
self.lenght_threshold,
),
.withColumns(
{
# Once the parsing is done, we have to drop objects with no score from inSilicoPredictors:
"inSilicoPredictors": f.expr(
"filter(inSilicoPredictors, x -> x.score IS NOT NULL)"
).cast(t.ArrayType(t.StructType())),
# Generate a variantId that is hashed for long variant ids:
"variantId": VariantIndex.hash_long_variant_ids(
f.col("variantId"),
f.col("chromosome"),
f.col("position"),
self.lenght_threshold,
),
# We are not capturing the most severe consequence from GnomAD, but this column is needed for the schema:
"mostSevereConsequenceId": f.lit(None).cast(t.StringType()),
}
)
.withColumn("mostSevereConsequenceId", f.lit(None).cast(t.StringType()))
),
_schema=VariantIndex.get_schema(),
)

0 comments on commit 2e55225

Please sign in to comment.