Skip to content

Commit

Permalink
Some refactoring of schema generation
Browse files Browse the repository at this point in the history
  • Loading branch information
jeromekelleher committed Apr 19, 2024
1 parent d655bcf commit ad03dfc
Showing 1 changed file with 74 additions and 50 deletions.
124 changes: 74 additions & 50 deletions bio2zarr/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,10 @@ class VcfPartition:
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
)

# TODO refactor this to have embedded Contig dataclass, Filters
# and Samples dataclasses to allow for more information to be
# retained and forward compatibility.


@dataclasses.dataclass
class IcfMetadata:
Expand Down Expand Up @@ -1242,6 +1246,50 @@ def new(**kwargs):
spec._choose_compressor_settings()
return spec

@staticmethod
def from_field(
vcf_field,
*,
num_variants,
num_samples,
variants_chunk_size,
samples_chunk_size,
variable_name=None,
):
shape = [num_variants]
prefix = "variant_"
dimensions = ["variants"]
chunks = [variants_chunk_size]
if vcf_field.category == "FORMAT":
prefix = "call_"
shape.append(num_samples)
chunks.append(samples_chunk_size)
dimensions.append("samples")
if variable_name is None:
variable_name = prefix + vcf_field.name
# TODO make an option to add in the empty extra dimension
if vcf_field.summary.max_number > 1:
shape.append(vcf_field.summary.max_number)
# TODO we should really be checking this to see if the named dimensions
# are actually correct.
if vcf_field.vcf_number == "R":
dimensions.append("alleles")
elif vcf_field.vcf_number == "A":
dimensions.append("alt_alleles")
elif vcf_field.vcf_number == "G":
dimensions.append("genotypes")
else:
dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
return ZarrColumnSpec.new(
vcf_field=vcf_field.full_name,
name=variable_name,
dtype=vcf_field.smallest_dtype(),
shape=shape,
chunks=chunks,
dimensions=dimensions,
description=vcf_field.description,
)

def _choose_compressor_settings(self):
"""
Choose compressor and filter settings based on the size and
Expand All @@ -1256,8 +1304,10 @@ def _choose_compressor_settings(self):
# performance in some cases anyway. Turning on shuffle should be a
# deliberate choice.
shuffle = numcodecs.Blosc.NOSHUFFLE
if dt.itemsize == 1:
# Any 1 byte field gets BITSHUFFLE by default
if self.name == "call_genotype" and dt.itemsize == 1:
# call_genotype gets BITSHUFFLE by default as it gets
# significantly better compression (at a cost of slower
# decoding)
shuffle = numcodecs.Blosc.BITSHUFFLE
self.compressor["shuffle"] = shuffle

Expand Down Expand Up @@ -1313,6 +1363,16 @@ def generate(icf, variants_chunk_size=None, samples_chunk_size=None):
f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
)

def spec_from_field(field, variable_name=None):
return ZarrColumnSpec.from_field(
field,
num_samples=n,
num_variants=m,
samples_chunk_size=samples_chunk_size,
variants_chunk_size=variants_chunk_size,
variable_name=variable_name,
)

def fixed_field_spec(
name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
):
Expand Down Expand Up @@ -1349,67 +1409,31 @@ def fixed_field_spec(
dimensions=["variants", "alleles"],
),
fixed_field_spec(
vcf_field="POS",
name="variant_position",
dtype="i4",
),
fixed_field_spec(
vcf_field=None,
name="variant_id",
dtype="str",
),
fixed_field_spec(
vcf_field=None,
name="variant_id_mask",
dtype="bool",
),
fixed_field_spec(
vcf_field="QUAL",
name="variant_quality",
dtype="f4",
),
]
name_map = {field.full_name: field for field in icf.metadata.fields}

# Only two of the fixed fields have a direct one-to-one mapping.
colspecs.extend(
[
spec_from_field(name_map["QUAL"], variable_name="variant_quality"),
spec_from_field(name_map["POS"], variable_name="variant_position"),
]
)
colspecs.extend([spec_from_field(field) for field in icf.metadata.info_fields])

gt_field = None
for field in icf.metadata.fields:
if field.category == "fixed":
continue
for field in icf.metadata.format_fields:
if field.name == "GT":
gt_field = field
continue
shape = [m]
prefix = "variant_"
dimensions = ["variants"]
chunks = [variants_chunk_size]
if field.category == "FORMAT":
prefix = "call_"
shape.append(n)
chunks.append(samples_chunk_size)
dimensions.append("samples")
# TODO make an option to add in the empty extra dimension
if field.summary.max_number > 1:
shape.append(field.summary.max_number)
# TODO we should really be checking this to see if the named dimensions
# are actually correct.
if field.vcf_number == "R":
dimensions.append("alleles")
elif field.vcf_number == "A":
dimensions.append("alt_alleles")
elif field.vcf_number == "G":
dimensions.append("genotypes")
else:
dimensions.append(f"{field.category}_{field.name}_dim")
variable_name = prefix + field.name
colspec = ZarrColumnSpec.new(
vcf_field=field.full_name,
name=variable_name,
dtype=field.smallest_dtype(),
shape=shape,
chunks=chunks,
dimensions=dimensions,
description=field.description,
)
colspecs.append(colspec)
colspecs.append(spec_from_field(field))

if gt_field is not None:
ploidy = gt_field.summary.max_number - 1
Expand Down

0 comments on commit ad03dfc

Please sign in to comment.