From cda913f3648d9c8599b7eeb09c2810ae08d18bc3 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Tue, 23 Jan 2024 15:35:42 +0100 Subject: [PATCH 1/7] bump to dev version --- svync.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/svync.go b/svync.go index b986b70..497dd86 100644 --- a/svync.go +++ b/svync.go @@ -13,7 +13,7 @@ func main() { Name: "svync", Usage: "A tool to standardize VCF files from structural variant callers", HideHelpCommand: true, - Version: "0.1.2", + Version: "0.2.0dev", Flags: []cli.Flag{ &cli.BoolFlag{ Name: "nodate", From b15f48e3a70a190bac1d902061bf06b8a86d8fb0 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Wed, 19 Mar 2025 17:15:17 +0100 Subject: [PATCH 2/7] Add defaults for values --- svync_api/config.go | 5 ++++- svync_api/execute.go | 2 +- svync_api/resolve.go | 32 +++++++++++++++++++------------- svync_api/standardize.go | 6 +++--- svync_api/structs.go | 1 + svync_api/variant.go | 6 +++--- 6 files changed, 31 insertions(+), 21 deletions(-) diff --git a/svync_api/config.go b/svync_api/config.go index fd9d0eb..734cc03 100644 --- a/svync_api/config.go +++ b/svync_api/config.go @@ -79,7 +79,10 @@ func (config *Config) defineMissing() { // Format fields if _, ok := config.Format["GT"]; !ok { config.Format["GT"] = ConfigInput{ - Value: "$FORMAT/GT", + Value: "$FORMAT/GT", + Defaults: map[string]string{ + "$FORMAT/GT": "./.", + }, Number: "1", Type: "String", Description: "Genotype", diff --git a/svync_api/execute.go b/svync_api/execute.go index fbbd396..5fb961a 100644 --- a/svync_api/execute.go +++ b/svync_api/execute.go @@ -20,10 +20,10 @@ func Execute(Cctx *cli.Context, config *Config) { file := Cctx.String("input") inputVcf, err := os.Open(file) - defer inputVcf.Close() if err != nil { logger.Fatal(err) } + defer inputVcf.Close() header := newHeader() breakEndVariants := &map[string]Variant{} headerIsMade := false diff --git a/svync_api/resolve.go b/svync_api/resolve.go index 357bc3f..ff9eaef 100644 
--- a/svync_api/resolve.go +++ b/svync_api/resolve.go @@ -12,7 +12,7 @@ import ( ) // Resolve a value -func ResolveValue(input string, variant *Variant, format *VariantFormat, Cctx *cli.Context) string { +func ResolveValue(input string, variant *Variant, format *VariantFormat, Cctx *cli.Context, config *Config) string { logger := log.New(os.Stderr, "", 0) // Replace all the FORMAT fields @@ -21,16 +21,19 @@ func ResolveValue(input string, variant *Variant, format *VariantFormat, Cctx *c if len(allFormats) > 0 && format == nil { logger.Fatalf("Cannot use a FORMAT field in a non-FORMAT context, please check your config file") } - for _, stringToReplace := range allFormats { - fieldSlice := strings.Split(stringToReplace, "/") + for _, rawField := range allFormats { + fieldSlice := strings.Split(rawField, "/") field := fieldSlice[1] formatValue, ok := format.Content[field] - // TODO implement some alternative way to handle missing fields if !ok { - if !Cctx.Bool("mute-warnings") { - logger.Printf("The field %s is not present in the FORMAT fields of the variant with ID %s, excluding it from this variant", field, variant.Id) + // Check if the field is a default value + defaults := config.Format[field].Defaults + if defaultValue, ok := defaults[rawField]; ok { + formatValue = []string{defaultValue} + } else if !Cctx.Bool("mute-warnings") { + logger.Printf("The field %s is not present in the FORMAT fields of the variant with ID %s, excluding it from this variant. 
Supply a default to mute this warning", field, variant.Id) } } else if len(fieldSlice) > 2 { index, err := strconv.ParseInt(fieldSlice[2], 0, 64) @@ -39,23 +42,26 @@ func ResolveValue(input string, variant *Variant, format *VariantFormat, Cctx *c } formatValue = []string{formatValue[index]} } - input = strings.ReplaceAll(input, stringToReplace, strings.Join(formatValue, ",")) + input = strings.ReplaceAll(input, rawField, strings.Join(formatValue, ",")) } // Replace all the INFO fields infoRegex := regexp.MustCompile(`\$INFO/[\w\d]+(/\d+)?`) allInfos := infoRegex.FindAllString(input, -1) - for _, stringToReplace := range allInfos { - fieldSlice := strings.Split(stringToReplace, "/") + for _, rawField := range allInfos { + fieldSlice := strings.Split(rawField, "/") field := fieldSlice[1] info, ok := variant.Info[field] - // TODO implement some alternative way to handle missing fields if !ok { + // Check if the field is a default value + defaults := config.Info[field].Defaults infoType := variant.Header.Info[field].Type - if infoType != "Flag" && !Cctx.Bool("mute-warnings") { - logger.Printf("The field %s is not present in the INFO fields of the variant with ID %s, excluding it from this variant", field, variant.Id) + if defaultValue, ok := defaults[rawField]; ok { + info = []string{defaultValue} + } else if infoType != "Flag" && !Cctx.Bool("mute-warnings") { + logger.Printf("The field %s is not present in the FORMAT fields of the variant with ID %s, excluding it from this variant. 
Supply a default to mute this warning", field, variant.Id) } } else if len(fieldSlice) > 2 { index, err := strconv.ParseInt(fieldSlice[2], 0, 64) @@ -64,7 +70,7 @@ func ResolveValue(input string, variant *Variant, format *VariantFormat, Cctx *c } info = []string{info[index]} } - input = strings.ReplaceAll(input, stringToReplace, strings.Join(info, ",")) + input = strings.ReplaceAll(input, rawField, strings.Join(info, ",")) } // Replace POS fields diff --git a/svync_api/standardize.go b/svync_api/standardize.go index bc5ae77..8c3daca 100644 --- a/svync_api/standardize.go +++ b/svync_api/standardize.go @@ -102,7 +102,7 @@ func (variant *Variant) standardize(config *Config, Cctx *cli.Context, count int } } - standardizedVariant.Id = fmt.Sprintf("%s_%v", ResolveValue(config.Id, variant, nil, Cctx), count) + standardizedVariant.Id = fmt.Sprintf("%s_%v", ResolveValue(config.Id, variant, nil, Cctx, config), count) // Add info fields for name, infoConfig := range config.Info { @@ -114,7 +114,7 @@ func (variant *Variant) standardize(config *Config, Cctx *cli.Context, count int if value == "" { continue } - standardizedVariant.Info[name] = []string{ResolveValue(value, variant, nil, Cctx)} + standardizedVariant.Info[name] = []string{ResolveValue(value, variant, nil, Cctx, config)} } // Add format fields @@ -127,7 +127,7 @@ func (variant *Variant) standardize(config *Config, Cctx *cli.Context, count int if val, ok := formatConfig.Alts[sVType]; ok { value = val } - newFormat.Content[name] = []string{ResolveValue(value, variant, &format, Cctx)} + newFormat.Content[name] = []string{ResolveValue(value, variant, &format, Cctx, config)} } standardizedVariant.Format[sample] = *newFormat } diff --git a/svync_api/structs.go b/svync_api/structs.go index 4e196f7..eac8648 100644 --- a/svync_api/structs.go +++ b/svync_api/structs.go @@ -62,6 +62,7 @@ type Config struct { type MapConfigInput map[string]ConfigInput type ConfigInput struct { Value string + Defaults map[string]string 
Description string Number string Type string diff --git a/svync_api/variant.go b/svync_api/variant.go index 7eaaba5..f8c8836 100644 --- a/svync_api/variant.go +++ b/svync_api/variant.go @@ -45,10 +45,10 @@ func toBreakPoint(mate1 *Variant, mate2 *Variant) *Variant { filter = mate1.Filter } - varQual, err := strconv.ParseFloat(mate1.Qual, 64) - mateQual, err := strconv.ParseFloat(mate2.Qual, 64) + varQual, err1 := strconv.ParseFloat(mate1.Qual, 64) + mateQual, err2 := strconv.ParseFloat(mate2.Qual, 64) qual := "." - if err == nil { + if err1 == nil && err2 == nil { qual = fmt.Sprintf("%f", (varQual+mateQual)/2) } From c686d7a0aefa9fc3569e04831c917c604c2a5ff1 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Wed, 19 Mar 2025 17:19:02 +0100 Subject: [PATCH 3/7] bump docs --- docs/configuration.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/configuration.md b/docs/configuration.md index 7d90005..69ee922 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -41,6 +41,9 @@ info: ### value The `value` field can be used to change the default value of the info field. The value can be resolved (see [Resolvable fields](#resolvable-fields)). +### defaults +The `defaults` field can be used to define defaults for resolvable `INFO` and `FORMAT` fields. These defaults will be used when the required field is missing from the variant. + ### type The `type` field can be used to set the type of the info field (This will be reflected in the header of the output VCF file). @@ -58,6 +61,8 @@ For example when all `SVLEN` info fields are positive, you maybe want to change info: SVLEN: value: $INFO/SVLEN + defaults: + $INFO/SVLEN: "-1" type: Integer description: "Structural variant length" number: 1 @@ -71,6 +76,9 @@ The `format` section can be used to change the format fields for each variant. 
T format: : value: + defaults: + : + : type: description: number: From 4bb958bff49e4191d683844c25309614d55dff96 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Wed, 19 Mar 2025 17:21:11 +0100 Subject: [PATCH 4/7] changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6062409..1086620 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +# 0.2.0dev +- Added the `defaults` type to the `info` and `format` configuration. Defaults for resolvable fields can be set this way. + # 0.1.2 - Adjust ## Fixes - Fixed a bug where the output VCF has no header when the input VCF has no variants From 6843b72c4b8b02c16e856250aaffd7af77050b5a Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Thu, 20 Mar 2025 15:06:01 +0100 Subject: [PATCH 5/7] fix copy paste in docs --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 21b8a8d..92c0b7a 100644 --- a/README.md +++ b/README.md @@ -26,16 +26,16 @@ The configuration file is the core of the standardization in Svync. More informa ## Installation ### Mamba/Conda -This is the preffered way of installing BedGoVcf. +This is the preferred way of installing Svync. ```bash -mamba install -c bioconda bedgovcf +mamba install -c bioconda svync ``` or with conda: ```bash -conda install -c bioconda bedgovcf +conda install -c bioconda svync ``` ### Precompiled binaries @@ -45,17 +45,17 @@ Precompiled binaries are available for Linux and macOS on the [releases page](ht ### Installation from source Make sure you have go installed on your machine (or [install](https://go.dev/doc/install) it if you don't currently have it) -Then run these commands to install bedgovcf: +Then run these commands to install svync: ```bash go get . go build . 
-sudo mv bedgovcf /usr/local/bin/ +sudo mv svync /usr/local/bin/ ``` Next run this command to check if it was correctly installed: ```bash -bedgovcf --help +svync --help ``` From d1a446c06904168f37926b32ca5d98e8900183d2 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Thu, 20 Mar 2025 15:38:37 +0100 Subject: [PATCH 6/7] comment structs --- svync_api/structs.go | 158 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 126 insertions(+), 32 deletions(-) diff --git a/svync_api/structs.go b/svync_api/structs.go index eac8648..01f4718 100644 --- a/svync_api/structs.go +++ b/svync_api/structs.go @@ -1,50 +1,118 @@ package svync_api -// VCF structs - +// The struct representing the header of the input VCF file in a parseable format type Header struct { - Info map[string]HeaderLineIdNumberTypeDescription - Format map[string]HeaderLineIdNumberTypeDescription - Alt map[string]HeaderLineIdDescription - Filter map[string]HeaderLineIdDescription - Contig []HeaderLineIdLength - Other []string + // Object containing the INFO fields with their ID, Number, Type and Description + // The ID is the key of the map + // The value is a struct containing the Id, Number, Type and Description + Info map[string]HeaderLineIdNumberTypeDescription + + // Object containing the FORMAT fields with their ID, Number, Type and Description + // The ID is the key of the map + // The value is a struct containing the Id, Number, Type and Description + Format map[string]HeaderLineIdNumberTypeDescription + + // Object containing the ALT fields with their ID and Description + // The ID is the key of the map + // The value is a struct containing the Id and Description + Alt map[string]HeaderLineIdDescription + + // Object containing the FILTER fields with their ID and Description + // The ID is the key of the map + // The value is a struct containing the Id and Description + Filter map[string]HeaderLineIdDescription + + // List of all contigs in the VCF file with their ID and Length + 
Contig []HeaderLineIdLength + + // List of all other VCF fields + Other []string + + // List of all samples in the VCF file Samples []string } +// A struct representing a header line in the VCF file with its ID and Description type HeaderLineIdDescription struct { - Id string + // The ID of the header line + Id string + + // The description of the header line Description string } +// A struct representing a header line in the VCF file with its ID, Number, Type and Description type HeaderLineIdNumberTypeDescription struct { - Id string - Number string - Type string + // The ID of the header line + Id string + + // The number of values in the header line + // Can be any integer, "A", "G", "R" or "." + // A = one value per alternate allele + // G = one value per possible genotype + // R = one value per possible allele + // . = the number varies, is unknown or is unbounded + Number string + + // The type of the header line + // Can be "Integer", "Float", "Flag", "String" or "Character" + Type string + + // The description of the header line Description string } +// A struct representing a header line in the VCF file with its ID and Length type HeaderLineIdLength struct { - Id string + // The ID of the header line + Id string + + // The length of the contig Length int64 } +// A struct representing a variant in the input VCF file type Variant struct { + // The chromosome of the variant Chromosome string - Pos int64 - Id string - Ref string - Alt string - Qual string - Filter string - Header *Header - Info map[string][]string - Format map[string]VariantFormat - Parsed bool + + // The 1-based position of the variant + Pos int64 + + // The ID of the variant + Id string + + // The reference allele of the variant + Ref string + + // The alternate allele of the variant + Alt string + + // The Phred-scaled quality score of the variant + Qual string + + // The filter status of the variant + Filter string + + // A pointer to the header of the VCF that contains this variant + 
Header *Header + + // The INFO values of the variant + Info map[string][]string + + // The FORMAT values of the variant + Format map[string]VariantFormat + + // A status flag indicating if the variant has been parsed before + Parsed bool } +// A struct representing the format of a variant in the VCF file type VariantFormat struct { - Sample string + // The sample name of the variant + Sample string + + // The content of the format field Content map[string][]string } @@ -52,19 +120,45 @@ type VariantFormat struct { // Config structs // +// The struct representing the configuration file +// The config file is a YAML file type Config struct { - Id string - Alt map[string]string - Info MapConfigInput + // How to handle the ID field of each variant + Id string + + // How to handle the ALT field of each variant + // A value can be given for each SVTYPE + Alt map[string]string + + // How to handle the INFO fields of each variant + Info MapConfigInput + + // How to handle the FORMAT fields of each variant Format MapConfigInput } +// A map construct for advanced configurations type MapConfigInput map[string]ConfigInput + +// A struct representing the configuration of advanced fields (like INFO and FORMAT) type ConfigInput struct { - Value string - Defaults map[string]string + // The value of the field + // This can be a string or a reference to another field + Value string + + // The default values of the field of all resolvable values in the Value field + Defaults map[string]string + + // The description of the field + // This is used to generate the VCF header Description string - Number string - Type string - Alts map[string]string + + // The number of values in the field + Number string + + // The type of the field + Type string + + // Alternative values for each SVTYPE + Alts map[string]string } From b226e277c2def6232e7365991547cf13808e41d3 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Mon, 7 Apr 2025 10:21:03 +0200 Subject: [PATCH 7/7] structure the 
changelog better --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1086620..1f8b93c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,20 @@ # 0.2.0dev + +## New features + - Added the `defaults` type to the `info` and `format` configuration. Defaults for resolvable fields can be set this way. # 0.1.2 - Adjust + ## Fixes + - Fixed a bug where the output VCF has no header when the input VCF has no variants # 0.1.1 - Transform + ## Fixes + - Fixed a bug where the samples were missing from the header # 0.1.0 - Change