Skip to content

Commit 4cb50f9

Browse files
committed
variant_count in seqVCF2GDS()
1 parent fa70842 commit 4cb50f9

File tree

3 files changed

+35
-29
lines changed

3 files changed

+35
-29
lines changed

NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@ UTILITIES
88
`seqVCF2GDS()` is faster when obtaining the number of variants for
99
splitting files.
1010

11+
o new argument 'variant_count' in `seqVCF2GDS()` to specify the number of variants
12+
in the VCF file when it is known; it is only applicable when multiple
13+
cores are used.
14+
1115

1216
CHANGES IN VERSION 1.46.0
1317
-------------------------

R/ConvVCF2GDS.R

Lines changed: 23 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -545,7 +545,8 @@ seqVCF2GDS <- function(vcf.fn, out.fn, header=NULL,
545545
storage.option="LZMA_RA", info.import=NULL, fmt.import=NULL,
546546
genotype.var.name="GT", ignore.chr.prefix="chr",
547547
scenario=c("general", "imputation"), reference=NULL, start=1L, count=-1L,
548-
optimize=TRUE, raise.error=TRUE, digest=TRUE, parallel=FALSE, verbose=TRUE)
548+
variant_count=NA_integer_, optimize=TRUE, raise.error=TRUE, digest=TRUE,
549+
parallel=FALSE, verbose=TRUE)
549550
{
550551
# check
551552
if (!inherits(vcf.fn, "connection"))
@@ -592,20 +593,15 @@ seqVCF2GDS <- function(vcf.fn, out.fn, header=NULL,
592593
{
593594
if (pnum > 1L)
594595
stop("No parallel support when the input is a connection object.")
595-
}
596-
597-
if (is.character(vcf.fn))
598-
variant_count <- attr(vcf.fn, "variant_count")
599-
else
600-
variant_count <- NULL
601-
if (!is.null(variant_count))
596+
if (length(variant_count)!=1L || !is.na(variant_count))
597+
warning("'variant_count' is not used in seqVCF2GDS() when 'vcf.fn' is a connection object.")
598+
} else if (!identical(variant_count, NA_integer_))
602599
{
603600
if (!is.numeric(variant_count))
604-
stop("the attribute 'variant_count' of 'vcf.fn' should be a numeric vector.")
601+
stop("'variant_count' should be a numeric vector.")
605602
if (length(variant_count) != length(vcf.fn))
606-
stop("the attribute 'variant_count' of 'vcf.fn' should be as the same length as 'vcf.fn'.")
603+
stop("'variant_count' and 'vcf.fn' should have the same length.")
607604
}
608-
609605
if (verbose) cat(date(), "\n", sep="")
610606

611607
genotype.storage <- "bit2"
@@ -785,12 +781,18 @@ seqVCF2GDS <- function(vcf.fn, out.fn, header=NULL,
785781
}
786782

787783
# get the number of variants in each VCF file
788-
num_array <- vapply(vcf.fn, function(fn)
784+
for (i in seq_along(vcf.fn))
789785
{
790-
v <- seqVCF_Header(fn, getnum=TRUE, parallel=parallel, verbose=FALSE)
791-
v$num.variant
792-
}, 0L)
793-
num_var <- sum(num_array)
786+
v <- variant_count[i]
787+
if (is.na(v) || (v < 0L))
788+
{
789+
fn <- vcf.fn[i]
790+
variant_count[i] <- seqVCF_Header(fn, getnum=TRUE,
791+
parallel=parallel, verbose=FALSE)$num.variant
792+
}
793+
}
794+
num_var <- sum(variant_count)
795+
if (anyNA(num_var)) stop("Getting invalid # of variants.")
794796

795797
if (start < 1L)
796798
stop("'start' should be a positive integer if conversion in parallel.")
@@ -822,31 +824,25 @@ seqVCF2GDS <- function(vcf.fn, out.fn, header=NULL,
822824
seqParallel(parallel, NULL, FUN = function(
823825
vcf.fn, header, storage.option, info.import, fmt.import,
824826
genotype.var.name, ignore.chr.prefix, scenario, optim,
825-
raise.err, ptmpfn, psplit, num_array)
827+
raise.err, ptmpfn, psplit, variant_count)
826828
{
827-
# load package
828-
library(SeqArray, quietly=TRUE, verbose=FALSE)
829-
830-
attr(vcf.fn, "variant_count") <- num_array
831829
i <- process_index # the process id, starting from one
832-
833-
seqVCF2GDS(vcf.fn, ptmpfn[i], header=oldheader,
830+
SeqArray::seqVCF2GDS(vcf.fn, ptmpfn[i], header=oldheader,
834831
storage.option=storage.option, info.import=info.import,
835832
fmt.import=fmt.import, genotype.var.name=genotype.var.name,
836833
ignore.chr.prefix=ignore.chr.prefix,
837834
start = psplit[[1L]][i], count = psplit[[2L]][i],
835+
variant_count=variant_count,
838836
optimize=optim, scenario=scenario, raise.error=raise.err,
839837
digest=FALSE, parallel=FALSE, verbose=FALSE)
840-
841-
invisible()
842-
838+
invisible() # return
843839
}, split="none",
844840
vcf.fn=vcf.fn, header=header, storage.option=storage.option,
845841
info.import=info.import, fmt.import=fmt.import,
846842
genotype.var.name=genotype.var.name,
847843
ignore.chr.prefix=ignore.chr.prefix, scenario=scenario,
848844
optim=optimize, raise.err=raise.error,
849-
ptmpfn=ptmpfn, psplit=psplit, num_array=num_array)
845+
ptmpfn=ptmpfn, psplit=psplit, variant_count=variant_count)
850846

851847
if (verbose)
852848
cat(" >>> Done (", date(), ") <<<\n", sep="")

man/seqVCF2GDS.Rd

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@
99
seqVCF2GDS(vcf.fn, out.fn, header=NULL, storage.option="LZMA_RA",
1010
info.import=NULL, fmt.import=NULL, genotype.var.name="GT",
1111
ignore.chr.prefix="chr", scenario=c("general", "imputation"),
12-
reference=NULL, start=1L, count=-1L, optimize=TRUE, raise.error=TRUE,
13-
digest=TRUE, parallel=FALSE, verbose=TRUE)
12+
reference=NULL, start=1L, count=-1L, variant_count=NA_integer_,
13+
optimize=TRUE, raise.error=TRUE, digest=TRUE, parallel=FALSE,
14+
verbose=TRUE)
1415
seqBCF2GDS(bcf.fn, out.fn, header=NULL, storage.option="LZMA_RA",
1516
info.import=NULL, fmt.import=NULL, genotype.var.name="GT",
1617
ignore.chr.prefix="chr", scenario=c("general", "imputation"),
@@ -50,6 +51,11 @@ seqBCF2GDS(bcf.fn, out.fn, header=NULL, storage.option="LZMA_RA",
5051
\item{start}{the starting variant if importing part of VCF files}
5152
\item{count}{the maximum number of variants if importing part of VCF files,
5253
-1 indicates importing to the end}
54+
\item{variant_count}{\code{NA_integer_} (default) or a numeric vector
55+
specifying the numbers of variants in the VCF file(s) in \code{vcf.fn};
56+
only applicable when multiple cores are used; if the number of variants
57+
is known, the conversion can skip counting the variants before
58+
splitting the file(s); \code{variant_count} may be an approximate value}
5359
\item{optimize}{if \code{TRUE}, optimize the access efficiency by calling
5460
\code{\link{cleanup.gds}}}
5561
\item{raise.error}{\code{TRUE}: throw an error if numeric conversion fails;

0 commit comments

Comments
 (0)