From fa2dab801be35556a4aae9a19f9300af4989cfbb Mon Sep 17 00:00:00 2001 From: Jarrett Revels Date: Sun, 28 Mar 2021 14:58:08 -0400 Subject: [PATCH] break up some minor utility functions (#71) --- Project.toml | 2 +- docs/src/index.md | 1 + src/annotations.jl | 19 ++++++++++-------- src/signals.jl | 19 ++++++++++-------- src/utilities.jl | 49 ++++++++++++++++++++++++++++++++++++--------- test/annotations.jl | 3 ++- test/samples.jl | 5 +++-- test/signals.jl | 3 ++- 8 files changed, 71 insertions(+), 30 deletions(-) diff --git a/Project.toml b/Project.toml index f78c98cd..3c544b19 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Onda" uuid = "e853f5be-6863-11e9-128d-476edb89bfb5" authors = ["Beacon Biosignals, Inc."] -version = "0.12.3" +version = "0.13.0" [deps] Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" diff --git a/docs/src/index.md b/docs/src/index.md index 8eebcda9..57962e3e 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -98,6 +98,7 @@ Onda.file_format_string ## Utilities ```@docs +Onda.materialize Onda.gather Onda.validate_on_construction Onda.upgrade_onda_dataset_to_v0_5! diff --git a/src/annotations.jl b/src/annotations.jl index 62d580a4..02d90b9f 100644 --- a/src/annotations.jl +++ b/src/annotations.jl @@ -66,22 +66,25 @@ Tables.columnnames(x::Annotation) = Tables.columnnames(getfield(x, :_row)) ##### """ - read_annotations(io_or_path; materialize::Bool=false, validate_schema::Bool=true) + read_annotations(io_or_path; validate_schema::Bool=true) Return the `*.onda.annotations.arrow`-compliant table read from `io_or_path`. If `validate_schema` is `true`, the table's schema will be validated to ensure it is a `*.onda.annotations.arrow`-compliant table. An `ArgumentError` will be thrown if any schema violation is detected. - -If `materialize` is `false`, the returned table will be an `Arrow.Table` while if -`materialize` is `true`, the returned table will be a `NamedTuple` of columns. The -primary difference is that the former has a conversion-on-access behavior, while -for the latter, any potential conversion cost has been paid up front. """ -function read_annotations(io_or_path; materialize::Bool=false, validate_schema::Bool=true) - table = read_onda_table(io_or_path; materialize) +function read_annotations(io_or_path; materialize::Union{Missing,Bool}=missing, validate_schema::Bool=true) + table = read_onda_table(io_or_path) validate_schema && validate_annotation_schema(Tables.schema(table)) + if materialize isa Bool + if materialize + @warn "`read_annotations(x; materialize=true)` is deprecated; use `Onda.materialize(read_annotations(x))` instead" + return Onda.materialize(table) + else + @warn "`read_annotations(x; materialize=false)` is deprecated; use `read_annotations(x)` instead" + end + end return table end diff --git a/src/signals.jl b/src/signals.jl index 6379c755..28ad993e 100644 --- a/src/signals.jl +++ b/src/signals.jl @@ -133,22 +133,25 @@ Tables.columnnames(x::Signal) = Tables.columnnames(getfield(x, :_row)) ##### """ - read_signals(io_or_path; materialize::Bool=false, validate_schema::Bool=false) + read_signals(io_or_path; validate_schema::Bool=false) Return the `*.onda.signals.arrow`-compliant table read from `io_or_path`. If `validate_schema` is `true`, the table's schema will be validated to ensure it is a `*.onda.signals.arrow`-compliant table. An `ArgumentError` will be thrown if any schema violation is detected. - -If `materialize` is `false`, the returned table will be an `Arrow.Table` while if -`materialize` is `true`, the returned table will be a `NamedTuple` of columns. The -primary difference is that the former has a conversion-on-access behavior, while -for the latter, any potential conversion cost has been paid up front. """ -function read_signals(io_or_path; materialize::Bool=false, validate_schema::Bool=false) - table = read_onda_table(io_or_path; materialize) +function read_signals(io_or_path; materialize::Union{Missing,Bool}=missing, validate_schema::Bool=false) + table = read_onda_table(io_or_path) validate_schema && validate_signal_schema(Tables.schema(table)) + if materialize isa Bool + if materialize + @warn "`read_signals(x; materialize=true)` is deprecated; use `Onda.materialize(read_signals(x))` instead" + return Onda.materialize(table) + else + @warn "`read_signals(x; materialize=false)` is deprecated; use `read_signals(x)` instead" + end + end return table end diff --git a/src/utilities.jl b/src/utilities.jl index cbc11a32..0978288f 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -23,12 +23,14 @@ const MINIMUM_ONDA_FORMAT_VERSION = v"0.5" const MAXIMUM_ONDA_FORMAT_VERSION = v"0.5" -function is_supported_onda_format_version(v::VersionNumber) - min_major, min_minor = MINIMUM_ONDA_FORMAT_VERSION.major, MINIMUM_ONDA_FORMAT_VERSION.minor - max_major, max_minor = MAXIMUM_ONDA_FORMAT_VERSION.major, MAXIMUM_ONDA_FORMAT_VERSION.minor - return (min_major <= v.major <= max_major) && (min_minor <= v.minor <= max_minor) +function is_supported_version(v::VersionNumber, lo::VersionNumber, hi::VersionNumber) + lo_major, lo_minor = lo.major, lo.minor + hi_major, hi_minor = hi.major, hi.minor + return (lo_major <= v.major <= hi_major) && (lo_minor <= v.minor <= hi_minor) end +is_supported_onda_format_version(v::VersionNumber) = is_supported_version(v, MINIMUM_ONDA_FORMAT_VERSION, MAXIMUM_ONDA_FORMAT_VERSION) + const ALPHANUMERIC_SNAKE_CASE_CHARACTERS = Char['_', '0':'9'..., 'a':'z'...] @@ -100,11 +102,14 @@ write_full_path(path, bytes) = write(path, bytes) ##### tables ##### -function table_has_supported_onda_format_version(table) +function table_has_metadata(predicate, table) m = Arrow.getmetadata(table) - return m isa Dict && is_supported_onda_format_version(VersionNumber(get(m, "onda_format_version", v"0.0.0"))) + return m isa Dict && predicate(m) end +table_has_required_onda_metadata(table) = table_has_metadata(m -> is_supported_onda_format_version(VersionNumber(get(m, "onda_format_version", v"0.0.0"))), + table) + # It would be better if Arrow.jl supported a generic API for nonstandard path-like types so that # we can avoid potential intermediate copies here, but its documentation is explicit that it only # supports `Union{IO,String}`. @@ -116,10 +121,12 @@ write_arrow_table(path::String, table; kwargs...) = Arrow.write(path, table; kwa write_arrow_table(io::IO, table; kwargs...) = Arrow.write(io, table; file=true, kwargs...) write_arrow_table(path, table; kwargs...) = (io = IOBuffer(); write_arrow_table(io, table; kwargs...); write_full_path(path, take!(io))) -function read_onda_table(path; materialize::Bool=false) +function read_onda_table(path) table = read_arrow_table(path) - table_has_supported_onda_format_version(table) || error("supported `onda_format_version` not found in annotations file") - return materialize ? map(collect, Tables.columntable(table)) : table + if !table_has_required_onda_metadata(table) + throw(ArgumentError("required Onda metadata not found in Arrow file; use `Onda.read_arrow_table` to read the file without this validation check")) + end + return table end function write_onda_table(path, table; kwargs...) @@ -169,3 +176,27 @@ function gather(column_name, tables::Vararg{Any,N}; iters = ntuple(i -> _iterator_for_column(tables[i], column_name), N) return Dict(id => ntuple(i -> extract(tables[i], locs[i]), N) for (id, locs) in locations(iters)) end + +""" + materialize(table) + +Return a fully deserialized copy of `table`. + +This function is useful when `table` has built-in deserialize-on-access or +conversion-on-access behavior (like `Arrow.Table`) and you'd like to pay +such access costs upfront before repeatedly accessing the table. For example: + +``` +julia> annotations = read_annotations(path_to_annotations_file); + +# iterate through all elements of `annotations.span` +julia> @time foreach(identity, (span for span in annotations.span)); +0.000126 seconds (306 allocations: 6.688 KiB) + +julia> materialized = Onda.materialize(annotations); + +julia> @time foreach(identity, (span for span in materialized.span)); + 0.000014 seconds (2 allocations: 80 bytes) +``` +""" +materialize(table) = map(collect, Tables.columntable(table)) \ No newline at end of file diff --git a/test/annotations.jl b/test/annotations.jl index d09fb50a..f6cf831b 100644 --- a/test/annotations.jl +++ b/test/annotations.jl @@ -43,7 +43,8 @@ end seekstart(io) for roundtripped in (read_annotations(annotations_file_path; materialize=false, validate_schema=false), read_annotations(annotations_file_path; materialize=true, validate_schema=true), - read_annotations(io; validate_schema=true)) + Onda.materialize(read_annotations(io)), + read_annotations(seekstart(io); validate_schema=true)) roundtripped = collect(Tables.rows(roundtripped)) @test length(roundtripped) == length(annotations) for (r, a) in zip(roundtripped, annotations) diff --git a/test/samples.jl b/test/samples.jl index f86f1de3..8845c0a7 100644 --- a/test/samples.jl +++ b/test/samples.jl @@ -119,7 +119,8 @@ end @testset "`Samples` pretty printing" begin info = SamplesInfo("eeg", ["a", "b", "c-d"], "unit", 0.25, -0.5, Int16, 50.2) samples = Samples(rand(Random.MersenneTwister(0), info.sample_type, 3, 5), info, true) - @test sprint(show, samples, context=(:compact => true)) == "Samples(3×5 Array{Int16,2})" + M = VERSION >= v"1.6" ? "Matrix{Int16}" : "Array{Int16,2}" + @test sprint(show, samples, context=(:compact => true)) == "Samples(3×5 $M)" @test sprint(show, samples) == """ Samples (00:00:00.099601594): info.kind: "eeg" @@ -131,7 +132,7 @@ end info.sample_rate: 50.2 Hz encoded: true data: - 3×5 Array{Int16,2}: + 3×5 $M: 20032 4760 27427 -20758 24287 14240 5037 5598 -5888 21784 16885 600 20880 -32493 -19305""" diff --git a/test/signals.jl b/test/signals.jl index 30e45d6e..a2dd8156 100644 --- a/test/signals.jl +++ b/test/signals.jl @@ -108,7 +108,8 @@ end seekstart(io) for roundtripped in (read_signals(signals_file_path; materialize=false, validate_schema=false), read_signals(signals_file_path; materialize=true, validate_schema=true), - read_signals(io; validate_schema=true)) + Onda.materialize(read_signals(io)), + read_signals(seekstart(io); validate_schema=true)) roundtripped = collect(Tables.rows(roundtripped)) @test length(roundtripped) == length(signals) for (r, s) in zip(roundtripped, signals)