break up some minor utility functions (#71)

beacon-biosignals · Mar 28, 2021 · fa2dab8 · fa2dab8 · jrevels · Mar 28, 2021
1 parent 3d0a8bd
commit fa2dab8
Show file tree

Hide file tree

Showing 8 changed files with 71 additions and 30 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "Onda"
 uuid = "e853f5be-6863-11e9-128d-476edb89bfb5"
 authors = ["Beacon Biosignals, Inc."]
-version = "0.12.3"
+version = "0.13.0"
 
 [deps]
 Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -98,6 +98,7 @@ Onda.file_format_string
 ## Utilities
 
 ```@docs
+Onda.materialize
 Onda.gather
 Onda.validate_on_construction
 Onda.upgrade_onda_dataset_to_v0_5!

diff --git a/src/annotations.jl b/src/annotations.jl
@@ -66,22 +66,25 @@ Tables.columnnames(x::Annotation) = Tables.columnnames(getfield(x, :_row))
 #####
 
 """
-    read_annotations(io_or_path; materialize::Bool=false, validate_schema::Bool=true)
+    read_annotations(io_or_path; validate_schema::Bool=true)
 
 Return the `*.onda.annotations.arrow`-compliant table read from `io_or_path`.
 
 If `validate_schema` is `true`, the table's schema will be validated to ensure it is
 a `*.onda.annotations.arrow`-compliant table. An `ArgumentError` will be thrown if
 any schema violation is detected.
-
-If `materialize` is `false`, the returned table will be an `Arrow.Table` while if
-`materialize` is `true`, the returned table will be a `NamedTuple` of columns. The
-primary difference is that the former has a conversion-on-access behavior, while
-for the latter, any potential conversion cost has been paid up front.
 """
-function read_annotations(io_or_path; materialize::Bool=false, validate_schema::Bool=true)
-    table = read_onda_table(io_or_path; materialize)
+function read_annotations(io_or_path; materialize::Union{Missing,Bool}=missing, validate_schema::Bool=true)
+    table = read_onda_table(io_or_path)
     validate_schema && validate_annotation_schema(Tables.schema(table))
+    if materialize isa Bool
+        if materialize
+            @warn "`read_annotations(x; materialize=true)` is deprecated; use `Onda.materialize(read_annotations(x))` instead"
+            return Onda.materialize(table)
+        else
+            @warn "`read_annotations(x; materialize=false)` is deprecated; use `read_annotations(x)` instead"
+        end
+    end
     return table
 end
 

diff --git a/src/signals.jl b/src/signals.jl
@@ -133,22 +133,25 @@ Tables.columnnames(x::Signal) = Tables.columnnames(getfield(x, :_row))
 #####
 
 """
-    read_signals(io_or_path; materialize::Bool=false, validate_schema::Bool=false)
+    read_signals(io_or_path; validate_schema::Bool=false)
 
 Return the `*.onda.signals.arrow`-compliant table read from `io_or_path`.
 
 If `validate_schema` is `true`, the table's schema will be validated to ensure it is
 a `*.onda.signals.arrow`-compliant table. An `ArgumentError` will be thrown if
 any schema violation is detected.
-
-If `materialize` is `false`, the returned table will be an `Arrow.Table` while if
-`materialize` is `true`, the returned table will be a `NamedTuple` of columns. The
-primary difference is that the former has a conversion-on-access behavior, while
-for the latter, any potential conversion cost has been paid up front.
 """
-function read_signals(io_or_path; materialize::Bool=false, validate_schema::Bool=false)
-    table = read_onda_table(io_or_path; materialize)
+function read_signals(io_or_path; materialize::Union{Missing,Bool}=missing, validate_schema::Bool=false)
+    table = read_onda_table(io_or_path)
     validate_schema && validate_signal_schema(Tables.schema(table))
+    if materialize isa Bool
+        if materialize
+            @warn "`read_signals(x; materialize=true)` is deprecated; use `Onda.materialize(read_signals(x))` instead"
+            return Onda.materialize(table)
+        else
+            @warn "`read_signals(x; materialize=false)` is deprecated; use `read_signals(x)` instead"
+        end
+    end
     return table
 end
 

diff --git a/src/utilities.jl b/src/utilities.jl
@@ -23,12 +23,14 @@ const MINIMUM_ONDA_FORMAT_VERSION = v"0.5"
 
 const MAXIMUM_ONDA_FORMAT_VERSION = v"0.5"
 
-function is_supported_onda_format_version(v::VersionNumber)
-    min_major, min_minor = MINIMUM_ONDA_FORMAT_VERSION.major, MINIMUM_ONDA_FORMAT_VERSION.minor
-    max_major, max_minor = MAXIMUM_ONDA_FORMAT_VERSION.major, MAXIMUM_ONDA_FORMAT_VERSION.minor
-    return (min_major <= v.major <= max_major) && (min_minor <= v.minor <= max_minor)
+function is_supported_version(v::VersionNumber, lo::VersionNumber, hi::VersionNumber)
+    lo_major, lo_minor = lo.major, lo.minor
+    hi_major, hi_minor = hi.major, hi.minor
+    return (lo_major <= v.major <= hi_major) && (lo_minor <= v.minor <= hi_minor)
 end
 
+is_supported_onda_format_version(v::VersionNumber) = is_supported_version(v, MINIMUM_ONDA_FORMAT_VERSION, MAXIMUM_ONDA_FORMAT_VERSION)
+
 const ALPHANUMERIC_SNAKE_CASE_CHARACTERS = Char['_',
                                                 '0':'9'...,
                                                 'a':'z'...]
@@ -100,11 +102,14 @@ write_full_path(path, bytes) = write(path, bytes)
 ##### tables
 #####
 
-function table_has_supported_onda_format_version(table)
+function table_has_metadata(predicate, table)
     m = Arrow.getmetadata(table)
-    return m isa Dict && is_supported_onda_format_version(VersionNumber(get(m, "onda_format_version", v"0.0.0")))
+    return m isa Dict && predicate(m)
 end
 
+table_has_required_onda_metadata(table) = table_has_metadata(m -> is_supported_onda_format_version(VersionNumber(get(m, "onda_format_version", v"0.0.0"))),
+                                                             table)
+
 # It would be better if Arrow.jl supported a generic API for nonstandard path-like types so that
 # we can avoid potential intermediate copies here, but its documentation is explicit that it only
 # supports `Union{IO,String}`.
@@ -116,10 +121,12 @@ write_arrow_table(path::String, table; kwargs...) = Arrow.write(path, table; kwa
 write_arrow_table(io::IO, table; kwargs...) = Arrow.write(io, table; file=true, kwargs...)
 write_arrow_table(path, table; kwargs...) = (io = IOBuffer(); write_arrow_table(io, table; kwargs...); write_full_path(path, take!(io)))
 
-function read_onda_table(path; materialize::Bool=false)
+function read_onda_table(path)
     table = read_arrow_table(path)
-    table_has_supported_onda_format_version(table) || error("supported `onda_format_version` not found in annotations file")
-    return materialize ? map(collect, Tables.columntable(table)) : table
+    if !table_has_required_onda_metadata(table)
+        throw(ArgumentError("required Onda metadata not found in Arrow file; use `Onda.read_arrow_table` to read the file without this validation check"))
+    end
+    return table
 end
 
 function write_onda_table(path, table; kwargs...)
@@ -169,3 +176,27 @@ function gather(column_name, tables::Vararg{Any,N};
     iters = ntuple(i -> _iterator_for_column(tables[i], column_name), N)
     return Dict(id => ntuple(i -> extract(tables[i], locs[i]), N) for (id, locs) in locations(iters))
 end
+
+"""
+    materialize(table)
+
+Return a fully deserialized copy of `table`.
+
+This function is useful when `table` has built-in deserialize-on-access or
+conversion-on-access behavior (like `Arrow.Table`) and you'd like to pay
+such access costs upfront before repeatedly accessing the table. For example:
+
+```julia-repl
+julia> annotations = read_annotations(path_to_annotations_file);
+
+# iterate through all elements of `annotations.span`
+julia> @time foreach(identity, (span for span in annotations.span));
+  0.000126 seconds (306 allocations: 6.688 KiB)
+
+julia> materialized = Onda.materialize(annotations);
+
+julia> @time foreach(identity, (span for span in materialized.span));
+  0.000014 seconds (2 allocations: 80 bytes)
+```
+"""
+materialize(table) = map(collect, Tables.columntable(table))
diff --git a/test/annotations.jl b/test/annotations.jl
@@ -43,7 +43,8 @@ end
     seekstart(io)
     for roundtripped in (read_annotations(annotations_file_path; materialize=false, validate_schema=false),
                          read_annotations(annotations_file_path; materialize=true, validate_schema=true),
-                         read_annotations(io; validate_schema=true))
+                         Onda.materialize(read_annotations(io)),
+                         read_annotations(seekstart(io); validate_schema=true))
         roundtripped = collect(Tables.rows(roundtripped))
         @test length(roundtripped) == length(annotations)
         for (r, a) in zip(roundtripped, annotations)

diff --git a/test/samples.jl b/test/samples.jl
@@ -119,7 +119,8 @@ end
 @testset "`Samples` pretty printing" begin
     info = SamplesInfo("eeg", ["a", "b", "c-d"], "unit", 0.25, -0.5, Int16, 50.2)
     samples = Samples(rand(Random.MersenneTwister(0), info.sample_type, 3, 5), info, true)
-    @test sprint(show, samples, context=(:compact => true)) == "Samples(3×5 Array{Int16,2})"
+    M = VERSION >= v"1.6" ? "Matrix{Int16}" : "Array{Int16,2}"
+    @test sprint(show, samples, context=(:compact => true)) == "Samples(3×5 $M)"
     @test sprint(show, samples) == """
                                    Samples (00:00:00.099601594):
                                      info.kind: "eeg"
@@ -131,7 +132,7 @@ end
                                      info.sample_rate: 50.2 Hz
                                      encoded: true
                                      data:
-                                   3×5 Array{Int16,2}:
+                                   3×5 $M:
                                     20032  4760  27427  -20758   24287
                                     14240  5037   5598   -5888   21784
                                     16885   600  20880  -32493  -19305"""

diff --git a/test/signals.jl b/test/signals.jl
@@ -108,7 +108,8 @@ end
     seekstart(io)
     for roundtripped in (read_signals(signals_file_path; materialize=false, validate_schema=false),
                          read_signals(signals_file_path; materialize=true, validate_schema=true),
-                         read_signals(io; validate_schema=true))
+                         Onda.materialize(read_signals(io)),
+                         read_signals(seekstart(io); validate_schema=true))
         roundtripped = collect(Tables.rows(roundtripped))
         @test length(roundtripped) == length(signals)
         for (r, s) in zip(roundtripped, signals)