update Onda for OndaFormat v0.3.0 (#28)

Co-Authored-By: Alex Arslan <ararslan@comcast.net>
beacon-biosignals · Mar 4, 2020 · 53d7655 · 53d7655 · jrevels · Mar 4, 2020
1 parent 9e61956
commit 53d7655
Show file tree

Hide file tree

Showing 15 changed files with 401 additions and 298 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "Onda"
 uuid = "e853f5be-6863-11e9-128d-476edb89bfb5"
 authors = ["Beacon Biosignals, Inc."]
-version = "0.7.6"
+version = "0.8.0"
 
 [deps]
 CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -14,7 +14,6 @@ Note that Onda.jl's API follows a specific philosophy with respect to property a
 Dataset
 samples_path
 create_recording!
-set_duration!
 load
 store!
 delete!
@@ -26,8 +25,10 @@ save_recordings_file
 ```@docs
 Signal
 signal_from_template
+span
 Annotation
 Recording
+set_span!
 annotate!
 ```
 
@@ -68,3 +69,9 @@ serialize_lpcm
 LPCM
 LPCMZst
 ```
+
+## Upgrading Older Datasets to Newer Datasets
+
+```@docs
+Onda.upgrade_onda_format_from_v0_2_to_v0_3!
+```
diff --git a/examples/flac.jl b/examples/flac.jl
@@ -2,7 +2,7 @@
 # that it's a naive implementation - it just shells out and assumes you have
 # the `flac` command line utility installed and available on your system.
 
-using Onda, Test, Random
+using Onda, Test, Random, Dates
 
 #####
 ##### FLAC
@@ -69,7 +69,7 @@ end
 
 if VERSION >= v"1.1.0"
     @testset "FLAC example" begin
-        signal = Signal([:a, :b, :c], :unit, 0.25, Int16, 50, :flac, Dict(:level => 2))
+        signal = Signal([:a, :b, :c], Nanosecond(0), Nanosecond(0), :unit, 0.25, 0.0, Int16, 50.0, :flac, Dict(:level => 2))
         samples = encode(Samples(signal, false, rand(MersenneTwister(1), 3, 50 * 10))).data
         s = serializer(signal)
         bytes = serialize_lpcm(samples, s)

diff --git a/examples/tour.jl b/examples/tour.jl
@@ -22,21 +22,27 @@ eeg_signal = Signal(channel_names=[:fp1, :f3, :c3, :p3,
                                    :fz, :cz, :pz,
                                    :fp2, :f4, :c4, :p4,
                                    :f8, :t4, :t6, :o2],
+                    start_nanosecond=Nanosecond(0),
+                    stop_nanosecond=Nanosecond(Second(20)),
                     sample_unit=:microvolts,
                     sample_resolution_in_unit=0.25,
+                    sample_offset_in_unit=0.0,
                     sample_type=Int16,
-                    sample_rate=256, # Hz
+                    sample_rate=256.0, # Hz
                     file_extension=:lpcm,
                     file_options=nothing)
 
 ecg_signal = signal_from_template(eeg_signal; channel_names=[:avl, :avr],
                                   file_extension=Symbol("lpcm.zst"))
 
 spo2_signal = Signal(channel_names=[:spo2],
+                     start_nanosecond=Nanosecond(Second(3)),
+                     stop_nanosecond=Nanosecond(Second(17)),
                      sample_unit=:percentage,
                      sample_resolution_in_unit=(100 / typemax(UInt8)),
+                     sample_offset_in_unit=0.0,
                      sample_type=UInt8,
-                     sample_rate=20, # Hz
+                     sample_rate=20.5, # Hz
                      file_extension=:lpcm,
                      file_options=nothing)
 
@@ -52,35 +58,45 @@ spo2_signal = Signal(channel_names=[:spo2],
 # an aside: The hypothetical person from which these hypothetical signals were
 # hypothetically recorded must be experiencing some pretty crazy pathologies if
 # their EEG/ECG are just saw waves...
-saws(signal, seconds) = [(j + i) % 100 * signal.sample_resolution_in_unit for
-                         i in 1:channel_count(signal),
-                         j in 1:(signal.sample_rate * seconds)]
+saws(signal) = [(j + i) % 100 * signal.sample_resolution_in_unit for
+                i in 1:channel_count(signal), j in 1:sample_count(signal)]
 
 # The second argument in the `Samples` constructor is a `Bool` that specifies if
 # the data is in its encoded representation. Here, we construct our signals as
 # "decoded" (i.e. in actual units, though for this toy example it doesn't really
 # matter) and then "encode" them according to the specified:
-eeg = encode(Samples(eeg_signal, false, saws(eeg_signal, 20)))
-ecg = encode(Samples(ecg_signal, false, saws(ecg_signal, 20)))
-spo2 = encode(Samples(spo2_signal, false, saws(spo2_signal, 20)))
+eeg = encode(Samples(eeg_signal, false, saws(eeg_signal)))
+ecg = encode(Samples(ecg_signal, false, saws(ecg_signal)))
+spo2 = encode(Samples(spo2_signal, false, saws(spo2_signal)))
 
 # Here are some basic functions for examining `Samples` instances:
-@test sample_count(eeg) == eeg.signal.sample_rate * 20
+@test sample_count(eeg) == sample_count(eeg_signal) == 20 * eeg_signal.sample_rate
 @test channel_count(eeg) == channel_count(eeg_signal) == 19
 @test channel(eeg, :f3) == channel(eeg_signal, :f3) == 2
 @test channel(eeg, 2) == channel(eeg_signal, 2) == :f3
-@test duration(eeg) == duration(ecg) == duration(spo2) == Second(20)
+@test duration(eeg) == duration(span(eeg_signal)) == Second(20)
 
 # Here are some basic indexing examples using `getindex` and `view` wherein
 # channel names and sample-rate-agnostic `TimeSpan`s are employed as indices:
-span = TimeSpan(Second(3), Second(9))
-span_range = index_from_time(eeg.signal.sample_rate, span)
-@test eeg[:, span].data == view(eeg, :, span_range).data
+slice_span = TimeSpan(Second(3), Second(9))
+span_range = index_from_time(eeg.signal.sample_rate, slice_span)
+@test eeg[:, slice_span].data == view(eeg, :, span_range).data
 @test eeg[:f3, :].data == view(eeg, 2, :).data
 @test eeg[:f3, 1:10].data == view(eeg, 2, 1:10).data
-@test eeg[:f3, span].data == view(eeg, 2, span_range).data
+@test eeg[:f3, slice_span].data == view(eeg, 2, span_range).data
 @test eeg[[:f3, :c3, :p3], 1:10].data == view(eeg, 2:4, 1:10).data
-@test eeg[[:c3, 4, :f3], span].data == view(eeg, [3, 4, 2], span_range).data
+@test eeg[[:c3, 4, :f3], slice_span].data == view(eeg, [3, 4, 2], span_range).data
+
+# NOTE: Keep in mind that `duration(samples.signal)` is not generally equivalent
+# to `duration(samples)`; the former is the duration of the original signal in
+# the context of its parent recording, whereas the latter is the actual duration
+# of `samples.data` given `signal.sample_rate`. This is similarly true for the
+# `sample_count` function for the same reason!
+eeg_slice = eeg[:, slice_span]
+@test duration(eeg_slice) == duration(slice_span)
+@test duration(eeg_slice) != duration(eeg_signal)
+@test sample_count(eeg_slice) == length(span_range)
+@test sample_count(eeg_slice) != sample_count(eeg_signal)
 
 # NOTE: `Samples` is not an `AbstractArray` subtype; this special indexing
 # behavior is only defined for convenient data manipulation. It is thus fine
@@ -106,12 +122,7 @@ dataset = Dataset(joinpath(root, "example.onda"); create=true)
 # `dataset.recordings` dictionary before returning the pair, such that the
 # `recording` variable we assign here references the same `Recording` instance
 # stored within `dataset`.
-uuid, recording = create_recording!(dataset, duration(eeg), Dict())
-
-# The last argument to `create_recording!` above provided a dictionary for the
-# `custom` field of the returned `Recording` instance. Let's populate it with
-# some custom metadata:
-recording.custom["some_custom_information"] = "hey, I think this recording might be fake"
+uuid, recording = create_recording!(dataset)
 
 # Store our signals/samples for the recording in our `dataset`. This both serializes
 # sample data to disk and adds the signal metadata to the recording stored in
@@ -125,7 +136,7 @@ store!(dataset, uuid, :spo2, spo2)
 # like in there. For example, Beacon Biosignals stores JSON snippets in
 # annotations. Here, let's just go the simple route and pretend we found an
 # epileptiform spike in our EEG/ECG/SpO2 recording:
-spike_annotation = Annotation("epileptiform", "spike", TimeSpan(Millisecond(1500), Second(2)))
+spike_annotation = Annotation("epileptiform_spike", TimeSpan(Millisecond(1500), Second(2)))
 annotate!(recording, spike_annotation)
 
 # You can add as many annotations as you'd like to a recording. Just keep in mind
@@ -138,7 +149,7 @@ annotate!(recording, spike_annotation)
 # short 2 second epochs across the entire recording:
 for (i, t) in enumerate(2:2:Second(duration(recording)).value)
     stage = rand(["awake", "nrem1", "nrem2", "nrem3", "rem"])
-    ann = Annotation("sleep_stage", "$stage", TimeSpan(Second(t - 2), Second(t)))
+    ann = Annotation(stage, TimeSpan(Second(t - 2), Second(t)))
     annotate!(recording, ann)
 end
 
@@ -159,7 +170,7 @@ dataset = Dataset(joinpath(root, "example.onda"))
 uuid, recording = first(dataset.recordings)
 
 # Grab the first spike annotation we see...
-spike_annotation = first(ann for ann in recording.annotations if ann.value == "spike")
+spike_annotation = first(ann for ann in recording.annotations if ann.value == "epileptiform_spike")
 
 # ...and load that segment of the EEG from disk as a `Samples` instance!
 spike_segment = load(dataset, uuid, :eeg, spike_annotation)
@@ -180,7 +191,7 @@ spike_segment = load(dataset, uuid, :eeg, spike_annotation)
 # Welp, looks like a spike to me! Let's leave an annotation to confirm we
 # checked it. Remember - `spike_annotation isa AbstractTimeSpan`, so we can
 # generally pass it wherever we'd pass a `TimeSpan` object:
-annotate!(recording, Annotation("confirmation", "spike", spike_annotation))
+annotate!(recording, Annotation("confirmed_spike_by_me", spike_annotation))
 
 # ...and, finally, of course, let's save our annotation!
 save_recordings_file(dataset)
diff --git a/src/Onda.jl b/src/Onda.jl
@@ -5,7 +5,7 @@ using MsgPack
 using TranscodingStreams
 using CodecZstd
 
-const ONDA_FORMAT_VERSION = v"0.2"
+const ONDA_FORMAT_VERSION = v"0.3"
 
 #####
 ##### utilities
@@ -56,7 +56,7 @@ export AbstractTimeSpan, TimeSpan, contains, overlaps, shortest_timespan_contain
        index_from_time, time_from_index, duration
 
 include("recordings.jl")
-export Recording, Signal, signal_from_template, Annotation, annotate!
+export Recording, Signal, signal_from_template, Annotation, annotate!, span
 
 include("serialization.jl")
 export AbstractLPCMSerializer, serializer, deserialize_lpcm, serialize_lpcm,
@@ -66,9 +66,70 @@ include("samples.jl")
 export Samples, encode, encode!, decode, decode!, channel, channel_count, sample_count
 
 include("dataset.jl")
-export Dataset, samples_path, create_recording!, set_duration!, load, store!, delete!,
+export Dataset, samples_path, create_recording!, set_span!, load, store!, delete!,
        save_recordings_file
 
 include("printing.jl")
 
+#####
+##### upgrades/deprecations
+#####
+
+@deprecate set_duration!(dataset, uuid, duration) begin
+    r = dataset.recordings[uuid]
+    set_span!(r, TimeSpan(Nanosecond(0), duration))
+    r
+end
+
+"""
+    Onda.upgrade_onda_format_from_v0_2_to_v0_3!(path, combine_annotation_key_value)
+
+Upgrade the Onda v0.2 dataset at `path` to a Onda v0.3 dataset, returning the
+upgraded `Dataset`. This upgrade process overwrites `path/recordings.msgpack.zst`
+with a v0.3-compliant version of this file; for safety's sake, the old v0.2 file
+is preserved at `path/old.recordings.msgpack.zst.backup`.
+
+A couple of the Onda v0.2 -> v0.3 changes require some special handling:
+
+- The `custom` field was removed from recording objects. This function thus writes out
+  a file at `path/recordings_custom.msgpack.zst` that contains a map of UUIDs to
+  corresponding recordings' `custom` values before deleting the `custom` field. This
+  file can be deserialized via `MsgPack.unpack(Onda.zstd_decompress(read("recordings_custom.msgpack.zst")))`.
+
+- Annotations no longer have a `key` field. Thus, each annotation's existing `key` and `value`
+  fields are combined into the single new `value` field via the provided callback
+  `combine_annotation_key_value(annotation_key, annotation_value)`.
+"""
+function upgrade_onda_format_from_v0_2_to_v0_3!(path, combine_annotation_key_value)
+    file_path = joinpath(path, "recordings.msgpack.zst")
+    bytes = zstd_decompress(read(file_path))
+    mv(file_path, joinpath(path, "old.recordings.msgpack.zst.backup"))
+    io = IOBuffer(bytes)
+    read(io, UInt8) == 0x92 || error("corrupt recordings.msgpack.zst")
+    header = MsgPack.unpack(io, Header)
+    v"0.2" <= header.onda_format_version < v"0.3" || error("unsupported original onda_format_version: $(header.onda_format_version)")
+    recordings = MsgPack.unpack(io, Dict{UUID,Any})
+    customs = Dict{UUID,Any}(uuid => recording["custom"] for (uuid, recording) in recordings)
+    write(joinpath(path, "recordings_custom.msgpack.zst"), zstd_compress(MsgPack.pack(customs)))
+    for (uuid, recording) in recordings
+        signal_stop_nanosecond = recording["duration_in_nanoseconds"]
+        for signal in values(recording["signals"])
+            signal["start_nanosecond"] = 0
+            signal["stop_nanosecond"] = signal_stop_nanosecond
+            signal["sample_offset_in_unit"] = 0.0
+            signal["sample_rate"] = float(signal["sample_rate"])
+        end
+        for annotation in recording["annotations"]
+            annotation["value"] = combine_annotation_key_value(annotation["key"], annotation["value"])
+            delete!(annotation, "key")
+        end
+        delete!(recording, "duration_in_nanoseconds")
+        delete!(recording, "custom")
+    end
+    fixed_recordings = MsgPack.unpack(MsgPack.pack(recordings), Dict{UUID,Recording})
+    dataset = Dataset(path, Header(v"0.3.0", true), fixed_recordings)
+    save_recordings_file(dataset)
+    return dataset
+end
+
 end # module