forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
apacheGH-36217: [MATLAB] Add arrow.array.TimestampArray (apache#36333)
### Rationale for this change We would like to extend the MATLAB interface to support timestamp data. ### What changes are included in this PR? 1. Added a new `arrow.array.TimestampArray` class that can be converted to/from a MATLAB `datetime` array. 2. Added a new type class called `arrow.type.TimestampType`. 3. Added a new enum class called `arrow.type.TimeUnit`. **Example** ```matlab >> dates = datetime(2023, 6, 27) + days(0:2)' dates = 3×1 datetime array 27-Jun-2023 28-Jun-2023 29-Jun-2023 >> timestampArray = arrow.array.TimestampArray(dates) timestampArray = [ 2023-06-27 00:00:00.000000, 2023-06-28 00:00:00.000000, 2023-06-29 00:00:00.000000 ] >> fromArrow = datetime(timestampArray) fromArrow = 3×1 datetime array 27-Jun-2023 28-Jun-2023 29-Jun-2023 >> isequal(dates, fromArrow) ans = logical 1 ``` `TimestampArray` uses `Microsecond` as the default `TimeUnit`. However, you can specify the `TimeUnit` via a name-value pair during construction. ```matlab >> timestampArray = arrow.array.TimestampArray(dates, TimeUnit="second") timestampArray = [ 2023-06-27 00:00:00, 2023-06-28 00:00:00, 2023-06-29 00:00:00 ] >> timestampArray.Type ans = TimestampType with properties: TimeZone: "" TimeUnit: Second ID: Timestamp BitWidth: 64 NumFields: 0 NumBuffers: 2 ``` The `TimestampArray` is timezone-aware if the MATLAB `datetime` array's TimeZone property is set: ```matlab >> dates = datetime(2023, 6, 27, TimeZone="America/Anchorage") + days(0:3)'; >> timestampArray = arrow.array.TimestampArray(dates); >> timestampArray.Type.TimeZone ans = "America/Anchorage" ``` Lastly, `arrow.array.TimestampArray` treats `NaT` as null values by default. However, users can control this behavior by supplying either `InferNulls` or `Valid` as name-value pairs. 
```matlab >> dates = [datetime(2023, 6, 27) NaT datetime(2023, 6, 29)]; >> timestampArray = arrow.array.TimestampArray(dates) timestampArray = [ 2023-06-27 00:00:00.000000, null, 2023-06-29 00:00:00.000000 ] >> timestampArray = arrow.array.TimestampArray(dates, Valid=3) timestampArray = [ null, null, 2023-06-29 00:00:00.000000 ] ``` ### Are these changes tested? Added three new test files: 1. `tTimestampArray.m` 2. `tTimestampType.m` 3. `tTimeUnit.m` ### Are there any user-facing changes? Yes. 1. Added `arrow.array.TimestampArray` ### Notes Thank you @kevingurney for the help! * Closes: apache#36217 Authored-by: Sarah Gilmore <sgilmore@mathworks.com> Signed-off-by: Sutou Kouhei <kou@clear-code.com>
- Loading branch information
1 parent
4cfe9fa
commit 0344a2c
Showing
15 changed files
with
753 additions
and
21 deletions.
There are no files selected for viewing
92 changes: 92 additions & 0 deletions
92
matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.cc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include "arrow/matlab/array/proxy/timestamp_array.h" | ||
|
||
#include "arrow/matlab/error/error.h" | ||
#include "arrow/matlab/bit/pack.h" | ||
#include "arrow/matlab/bit/unpack.h" | ||
|
||
#include "arrow/matlab/type/time_unit.h" | ||
#include "arrow/util/utf8.h" | ||
#include "arrow/type.h" | ||
#include "arrow/builder.h" | ||
|
||
|
||
namespace arrow::matlab::array::proxy { | ||
|
||
namespace { | ||
const uint8_t* getUnpackedValidityBitmap(const ::matlab::data::TypedArray<bool>& valid_elements) { | ||
const auto valid_elements_iterator(valid_elements.cbegin()); | ||
return reinterpret_cast<const uint8_t*>(valid_elements_iterator.operator->()); | ||
} | ||
} // anonymous namespace | ||
|
||
// Builds a TimestampArray proxy from the struct passed by the MATLAB constructor.
//
// The first constructor argument is read as a struct with these fields:
//   MatlabArray - int64 tick counts
//   Valid       - logical array, one validity flag per tick count
//   TimeZone    - time zone name (string)
//   TimeUnit    - time unit name (string), resolved by timeUnitFromString
//
// Failures are surfaced through the MATLAB_* error macros together with the
// error ID given at each call site.
libmexclass::proxy::MakeResult TimestampArray::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
    namespace mda = ::matlab::data;

    mda::StructArray args = constructor_arguments[0];

    // Pull the individual fields out of the argument struct.
    const mda::TypedArray<int64_t> ticks_mda = args[0]["MatlabArray"];
    const mda::TypedArray<bool> valid_mda = args[0]["Valid"];
    const mda::TypedArray<mda::MATLABString> timezone_mda = args[0]["TimeZone"];
    const mda::TypedArray<mda::MATLABString> time_unit_mda = args[0]["TimeUnit"];

    // MATLAB strings are UTF-16; Arrow expects the time zone name as UTF-8.
    MATLAB_ASSIGN_OR_ERROR(const auto timezone_utf8, arrow::util::UTF16StringToUTF8(timezone_mda[0]),
                           error::UNICODE_CONVERSION_ERROR_ID);

    // Resolve the time unit name to the Arrow enumeration value.
    MATLAB_ASSIGN_OR_ERROR(const auto unit, arrow::matlab::type::timeUnitFromString(time_unit_mda[0]),
                           error::UKNOWN_TIME_UNIT_ERROR_ID);

    // The builder is parameterized by the timestamp type (unit + time zone).
    auto timestamp_type = arrow::timestamp(unit, timezone_utf8);
    arrow::TimestampBuilder builder(timestamp_type, arrow::default_memory_pool());

    // Raw views over the MATLAB data. The validity flags are handed to the
    // builder unpacked (they are not bit-packed here).
    const auto tick_values = ticks_mda.cbegin().operator->();
    const auto num_elements = ticks_mda.getNumberOfElements();
    const uint8_t* valid_bytes = getUnpackedValidityBitmap(valid_mda);

    // Append every value along with its validity flag, then finalize the array.
    MATLAB_ERROR_IF_NOT_OK(builder.AppendValues(tick_values, num_elements, valid_bytes), error::APPEND_VALUES_ERROR_ID);
    MATLAB_ASSIGN_OR_ERROR(auto built_array, builder.Finish(), error::BUILD_ARRAY_ERROR_ID);

    return std::make_shared<arrow::matlab::array::proxy::TimestampArray>(built_array);
}
|
||
// Returns the array's raw int64 values to MATLAB as an N-by-1 int64 column.
//
// Only the raw values are returned here; validity information is not part of
// this output.
void TimestampArray::toMATLAB(libmexclass::proxy::method::Context& context) {
    namespace mda = ::matlab::data;

    const auto arrow_timestamps = std::static_pointer_cast<arrow::TimestampArray>(array);
    const auto length = static_cast<size_t>(array->length());

    // Half-open range [first, last) over the underlying values.
    const int64_t* const first = arrow_timestamps->raw_values();
    const int64_t* const last = first + length;

    // createArray copies the range into a freshly allocated MATLAB array.
    mda::ArrayFactory factory;
    mda::TypedArray<int64_t> column = factory.createArray({length, 1}, first, last);
    context.outputs[0] = column;
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#pragma once | ||
|
||
#include "arrow/array.h" | ||
|
||
#include "arrow/matlab/array/proxy/array.h" | ||
|
||
#include "libmexclass/proxy/Proxy.h" | ||
|
||
namespace arrow::matlab::array::proxy { | ||
|
||
class TimestampArray : public arrow::matlab::array::proxy::Array { | ||
public: | ||
TimestampArray(const std::shared_ptr<arrow::Array> timestamp_array) | ||
: arrow::matlab::array::proxy::Array() { | ||
array = timestamp_array; | ||
} | ||
|
||
static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); | ||
|
||
protected: | ||
|
||
void toMATLAB(libmexclass::proxy::method::Context& context) override; | ||
}; | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include "arrow/matlab/type/time_unit.h" | ||
#include "arrow/util/utf8.h" | ||
|
||
namespace arrow::matlab::type { | ||
|
||
// Translates a time unit name into the matching arrow::TimeUnit::type.
//
// Recognized names (exact, case-sensitive match): "Second", "Millisecond",
// "Microsecond", "Nanosecond". Any other input yields Status::Invalid; the
// offending name is appended to the message when it converts to UTF-8.
arrow::Result<arrow::TimeUnit::type> timeUnitFromString(const std::u16string& unit_str) {
    using Unit = arrow::TimeUnit::type;

    if (unit_str == u"Second") {
        return Unit::SECOND;
    }
    if (unit_str == u"Millisecond") {
        return Unit::MILLI;
    }
    if (unit_str == u"Microsecond") {
        return Unit::MICRO;
    }
    if (unit_str == u"Nanosecond") {
        return Unit::NANO;
    }

    // Unknown name: include it in the message only if it is convertible.
    std::string msg = "Unknown time unit string";
    const auto maybe_utf8 = arrow::util::UTF16StringToUTF8(unit_str);
    if (maybe_utf8.ok()) {
        msg += ": " + *maybe_utf8;
    }
    return arrow::Status::Invalid(msg);
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include "arrow/type_fwd.h" | ||
#include "arrow/result.h" | ||
|
||
#include <string> | ||
|
||
namespace arrow::matlab::type { | ||
|
||
// Converts a UTF-16 time unit name ("Second", "Millisecond", "Microsecond" or | ||
// "Nanosecond") into the corresponding arrow::TimeUnit::type. Any other name | ||
// produces an Invalid status instead of a value. | ||
arrow::Result<arrow::TimeUnit::type> timeUnitFromString(const std::u16string& unit_str); | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
% Licensed to the Apache Software Foundation (ASF) under one or more | ||
% contributor license agreements. See the NOTICE file distributed with | ||
% this work for additional information regarding copyright ownership. | ||
% The ASF licenses this file to you under the Apache License, Version | ||
% 2.0 (the "License"); you may not use this file except in compliance | ||
% with the License. You may obtain a copy of the License at | ||
% | ||
% http://www.apache.org/licenses/LICENSE-2.0 | ||
% | ||
% Unless required by applicable law or agreed to in writing, software | ||
% distributed under the License is distributed on an "AS IS" BASIS, | ||
% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||
% implied. See the License for the specific language governing | ||
% permissions and limitations under the License. | ||
|
||
classdef TimestampArray < arrow.array.Array
% arrow.array.TimestampArray
%
%   Array of timestamps constructed from a MATLAB datetime array.
%
%   A = arrow.array.TimestampArray(DATA) converts the datetime array DATA
%   using the default time unit (Microsecond).
%
%   Name-value arguments:
%     TimeUnit   - scalar arrow.type.TimeUnit (default: Microsecond)
%     InferNulls - scalar logical (default: true)
%     Valid      - forwarded to arrow.args.parseValidElements

    properties(Access=private)
        % Value placed in elements that are not valid when converting
        % back to a datetime array.
        NullSubstitutionValue = NaT;
    end

    properties(SetAccess=private, GetAccess=public)
        % Temporary default value; the constructor replaces it.
        Type = arrow.type.TimestampType
    end

    methods
        function obj = TimestampArray(data, opts)
            arguments
                data
                opts.TimeUnit(1, 1) arrow.type.TimeUnit = arrow.type.TimeUnit.Microsecond
                opts.InferNulls(1, 1) logical = true
                opts.Valid
            end
            arrow.args.validateTypeAndShape(data, "datetime");
            validMask = arrow.args.parseValidElements(data, opts);
            epochTicks = arrow.array.TimestampArray.convertToEpochTime(data, opts.TimeUnit);
            zone = string(data.TimeZone);

            % Everything the C++ proxy needs is bundled into one struct.
            proxyArgs = struct(MatlabArray=epochTicks, Valid=validMask, ...
                TimeZone=zone, TimeUnit=string(opts.TimeUnit));
            obj@arrow.array.Array("Name", "arrow.array.proxy.TimestampArray", "ConstructorArguments", {proxyArgs});
            obj.Type = arrow.type.TimestampType(TimeUnit=opts.TimeUnit, TimeZone=zone);
        end

        function dates = toMATLAB(obj)
            % Converts the array back into a MATLAB datetime array.
            ticks = obj.Proxy.toMATLAB();

            % Tick counts are interpreted relative to Jan-1-1970 UTC.
            unixEpoch = datetime(1970, 1, 1, TimeZone="UTC");

            timestampType = obj.Type;
            zone = timestampType.TimeZone;
            ticksPerSecond = timestampType.TimeUnit.TicksPerSecond;

            dates = datetime(ticks, ConvertFrom="epochtime", Epoch=unixEpoch, ...
                TimeZone=zone, TicksPerSecond=ticksPerSecond);

            % Elements that are not valid become the substitution value.
            isNull = ~obj.Valid;
            dates(isNull) = obj.NullSubstitutionValue;
        end

        function dates = datetime(obj)
            % Same conversion as toMATLAB.
            dates = obj.toMATLAB();
        end
    end

    methods (Static, Access = private)
        function time = convertToEpochTime(dates, units)
            % Converts a datetime array into int64 tick counts since the
            % epoch, using units.TicksPerSecond as the resolution. NaT
            % elements are left as 0.

            isValidDate = ~isnat(dates);
            time = zeros(size(dates), "int64");

            % convertTo uses Jan-1-1970 as the default epoch. If the input
            % datetime array has a TimeZone, the epoch is Jan-1-1970 UTC.
            %
            % TODO: convertTo may error if the datetime is 2^63-1 before or
            % after the epoch. We should throw a custom error in this case.
            time(isValidDate) = convertTo(dates(isValidDate), "epochtime", ...
                TicksPerSecond=units.TicksPerSecond);
        end
    end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.