From d7b3d4f9bbc9e9f618ddf15fbfe8ccc4c6d5b3ce Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Wed, 21 Jun 2023 22:36:25 +0300 Subject: [PATCH] GH-36141: [Go] Support large and fixed types in csv (#36142) ### Rationale for this change ### What changes are included in this PR? ### Are these changes tested? Yes ### Are there any user-facing changes? Yes * Closes: #36141 Authored-by: izveigor Signed-off-by: Matt Topol --- go/arrow/csv/common.go | 6 +- go/arrow/csv/reader.go | 134 ++++++++++++++++++++++++++++++- go/arrow/csv/reader_test.go | 22 ++++- go/arrow/csv/testdata/header.csv | 8 +- go/arrow/csv/testdata/types.csv | 8 +- go/arrow/csv/transformer.go | 65 +++++++++++++++ go/arrow/csv/writer_test.go | 66 ++++++++++----- 7 files changed, 276 insertions(+), 33 deletions(-) diff --git a/go/arrow/csv/common.go b/go/arrow/csv/common.go index 72a704de4d99a..48224d7ad25bd 100644 --- a/go/arrow/csv/common.go +++ b/go/arrow/csv/common.go @@ -218,12 +218,12 @@ func validate(schema *arrow.Schema) { case *arrow.Int8Type, *arrow.Int16Type, *arrow.Int32Type, *arrow.Int64Type: case *arrow.Uint8Type, *arrow.Uint16Type, *arrow.Uint32Type, *arrow.Uint64Type: case *arrow.Float16Type, *arrow.Float32Type, *arrow.Float64Type: - case *arrow.StringType: + case *arrow.StringType, *arrow.LargeStringType: case *arrow.TimestampType: case *arrow.Date32Type, *arrow.Date64Type: case *arrow.Decimal128Type, *arrow.Decimal256Type: - case *arrow.ListType: - case *arrow.BinaryType: + case *arrow.ListType, *arrow.LargeListType, *arrow.FixedSizeListType: + case *arrow.BinaryType, *arrow.LargeBinaryType, *arrow.FixedSizeBinaryType: case arrow.ExtensionType: default: panic(fmt.Errorf("arrow/csv: field %d (%s) has invalid data type %T", i, f.Name, ft)) diff --git a/go/arrow/csv/reader.go b/go/arrow/csv/reader.go index d525fcb71e13c..8843cac87ca1a 100644 --- a/go/arrow/csv/reader.go +++ b/go/arrow/csv/reader.go @@ -451,6 +451,21 @@ func (r *Reader) initFieldConverter(bldr array.Builder) func(string) { bldr.(*array.StringBuilder).Append(str) } } + case *arrow.LargeStringType: + // specialize the implementation when we know we cannot have nulls + if r.stringsCanBeNull { + return func(str string) { + if r.isNull(str) { + bldr.AppendNull() + } else { + bldr.(*array.LargeStringBuilder).Append(str) + } + } + } else { + return func(str string) { + bldr.(*array.LargeStringBuilder).Append(str) + } + } case *arrow.TimestampType: return func(str string) { r.parseTimestamp(bldr, str, dt.Unit) @@ -475,10 +490,26 @@ func (r *Reader) initFieldConverter(bldr array.Builder) func(string) { return func(s string) { r.parseList(bldr, s) } + case *arrow.LargeListType: + return func(s string) { + r.parseLargeList(bldr, s) + } + case *arrow.FixedSizeListType: + return func(s string) { + r.parseFixedSizeList(bldr, s, int(dt.Len())) + } case *arrow.BinaryType: return func(s string) { r.parseBinaryType(bldr, s) } + case *arrow.LargeBinaryType: + return func(s string) { + r.parseLargeBinaryType(bldr, s) + } + case *arrow.FixedSizeBinaryType: + return func(s string) { + r.parseFixedSizeBinaryType(bldr, s, dt.Bytes()) + } case arrow.ExtensionType: return func(s string) { r.parseExtension(bldr, s) @@ -783,6 +814,68 @@ func (r *Reader) parseList(field array.Builder, str string) { } } +func (r *Reader) parseLargeList(field array.Builder, str string) { + if r.isNull(str) { + field.AppendNull() + return + } + if !(strings.HasPrefix(str, "{") && strings.HasSuffix(str, "}")) { + r.err = errors.New("invalid list format. should start with '{' and end with '}'") + return + } + str = strings.Trim(str, "{}") + largeListBldr := field.(*array.LargeListBuilder) + largeListBldr.Append(true) + if len(str) == 0 { + // we don't want to create the csv reader if we already know the + // string is empty + return + } + valueBldr := largeListBldr.ValueBuilder() + reader := csv.NewReader(strings.NewReader(str)) + items, err := reader.Read() + if err != nil { + r.err = err + return + } + for _, str := range items { + r.initFieldConverter(valueBldr)(str) + } +} + +func (r *Reader) parseFixedSizeList(field array.Builder, str string, n int) { + if r.isNull(str) { + field.AppendNull() + return + } + if !(strings.HasPrefix(str, "{") && strings.HasSuffix(str, "}")) { + r.err = errors.New("invalid list format. should start with '{' and end with '}'") + return + } + str = strings.Trim(str, "{}") + fixedSizeListBldr := field.(*array.FixedSizeListBuilder) + fixedSizeListBldr.Append(true) + if len(str) == 0 { + // we don't want to create the csv reader if we already know the + // string is empty + return + } + valueBldr := fixedSizeListBldr.ValueBuilder() + reader := csv.NewReader(strings.NewReader(str)) + items, err := reader.Read() + if err != nil { + r.err = err + return + } + if len(items) == n { + for _, str := range items { + r.initFieldConverter(valueBldr)(str) + } + } else { + r.err = fmt.Errorf("%w: fixed size list items should match the fixed size list length, expected %d, got %d", arrow.ErrInvalid, n, len(items)) + } +} + func (r *Reader) parseBinaryType(field array.Builder, str string) { // specialize the implementation when we know we cannot have nulls if r.isNull(str) { @@ -791,11 +884,50 @@ func (r *Reader) parseBinaryType(field array.Builder, str string) { } decodedVal, err := base64.StdEncoding.DecodeString(str) if err != nil { - panic("cannot decode base64 string " + str) + r.err = fmt.Errorf("cannot decode base64 string %s", str) + field.AppendNull() + return } + field.(*array.BinaryBuilder).Append(decodedVal) } +func (r *Reader) parseLargeBinaryType(field array.Builder, str string) { + // specialize the implementation when we know we cannot have nulls + if r.isNull(str) { + field.AppendNull() + return + } + decodedVal, err := base64.StdEncoding.DecodeString(str) + if err != nil { + r.err = fmt.Errorf("cannot decode base64 string %s", str) + field.AppendNull() + return + } + + field.(*array.BinaryBuilder).Append(decodedVal) +} + +func (r *Reader) parseFixedSizeBinaryType(field array.Builder, str string, byteWidth int) { + // specialize the implementation when we know we cannot have nulls + if r.isNull(str) { + field.AppendNull() + return + } + decodedVal, err := base64.StdEncoding.DecodeString(str) + if err != nil { + r.err = fmt.Errorf("cannot decode base64 string %s", str) + field.AppendNull() + return + } + + if len(decodedVal) == byteWidth { + field.(*array.FixedSizeBinaryBuilder).Append(decodedVal) + } else { + r.err = fmt.Errorf("%w: the length of fixed size binary value should match the fixed size binary byte width, expected %d, got %d", arrow.ErrInvalid, byteWidth, len(decodedVal)) + } +} + func (r *Reader) parseExtension(field array.Builder, str string) { if r.isNull(str) { field.AppendNull() diff --git a/go/arrow/csv/reader_test.go b/go/arrow/csv/reader_test.go index c4d55ba2703b3..d6d635163226e 100644 --- a/go/arrow/csv/reader_test.go +++ b/go/arrow/csv/reader_test.go @@ -348,9 +348,14 @@ func testCSVReader(t *testing.T, filepath string, withHeader bool, stringsCanBeN {Name: "f32", Type: arrow.PrimitiveTypes.Float32}, {Name: "f64", Type: arrow.PrimitiveTypes.Float64}, {Name: "str", Type: arrow.BinaryTypes.String}, + {Name: "large_str", Type: arrow.BinaryTypes.LargeString}, {Name: "ts", Type: arrow.FixedWidthTypes.Timestamp_ms}, {Name: "list(i64)", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64)}, + {Name: "large_list(i64)", Type: arrow.LargeListOf(arrow.PrimitiveTypes.Int64)}, + {Name: "fixed_size_list(i64)", Type: arrow.FixedSizeListOf(3, arrow.PrimitiveTypes.Int64)}, {Name: "binary", Type: arrow.BinaryTypes.Binary}, + {Name: "large_binary", Type: arrow.BinaryTypes.LargeBinary}, + {Name: "fixed_size_binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 3}}, {Name: "uuid", Type: types.NewUUIDType()}, }, nil, @@ -406,9 +411,14 @@ rec[0]["f16"]: [1.0996094] rec[0]["f32"]: [1.1] rec[0]["f64"]: [1.1] rec[0]["str"]: ["str-1"] +rec[0]["large_str"]: ["str-1"] rec[0]["ts"]: [1652054461000] rec[0]["list(i64)"]: [[1 2 3]] +rec[0]["large_list(i64)"]: [[1 2 3]] +rec[0]["fixed_size_list(i64)"]: [[1 2 3]] rec[0]["binary"]: ["\x00\x01\x02"] +rec[0]["large_binary"]: ["\x00\x01\x02"] +rec[0]["fixed_size_binary"]: ["\x00\x01\x02"] rec[0]["uuid"]: ["00000000-0000-0000-0000-000000000001"] rec[1]["bool"]: [false] rec[1]["i8"]: [-2] @@ -423,9 +433,14 @@ rec[1]["f16"]: [2.1992188] rec[1]["f32"]: [2.2] rec[1]["f64"]: [2.2] rec[1]["str"]: [%s] +rec[1]["large_str"]: [%s] rec[1]["ts"]: [1652140799000] rec[1]["list(i64)"]: [[]] +rec[1]["large_list(i64)"]: [[]] +rec[1]["fixed_size_list(i64)"]: [[4 5 6]] rec[1]["binary"]: [(null)] +rec[1]["large_binary"]: [(null)] +rec[1]["fixed_size_binary"]: [(null)] rec[1]["uuid"]: ["00000000-0000-0000-0000-000000000002"] rec[2]["bool"]: [(null)] rec[2]["i8"]: [(null)] @@ -440,11 +455,16 @@ rec[2]["f16"]: [(null)] rec[2]["f32"]: [(null)] rec[2]["f64"]: [(null)] rec[2]["str"]: [%s] +rec[2]["large_str"]: [%s] rec[2]["ts"]: [(null)] rec[2]["list(i64)"]: [(null)] +rec[2]["large_list(i64)"]: [(null)] +rec[2]["fixed_size_list(i64)"]: [(null)] rec[2]["binary"]: [(null)] +rec[2]["large_binary"]: [(null)] +rec[2]["fixed_size_binary"]: [(null)] rec[2]["uuid"]: [(null)] -`, str1Value, str2Value) +`, str1Value, str1Value, str2Value, str2Value) got, want := out.String(), want require.Equal(t, want, got) diff --git a/go/arrow/csv/testdata/header.csv b/go/arrow/csv/testdata/header.csv index d0a04f2635fbc..50be4f5e4daca 100644 --- a/go/arrow/csv/testdata/header.csv +++ b/go/arrow/csv/testdata/header.csv @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # -bool;i8;i16;i32;i64;u8;u16;u32;u64;f16;f32;f64;str;ts;list(i64);binary;uuid -true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;2022-05-09T00:01:01;{1,2,3};AAEC;00000000-0000-0000-0000-000000000001 -false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;2022-05-09T23:59:59;{};;00000000-0000-0000-0000-000000000002 -null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file +bool;i8;i16;i32;i64;u8;u16;u32;u64;f16;f32;f64;str;large_str;ts;list(i64);large_list(i64);fixed_size_list(i64);binary;large_binary;fixed_size_binary;uuid +true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;str-1;2022-05-09T00:01:01;{1,2,3};{1,2,3};{1,2,3};AAEC;AAEC;AAEC;00000000-0000-0000-0000-000000000001 +false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;;2022-05-09T23:59:59;{};{};{4,5,6};;;;00000000-0000-0000-0000-000000000002 +null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file diff --git a/go/arrow/csv/testdata/types.csv b/go/arrow/csv/testdata/types.csv index 2b09760ff6978..d32941f4b214d 100644 --- a/go/arrow/csv/testdata/types.csv +++ b/go/arrow/csv/testdata/types.csv @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # -## supported types: bool;int8;int16;int32;int64;uint8;uint16;uint32;uint64;float16;float32;float64;string;timestamp;binary;uuid -true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;2022-05-09T00:01:01;{1,2,3};AAEC;00000000-0000-0000-0000-000000000001 -false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;2022-05-09T23:59:59;{};;00000000-0000-0000-0000-000000000002 -null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file +## supported types: bool;int8;int16;int32;int64;uint8;uint16;uint32;uint64;float16;float32;float64;string;large_string;timestamp;list(i64);large_list(i64);fixed_size_list(i64);binary;large_binary;fixed_size_binary;uuid +true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;str-1;2022-05-09T00:01:01;{1,2,3};{1,2,3};{1,2,3};AAEC;AAEC;AAEC;00000000-0000-0000-0000-000000000001 +false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;;2022-05-09T23:59:59;{};{};{4,5,6};;;;00000000-0000-0000-0000-000000000002 +null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file diff --git a/go/arrow/csv/transformer.go b/go/arrow/csv/transformer.go index bd47330fe8b7b..886282d49ff80 100644 --- a/go/arrow/csv/transformer.go +++ b/go/arrow/csv/transformer.go @@ -149,6 +149,15 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) [] res[i] = w.nullValue } } + case *arrow.LargeStringType: + arr := col.(*array.LargeString) + for i := 0; i < arr.Len(); i++ { + if arr.IsValid(i) { + res[i] = arr.Value(i) + } else { + res[i] = w.nullValue + } + } case *arrow.Date32Type: arr := col.(*array.Date32) for i := 0; i < arr.Len(); i++ { @@ -225,6 +234,44 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) [] res[i] = w.nullValue } } + case *arrow.LargeListType: + arr := col.(*array.LargeList) + listVals, offsets := arr.ListValues(), arr.Offsets() + for i := 0; i < arr.Len(); i++ { + if arr.IsValid(i) { + list := array.NewSlice(listVals, int64(offsets[i]), int64(offsets[i+1])) + var b bytes.Buffer + b.Write([]byte{'{'}) + writer := csv.NewWriter(&b) + writer.Write(w.transformColToStringArr(list.DataType(), list)) + writer.Flush() + b.Truncate(b.Len() - 1) + b.Write([]byte{'}'}) + res[i] = b.String() + list.Release() + } else { + res[i] = w.nullValue + } + } + case *arrow.FixedSizeListType: + arr := col.(*array.FixedSizeList) + listVals := arr.ListValues() + for i := 0; i < arr.Len(); i++ { + if arr.IsValid(i) { + list := array.NewSlice(listVals, int64((arr.Len()-1)*i), int64((arr.Len()-1)*(i+1))) + var b bytes.Buffer + b.Write([]byte{'{'}) + writer := csv.NewWriter(&b) + writer.Write(w.transformColToStringArr(list.DataType(), list)) + writer.Flush() + b.Truncate(b.Len() - 1) + b.Write([]byte{'}'}) + res[i] = b.String() + list.Release() + } else { + res[i] = w.nullValue + } + } case *arrow.BinaryType: arr := col.(*array.Binary) for i := 0; i < arr.Len(); i++ { @@ -234,6 +281,24 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) [] res[i] = w.nullValue } } + case *arrow.LargeBinaryType: + arr := col.(*array.LargeBinary) + for i := 0; i < arr.Len(); i++ { + if arr.IsValid(i) { + res[i] = base64.StdEncoding.EncodeToString(arr.Value(i)) + } else { + res[i] = w.nullValue + } + } + case *arrow.FixedSizeBinaryType: + arr := col.(*array.FixedSizeBinary) + for i := 0; i < arr.Len(); i++ { + if arr.IsValid(i) { + res[i] = base64.StdEncoding.EncodeToString(arr.Value(i)) + } else { + res[i] = w.nullValue + } + } case arrow.ExtensionType: arr := col.(array.ExtensionArray) for i := 0; i < arr.Len(); i++ { diff --git a/go/arrow/csv/writer_test.go b/go/arrow/csv/writer_test.go index cfda625e793f0..cfce4dd0a6142 100644 --- a/go/arrow/csv/writer_test.go +++ b/go/arrow/csv/writer_test.go @@ -134,18 +134,18 @@ func Example_writer() { var ( fullData = [][]string{ - {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f16", "f32", "f64", "str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", "binary", "uuid"}, - {"true", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", "0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", "-123.45", "{1,2,3}", "AAEC", "00000000-0000-0000-0000-000000000001"}, - {"false", "0", "0", "0", "0", "1", "1", "1", "1", "0.099975586", "0.1", "0.1", "str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", "{4,5,6}", "AwQF", "00000000-0000-0000-0000-000000000002"}, - {"true", "1", "1", "1", "1", "2", "2", "2", "2", "0.19995117", "0.2", "0.2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", "123.45", "{7,8,9}", "", "00000000-0000-0000-0000-000000000003"}, - {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal}, + {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f16", "f32", "f64", "str", "large_str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", "large_list(i64)", "fixed_size_list(i64)", "binary", "large_binary", "fixed_size_binary", "uuid"}, + {"true", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", "0", "str-0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", "-123.45", "{1,2,3}", "{1,2,3}", "{1,2,3}", "AAEC", "AAEC", "AAEC", "00000000-0000-0000-0000-000000000001"}, + {"false", "0", "0", "0", "0", "1", "1", "1", "1", "0.099975586", "0.1", "0.1", "str-1", "str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", "{4,5,6}", "{4,5,6}", "{4,5,6}", "AwQF", "AwQF", "AwQF", "00000000-0000-0000-0000-000000000002"}, + {"true", "1", "1", "1", "1", "2", "2", "2", "2", "0.19995117", "0.2", "0.2", "str-2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", "123.45", "{7,8,9}", "{7,8,9}", "{7,8,9}", "", "", "AAAA", "00000000-0000-0000-0000-000000000003"}, + {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal}, } bananaData = [][]string{ - {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f16", "f32", "f64", "str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", "binary", "uuid"}, - {"BANANA", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", "0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", "-123.45", "{1,2,3}", "AAEC", "00000000-0000-0000-0000-000000000001"}, - {"MANGO", "0", "0", "0", "0", "1", "1", "1", "1", "0.099975586", "0.1", "0.1", "str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", "{4,5,6}", "AwQF", "00000000-0000-0000-0000-000000000002"}, - {"BANANA", "1", "1", "1", "1", "2", "2", "2", "2", "0.19995117", "0.2", "0.2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", "123.45", "{7,8,9}", "", "00000000-0000-0000-0000-000000000003"}, - {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal}, + {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f16", "f32", "f64", "str", "large_str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", "large_list(i64)", "fixed_size_list(i64)", "binary", "large_binary", "fixed_size_binary", "uuid"}, + {"BANANA", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", "0", "str-0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", "-123.45", "{1,2,3}", "{1,2,3}", "{1,2,3}", "AAEC", "AAEC", "AAEC", "00000000-0000-0000-0000-000000000001"}, + {"MANGO", "0", "0", "0", "0", "1", "1", "1", "1", "0.099975586", "0.1", "0.1", "str-1", "str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", "{4,5,6}", "{4,5,6}", "{4,5,6}", "AwQF", "AwQF", "AwQF", "00000000-0000-0000-0000-000000000002"}, + {"BANANA", "1", "1", "1", "1", "2", "2", "2", "2", "0.19995117", "0.2", "0.2", "str-2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", "123.45", "{7,8,9}", "{7,8,9}", "{7,8,9}", "", "", "AAAA", "00000000-0000-0000-0000-000000000003"}, + {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal}, } ) @@ -217,13 +217,18 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo {Name: "f32", Type: arrow.PrimitiveTypes.Float32}, {Name: "f64", Type: arrow.PrimitiveTypes.Float64}, {Name: "str", Type: arrow.BinaryTypes.String}, + {Name: "large_str", Type: arrow.BinaryTypes.LargeString}, {Name: "ts_s", Type: arrow.FixedWidthTypes.Timestamp_s}, {Name: "d32", Type: arrow.FixedWidthTypes.Date32}, {Name: "d64", Type: arrow.FixedWidthTypes.Date64}, {Name: "dec128", Type: &arrow.Decimal128Type{Precision: 5, Scale: 2}}, {Name: "dec256", Type: &arrow.Decimal256Type{Precision: 5, Scale: 2}}, {Name: "list(i64)", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64)}, + {Name: "large_list(i64)", Type: arrow.LargeListOf(arrow.PrimitiveTypes.Int64)}, + {Name: "fixed_size_list(i64)", Type: arrow.FixedSizeListOf(3, arrow.PrimitiveTypes.Int64)}, {Name: "binary", Type: arrow.BinaryTypes.Binary}, + {Name: "large_binary", Type: arrow.BinaryTypes.LargeBinary}, + {Name: "fixed_size_binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 3}}, {Name: "uuid", Type: types.NewUUIDType()}, }, nil, @@ -245,12 +250,13 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo b.Field(10).(*array.Float32Builder).AppendValues([]float32{0.0, 0.1, 0.2}, nil) b.Field(11).(*array.Float64Builder).AppendValues([]float64{0.0, 0.1, 0.2}, nil) b.Field(12).(*array.StringBuilder).AppendValues([]string{"str-0", "str-1", "str-2"}, nil) - b.Field(13).(*array.TimestampBuilder).AppendValues(genTimestamps(arrow.Second), nil) - b.Field(14).(*array.Date32Builder).AppendValues([]arrow.Date32{17304, 19304, 20304}, nil) - b.Field(15).(*array.Date64Builder).AppendValues([]arrow.Date64{1840400000000, 1940400000000, 2040400000000}, nil) - b.Field(16).(*array.Decimal128Builder).AppendValues([]decimal128.Num{decimal128.FromI64(-12345), decimal128.FromI64(0), decimal128.FromI64(12345)}, nil) - b.Field(17).(*array.Decimal256Builder).AppendValues([]decimal256.Num{decimal256.FromI64(-12345), decimal256.FromI64(0), decimal256.FromI64(12345)}, nil) - listBuilder := b.Field(18).(*array.ListBuilder) + b.Field(13).(*array.LargeStringBuilder).AppendValues([]string{"str-0", "str-1", "str-2"}, nil) + b.Field(14).(*array.TimestampBuilder).AppendValues(genTimestamps(arrow.Second), nil) + b.Field(15).(*array.Date32Builder).AppendValues([]arrow.Date32{17304, 19304, 20304}, nil) + b.Field(16).(*array.Date64Builder).AppendValues([]arrow.Date64{1840400000000, 1940400000000, 2040400000000}, nil) + b.Field(17).(*array.Decimal128Builder).AppendValues([]decimal128.Num{decimal128.FromI64(-12345), decimal128.FromI64(0), decimal128.FromI64(12345)}, nil) + b.Field(18).(*array.Decimal256Builder).AppendValues([]decimal256.Num{decimal256.FromI64(-12345), decimal256.FromI64(0), decimal256.FromI64(12345)}, nil) + listBuilder := b.Field(19).(*array.ListBuilder) listBuilderInt64 := listBuilder.ValueBuilder().(*array.Int64Builder) listBuilder.Append(true) listBuilderInt64.AppendValues([]int64{1, 2, 3}, nil) @@ -258,8 +264,26 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo listBuilderInt64.AppendValues([]int64{4, 5, 6}, nil) listBuilder.Append(true) listBuilderInt64.AppendValues([]int64{7, 8, 9}, nil) - b.Field(19).(*array.BinaryBuilder).AppendValues([][]byte{{0, 1, 2}, {3, 4, 5}, {}}, nil) - b.Field(20).(*types.UUIDBuilder).AppendValues([]uuid.UUID{uuid.MustParse("00000000-0000-0000-0000-000000000001"), uuid.MustParse("00000000-0000-0000-0000-000000000002"), uuid.MustParse("00000000-0000-0000-0000-000000000003")}, nil) + largeListBuilder := b.Field(20).(*array.LargeListBuilder) + largeListBuilderInt64 := largeListBuilder.ValueBuilder().(*array.Int64Builder) + largeListBuilder.Append(true) + largeListBuilderInt64.AppendValues([]int64{1, 2, 3}, nil) + largeListBuilder.Append(true) + largeListBuilderInt64.AppendValues([]int64{4, 5, 6}, nil) + largeListBuilder.Append(true) + largeListBuilderInt64.AppendValues([]int64{7, 8, 9}, nil) + fixedSizeListBuilder := b.Field(21).(*array.FixedSizeListBuilder) + fixedSizeListBuilderInt64 := fixedSizeListBuilder.ValueBuilder().(*array.Int64Builder) + fixedSizeListBuilder.Append(true) + fixedSizeListBuilderInt64.AppendValues([]int64{1, 2, 3}, nil) + fixedSizeListBuilder.Append(true) + fixedSizeListBuilderInt64.AppendValues([]int64{4, 5, 6}, nil) + fixedSizeListBuilder.Append(true) + fixedSizeListBuilderInt64.AppendValues([]int64{7, 8, 9}, nil) + b.Field(22).(*array.BinaryBuilder).AppendValues([][]byte{{0, 1, 2}, {3, 4, 5}, {}}, nil) + b.Field(23).(*array.BinaryBuilder).AppendValues([][]byte{{0, 1, 2}, {3, 4, 5}, {}}, nil) + b.Field(24).(*array.FixedSizeBinaryBuilder).AppendValues([][]byte{{0, 1, 2}, {3, 4, 5}, {}}, nil) + b.Field(25).(*types.UUIDBuilder).AppendValues([]uuid.UUID{uuid.MustParse("00000000-0000-0000-0000-000000000001"), uuid.MustParse("00000000-0000-0000-0000-000000000002"), uuid.MustParse("00000000-0000-0000-0000-000000000003")}, nil) for _, field := range b.Fields() { field.AppendNull() @@ -353,6 +377,7 @@ func BenchmarkWrite(b *testing.B) { {Name: "f32", Type: arrow.PrimitiveTypes.Float32}, {Name: "f64", Type: arrow.PrimitiveTypes.Float64}, {Name: "str", Type: arrow.BinaryTypes.String}, + {Name: "large_str", Type: arrow.BinaryTypes.LargeString}, {Name: "dec128", Type: &arrow.Decimal128Type{Precision: 4, Scale: 3}}, {Name: "dec128", Type: &arrow.Decimal256Type{Precision: 4, Scale: 3}}, }, @@ -377,8 +402,9 @@ func BenchmarkWrite(b *testing.B) { bldr.Field(10).(*array.Float32Builder).Append(float32(i)) bldr.Field(11).(*array.Float64Builder).Append(float64(i)) bldr.Field(12).(*array.StringBuilder).Append(fmt.Sprintf("str-%d", i)) - bldr.Field(13).(*array.Decimal128Builder).Append(decimal128.FromI64(int64(i))) - bldr.Field(14).(*array.Decimal256Builder).Append(decimal256.FromI64(int64(i))) + bldr.Field(13).(*array.LargeStringBuilder).Append(fmt.Sprintf("str-%d", i)) + bldr.Field(14).(*array.Decimal128Builder).Append(decimal128.FromI64(int64(i))) + bldr.Field(15).(*array.Decimal256Builder).Append(decimal256.FromI64(int64(i))) } rec := bldr.NewRecord()