From 60ebe641d6609d9b490ab258c05eeda4ffea09de Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 11 Dec 2024 13:58:59 +0100 Subject: [PATCH 1/8] implicit conversion from Char to String in DataColumn.convertTo and DataFrame.convert() --- .../kotlinx/dataframe/impl/api/convert.kt | 8 ++++++- .../kotlinx/dataframe/api/convert.kt | 22 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt index af4e6d3b9a..40c7a6a523 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt @@ -369,7 +369,13 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? = n Char::class -> when (toClass) { Int::class -> convert { it.code } - else -> null + + else -> // convert char to string and then to target type + getConverter(typeOf(), to, options)?.let { stringConverter -> + convert { + stringConverter(it.toString()) + } + } } Int::class -> when (toClass) { diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt index 47c49736db..55d4a802ea 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt @@ -3,6 +3,7 @@ package org.jetbrains.kotlinx.dataframe.api import io.kotest.assertions.throwables.shouldNotThrow import io.kotest.assertions.throwables.shouldThrow import io.kotest.matchers.shouldBe +import io.kotest.matchers.shouldNotBe import kotlinx.datetime.Clock import kotlinx.datetime.Instant import kotlinx.datetime.LocalTime @@ -69,6 +70,20 @@ class ConvertTests { @Test fun `convert string to enum`() { columnOf("A", "B").convertTo() shouldBe columnOf(EnumClass.A, EnumClass.B) + + 
dataFrameOf(columnOf("A", "B") named "colA") + .convert("colA").to() + .getColumn("colA") shouldBe columnOf(EnumClass.A, EnumClass.B).named("colA") + } + + @Test + fun `convert char to enum`() { + // Char -> String -> Enum + columnOf('A', 'B').convertTo() shouldBe columnOf(EnumClass.A, EnumClass.B) + + dataFrameOf(columnOf('A', 'B') named "colA") + .convert("colA").to() + .getColumn("colA") shouldBe columnOf(EnumClass.A, EnumClass.B).named("colA") } @JvmInline @@ -199,6 +214,13 @@ class ConvertTests { val col = columnOf(65, 66) col.convertTo() shouldBe columnOf('A', 'B') col.convertTo().convertTo() shouldBe col + + // this means + columnOf('1', '2').convertToInt() shouldNotBe columnOf(1, 2) + columnOf('1', '2').convertToInt() shouldBe columnOf(49, 50) + + // but + columnOf('1', '2').convertToString().convertToInt() shouldBe columnOf(1, 2) } @Test From ff1f42891fea7611c805d7383ffb8ac052f43323 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 11 Dec 2024 14:40:27 +0100 Subject: [PATCH 2/8] introducing parsing of Char? columns. 
It works the same as String parsing, failing when the returned type is Char or String --- core/api/core.api | 4 ++ .../jetbrains/kotlinx/dataframe/api/parse.kt | 40 +++++++++++++++++++ .../kotlinx/dataframe/impl/api/convert.kt | 8 +++- .../kotlinx/dataframe/impl/api/parse.kt | 13 ++---- .../kotlinx/dataframe/api/convert.kt | 24 +++++++++++ .../jetbrains/kotlinx/dataframe/api/parse.kt | 16 ++++++++ .../kotlinx/dataframe/io/ParserTests.kt | 6 +++ .../jetbrains/kotlinx/dataframe/api/parse.kt | 40 ++++++++++++++++++- .../kotlinx/dataframe/impl/api/parse.kt | 13 ++---- .../kotlinx/dataframe/api/convert.kt | 2 + .../jetbrains/kotlinx/dataframe/api/parse.kt | 18 +++++++++ .../kotlinx/dataframe/io/ParserTests.kt | 6 +++ 12 files changed, 169 insertions(+), 21 deletions(-) diff --git a/core/api/core.api b/core/api/core.api index 7d1a0b3d47..8e39418c71 100644 --- a/core/api/core.api +++ b/core/api/core.api @@ -3547,8 +3547,12 @@ public final class org/jetbrains/kotlinx/dataframe/api/ParseKt { public static synthetic fun parse$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Lorg/jetbrains/kotlinx/dataframe/columns/ColumnReference;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; public static final fun parseAnyFrameNullable (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; public static synthetic fun parseAnyFrameNullable$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; + public static final fun parseChar (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; + public static synthetic fun parseChar$default 
(Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; public static final fun tryParse (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; public static synthetic fun tryParse$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; + public static final fun tryParseChar (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; + public static synthetic fun tryParseChar$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; } public final class org/jetbrains/kotlinx/dataframe/api/ParserOptions { diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt index 3fa7142898..e2de48c7e6 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt @@ -20,6 +20,7 @@ import java.time.format.DateTimeFormatter import java.util.Locale import kotlin.reflect.KProperty import kotlin.reflect.KType +import kotlin.reflect.typeOf import kotlin.uuid.ExperimentalUuidApi import kotlin.uuid.Uuid @@ -312,6 +313,28 @@ public class ParserOptions( * @return a new column with parsed values */ public fun DataColumn.tryParse(options: ParserOptions? = null): DataColumn<*> = tryParseImpl(options) +/** + * Tries to parse a column of chars into a column of a different type. 
+ * Each parser in [Parsers] is run in order until a valid parser is found, + * a.k.a. that parser was able to parse all values in the column successfully. If a parser + * fails to parse any value, the next parser is tried. If all the others fail, the final parser + * returns strings. + * + * Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped. + * + * @param options options for parsing, like providing a locale or a custom date-time formatter + * @throws IllegalStateException if no valid parser is found (unlikely, unless the `String` parser is disabled) + * @return a new column with parsed values + */ +@JvmName("tryParseChar") +public fun DataColumn.tryParse(options: ParserOptions? = null): DataColumn<*> { + // skip the Char parser, as we're trying to parse away from Char + val providedSkipTypes = options?.skipTypes ?: DataFrame.parser.skipTypes + val parserOptions = (options ?: ParserOptions()).copy(skipTypes = providedSkipTypes + typeOf()) + + return map { it?.toString() }.tryParse(parserOptions) +} + public fun DataFrame.parse(options: ParserOptions? = null): DataFrame = parse(options) { colsAtAnyDepth().filter { !it.isColumnGroup() } @@ -335,6 +358,23 @@ public fun DataFrame.parse(options: ParserOptions? = null): DataFrame public fun DataColumn.parse(options: ParserOptions? = null): DataColumn<*> = tryParse(options).also { if (it.typeClass == String::class) error("Can't guess column type") } +/** + * Tries to parse a column of chars as strings into a column of a different type. + * Each parser in [Parsers] is run in order until a valid parser is found, + * a.k.a. that parser was able to parse all values in the column successfully. If a parser + * fails to parse any value, the next parser is tried. + * + * If all fail, the column is returned as `String`, this can never fail. + * + * Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped. 
+ * + * @param options options for parsing, like providing a locale or a custom date-time formatter + * @return a new column with parsed values + */ +@JvmName("parseChar") +public fun DataColumn.parse(options: ParserOptions? = null): DataColumn<*> = + tryParse(options) // no need to throw an exception, as Char can always be parsed as String + @JvmName("parseAnyFrameNullable") public fun DataColumn.parse(options: ParserOptions? = null): DataColumn = map { it?.parse(options) } diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt index af4e6d3b9a..40c7a6a523 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt @@ -369,7 +369,13 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? 
= n Char::class -> when (toClass) { Int::class -> convert { it.code } - else -> null + + else -> // convert char to string and then to target type + getConverter(typeOf(), to, options)?.let { stringConverter -> + convert { + stringConverter(it.toString()) + } + } } Int::class -> when (toClass) { diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index 40dcf7bc35..702b82b4f4 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -716,29 +716,24 @@ internal fun DataFrame.parseImpl(options: ParserOptions?, columns: Column when { // when a frame column is requested to be parsed, // parse each value/frame column at any depth inside each DataFrame in the frame column - col.isFrameColumn() -> { + col.isFrameColumn() -> col.map { it.parseImpl(options) { colsAtAnyDepth().filter { !it.isColumnGroup() } } } - } // when a column group is requested to be parsed, // parse each column in the group - col.isColumnGroup() -> { + col.isColumnGroup() -> col.parseImpl(options) { all() } .asColumnGroup(col.name()) .asDataColumn() - } // Base case, parse the column if it's a `String?` column - col.isSubtypeOf() -> { + col.isSubtypeOf() -> col.cast().tryParseImpl(options) - } - else -> { - col - } + else -> col } } diff --git a/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt index 47c49736db..b8ca8cca1a 100644 --- a/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt +++ b/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt @@ -3,6 +3,7 @@ package org.jetbrains.kotlinx.dataframe.api import 
io.kotest.assertions.throwables.shouldNotThrow import io.kotest.assertions.throwables.shouldThrow import io.kotest.matchers.shouldBe +import io.kotest.matchers.shouldNotBe import kotlinx.datetime.Clock import kotlinx.datetime.Instant import kotlinx.datetime.LocalTime @@ -69,6 +70,20 @@ class ConvertTests { @Test fun `convert string to enum`() { columnOf("A", "B").convertTo() shouldBe columnOf(EnumClass.A, EnumClass.B) + + dataFrameOf(columnOf("A", "B") named "colA") + .convert("colA").to() + .getColumn("colA") shouldBe columnOf(EnumClass.A, EnumClass.B).named("colA") + } + + @Test + fun `convert char to enum`() { + // Char -> String -> Enum + columnOf('A', 'B').convertTo() shouldBe columnOf(EnumClass.A, EnumClass.B) + + dataFrameOf(columnOf('A', 'B') named "colA") + .convert("colA").to() + .getColumn("colA") shouldBe columnOf(EnumClass.A, EnumClass.B).named("colA") } @JvmInline @@ -199,6 +214,15 @@ class ConvertTests { val col = columnOf(65, 66) col.convertTo() shouldBe columnOf('A', 'B') col.convertTo().convertTo() shouldBe col + + // this means + columnOf('1', '2').convertToInt() shouldNotBe columnOf(1, 2) + columnOf('1', '2').convertToInt() shouldBe columnOf(49, 50) + + // but + columnOf('1', '2').convertToString().convertToInt() shouldBe columnOf(1, 2) + // or + columnOf('1', '2').parse() shouldBe columnOf(1, 2) } @Test diff --git a/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt index c291c62aa5..af5928fdbc 100644 --- a/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt +++ b/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt @@ -38,6 +38,22 @@ import kotlin.time.Instant as StdlibInstant import kotlinx.datetime.Instant as DeprecatedInstant class ParseTests { + + @Test + fun `parse to chars`() { + val char = columnOf('a', 'b', 'c') + char.parse() shouldBe char + 
char.tryParse() shouldBe char + char.convertToString().parse() shouldBe char + } + + @Test + fun `parse chars to int`() { + val char = columnOf('1', '2', '3') + char.parse() shouldBe columnOf(1, 2, 3) + char.tryParse() shouldBe columnOf(1, 2, 3) + } + @Test fun parseDate() { val currentLocale = Locale.getDefault() diff --git a/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt b/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index 553fac5961..5faaf255a2 100644 --- a/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -46,6 +46,12 @@ class ParserTests { DataFrame.parser.resetToDefault() } + @Test + fun `parse to Char`() { + val col by columnOf("a", "b") + col.parse().type() shouldBe typeOf() + } + @Test(expected = IllegalStateException::class) fun `parse should throw`() { val col by columnOf("a", "bc") diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt index ff7f4b6460..c7e74f3b68 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt @@ -12,7 +12,6 @@ import org.jetbrains.kotlinx.dataframe.impl.api.StringParser import org.jetbrains.kotlinx.dataframe.impl.api.parseImpl import org.jetbrains.kotlinx.dataframe.impl.api.tryParseImpl import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser -import org.jetbrains.kotlinx.dataframe.typeClass import org.jetbrains.kotlinx.dataframe.util.DEPRECATED_ACCESS_API import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS_COPY @@ -302,6 +301,23 @@ public class ParserOptions( /** @include [tryParseImpl] */ public fun DataColumn.tryParse(options: ParserOptions? 
= null): DataColumn<*> = tryParseImpl(options) +/** + * Tries to parse a column of chars into a column of a different type. + * Each parser in [Parsers] is run in order until a valid parser is found, + * a.k.a. that parser was able to parse all values in the column successfully. If a parser + * fails to parse any value, the next parser is tried. If all the others fail, the final parser + * returns strings. + * + * Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped. + * + * @param options options for parsing, like providing a locale or a custom date-time formatter + * @throws IllegalStateException if no valid parser is found (unlikely, unless the `String` parser is disabled) + * @return a new column with parsed values + */ +@JvmName("tryParseChar") +public fun DataColumn.tryParse(options: ParserOptions? = null): DataColumn<*> = + map { it?.toString() }.tryParseImpl(options) + public fun DataFrame.parse(options: ParserOptions? = null): DataFrame = parse(options) { colsAtAnyDepth().filter { !it.isColumnGroup() } @@ -323,7 +339,27 @@ public fun DataFrame.parse(options: ParserOptions? = null): DataFrame * @return a new column with parsed values */ public fun DataColumn.parse(options: ParserOptions? = null): DataColumn<*> = - tryParse(options).also { if (it.typeClass == String::class) error("Can't guess column type") } + tryParse(options).also { if (it.isSubtypeOf()) error("Can't guess column type") } + +/** + * Tries to parse a column of chars as strings into a column of a different type. + * Each parser in [Parsers] is run in order until a valid parser is found, + * a.k.a. that parser was able to parse all values in the column successfully. If a parser + * fails to parse any value, the next parser is tried. + * + * If all fail [IllegalStateException] is thrown. If you don't want this exception to be thrown, + * use [tryParse] instead. + * + * Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped. 
+ * + * @param options options for parsing, like providing a locale or a custom date-time formatter + * @return a new column with parsed values + */ +@JvmName("parseChar") +public fun DataColumn.parse(options: ParserOptions? = null): DataColumn<*> = + map { it?.toString() } + .tryParse(options) + .also { if (it.isSubtypeOf() || it.isSubtypeOf()) error("Can't guess column type") } @JvmName("parseAnyFrameNullable") public fun DataColumn.parse(options: ParserOptions? = null): DataColumn = diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index 40dcf7bc35..702b82b4f4 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -716,29 +716,24 @@ internal fun DataFrame.parseImpl(options: ParserOptions?, columns: Column when { // when a frame column is requested to be parsed, // parse each value/frame column at any depth inside each DataFrame in the frame column - col.isFrameColumn() -> { + col.isFrameColumn() -> col.map { it.parseImpl(options) { colsAtAnyDepth().filter { !it.isColumnGroup() } } } - } // when a column group is requested to be parsed, // parse each column in the group - col.isColumnGroup() -> { + col.isColumnGroup() -> col.parseImpl(options) { all() } .asColumnGroup(col.name()) .asDataColumn() - } // Base case, parse the column if it's a `String?` column - col.isSubtypeOf() -> { + col.isSubtypeOf() -> col.cast().tryParseImpl(options) - } - else -> { - col - } + else -> col } } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt index 55d4a802ea..b8ca8cca1a 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt @@ -221,6 +221,8 @@ class ConvertTests { // 
but columnOf('1', '2').convertToString().convertToInt() shouldBe columnOf(1, 2) + // or + columnOf('1', '2').parse() shouldBe columnOf(1, 2) } @Test diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt index c291c62aa5..9a05fa2d29 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt @@ -1,5 +1,6 @@ package org.jetbrains.kotlinx.dataframe.api +import io.kotest.assertions.throwables.shouldThrow import io.kotest.matchers.should import io.kotest.matchers.shouldBe import io.kotest.matchers.shouldNotBe @@ -38,6 +39,23 @@ import kotlin.time.Instant as StdlibInstant import kotlinx.datetime.Instant as DeprecatedInstant class ParseTests { + + @Test + fun `parse to chars`() { + val char = columnOf('a', 'b', 'c') + shouldThrow { char.parse() } + char.tryParse() shouldBe char + char.convertToString().parse() shouldBe char + char.convertToString().tryParse() shouldBe char + } + + @Test + fun `parse chars to int`() { + val char = columnOf('1', '2', '3') + char.parse() shouldBe columnOf(1, 2, 3) + char.tryParse() shouldBe columnOf(1, 2, 3) + } + @Test fun parseDate() { val currentLocale = Locale.getDefault() diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index 553fac5961..5faaf255a2 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -46,6 +46,12 @@ class ParserTests { DataFrame.parser.resetToDefault() } + @Test + fun `parse to Char`() { + val col by columnOf("a", "b") + col.parse().type() shouldBe typeOf() + } + @Test(expected = IllegalStateException::class) fun `parse should throw`() { val col by columnOf("a", "bc") From 3e531c26e472dff5c2727c489ab68220deed221a Mon Sep 17 
00:00:00 2001 From: Jolan Rensen Date: Mon, 1 Sep 2025 12:55:02 +0200 Subject: [PATCH 3/8] fixup! introducing parsing of Char? columns. It works the same as String parsing, failing when the returned type is Char or String --- .../kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index 702b82b4f4..2a74a7f84f 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -730,6 +730,10 @@ internal fun DataFrame.parseImpl(options: ParserOptions?, columns: Column .asColumnGroup(col.name()) .asDataColumn() + // Base case, parse the column if it's a `Char?` column + col.isSubtypeOf() -> + col.map { it?.toString() }.tryParseImpl(options) + // Base case, parse the column if it's a `String?` column col.isSubtypeOf() -> col.cast().tryParseImpl(options) From 321174943b69103bf5e849d5ef8f72bc7de97d70 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Mon, 1 Sep 2025 12:55:17 +0200 Subject: [PATCH 4/8] updating docs for `Char`s in convert and parse --- docs/StardustDocs/topics/convert.md | 8 +++++++- docs/StardustDocs/topics/parse.md | 19 ++++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/docs/StardustDocs/topics/convert.md b/docs/StardustDocs/topics/convert.md index f4d9734b0f..c69057e41e 100644 --- a/docs/StardustDocs/topics/convert.md +++ b/docs/StardustDocs/topics/convert.md @@ -57,7 +57,7 @@ df.convert { name }.asColumn { col -> `convert` supports automatic type conversions between the following types: -* `String` (uses [`parse`](parse.md) to convert from `String` to other types) +* `String`, `Char` (uses [`parse`](parse.md) to convert from `String` to other types) * `Boolean` * `Byte` * `Short` @@ -72,6 +72,12 @@ df.convert { name }.asColumn 
{ col -> * `LocalTime` (kotlinx.datetime and java.time) * `Instant` (kotlinx.datetime, kotlin.time, and java.time) +Note that converting between `Char` and `Int` is done by character code (the UTF-16 code unit, which equals the ASCII code for ASCII characters). +This means the `Char` `'1'` becomes the `Int` `49`. + +If you want to convert `Char` `'1'` to the `Int` `1`, use [parse()](parse.md) instead, or use `String` +as an intermediate type. + ```kotlin diff --git a/docs/StardustDocs/topics/parse.md b/docs/StardustDocs/topics/parse.md index c6f0df5ed5..d7cd258406 100644 --- a/docs/StardustDocs/topics/parse.md +++ b/docs/StardustDocs/topics/parse.md @@ -1,12 +1,12 @@ [//]: # (title: parse) -Returns a [`DataFrame`](DataFrame.md) in which the given `String` columns are parsed into other types. +Returns a [`DataFrame`](DataFrame.md) in which the given `String` and `Char` columns are parsed into other types. -This is a special case of the [convert](convert.md) operation. +This is a special case of the [](convert.md) operation. This parsing operation is sometimes executed implicitly, for example, when [reading from CSV](read.md) or -[type converting from `String` columns](convert.md). +[type converting from `String`/`Char` columns](convert.md). You can recognize this by the `locale` or `parserOptions` arguments in these functions. Related operations: [](updateConvert.md) @@ -20,7 +20,10 @@ df.parse() -To parse only particular columns use a [column selector](ColumnSelectors.md): +When no columns are specified, all `String` and `Char` columns are parsed, +even those inside [column groups](DataColumn.md#columngroup) and inside [frame columns](DataColumn.md#framecolumn). 
+ +To parse only particular columns, use a [column selector](ColumnSelectors.md): @@ -33,7 +36,7 @@ df.parse { age and weight } ### Parsing Order -`parse` tries to parse every `String` column into one of supported types in the following order: +`parse` tries to parse every `String`/`Char` column into one of the supported types in the following order: * `Int` * `Long` * `Instant` (`kotlin.time`) (requires `parseExperimentalInstant = true`, available from Kotlin 2.1+.) @@ -48,6 +51,12 @@ df.parse { age and weight } * `Uuid` ([`kotlin.uuid.Uuid`](https://kotlinlang.org/api/core/kotlin-stdlib/kotlin.uuid/-uuid/)) (requires `parseExperimentalUuid = true`) * `BigDecimal` * `JSON` (arrays and objects) (requires the `org.jetbrains.kotlinx:dataframe-json` dependency) +* `Char` +* `String` + +When `.parse()` is called on a single column and the output type is the same as the input type (`String`/`Char`), +i.e., the column cannot be parsed any further, an `IllegalStateException` is thrown. +To avoid this, use `col.tryParse()` instead. 
### Parser Options From ccdd066987ca3810fb38039563f1e3d35d4a4a51 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Mon, 1 Sep 2025 17:47:08 +0200 Subject: [PATCH 5/8] `convertTo<> { parser {} }` can now also defines `Char` parsing unless a char converter is passed explicitly --- .../kotlinx/dataframe/api/convertTo.kt | 6 +- .../kotlinx/dataframe/impl/api/convertTo.kt | 21 +++++++ .../kotlinx/dataframe/api/convertTo.kt | 59 +++++++++++++++++++ 3 files changed, 85 insertions(+), 1 deletion(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt index 70f2954940..08113dc91b 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt @@ -52,7 +52,7 @@ public class ConverterScope(public val fromType: KType, public val toSchema: Col * df.convertTo { * // defines how to convert Int? -> String * convert().with { it?.toString() ?: "No input given" } - * // defines how to convert String -> SomeType + * // defines how to convert String/Char -> SomeType * parser { SomeType(it) } * // fill missing column `sum` with expression `a+b` * fill { sum }.with { a + b } @@ -102,6 +102,10 @@ public fun ConvertToFill.with(expr: RowExpression) { /** * Defines how to convert `String` values into given type [C]. + * + * This method is a shortcut for `convert().with { }`. + * + * If no converter is defined for `Char` values, this converter will be used for them as well. 
*/ public inline fun ConvertSchemaDsl<*>.parser(noinline parser: (String) -> C): Unit = convert().with(parser) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt index 48c2864df8..d714083f26 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt @@ -45,8 +45,10 @@ import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import org.jetbrains.kotlinx.dataframe.size import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf import kotlin.reflect.full.withNullability import kotlin.reflect.jvm.jvmErasure +import kotlin.reflect.typeOf private val logger = KotlinLogging.logger {} @@ -144,6 +146,25 @@ internal fun AnyFrame.convertToImpl( val from = originalColumn.type() val to = targetSchema.type val converter = dsl.getConverter(from, targetSchema) + ?: run { + // Special case for Char columns: + // If there is no explicit Char converter, + // check if we have any converters for String -> target + // if so, we can convert Char -> String -> target + // this allows `parser {}` to work both for Strings and Chars :) + + if (!from.isSubtypeOf(typeOf())) return@run null + + val stringConverter = dsl.getConverter( + fromType = typeOf().withNullability(from.isMarkedNullable), + toSchema = targetSchema, + ) ?: return@run null + + Converter( + transform = { stringConverter.transform(this, (it as Char?)?.toString()) }, + skipNulls = stringConverter.skipNulls, + ) + } val convertedColumn = if (converter != null) { val nullsAllowed = to.isMarkedNullable diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt index 176ab06975..aa16906ac3 100644 --- 
a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt @@ -51,6 +51,38 @@ class ConvertToTests { df.convertTo { parser { A(it.toInt()) } } .single() .a.value shouldBe 1 + + // shortcut for: + df.convertTo { convert().with { A(it.toInt()) } } + .single() + .a.value shouldBe 1 + } + + @Test + fun `convert from char with parser`() { + val df = dataFrameOf("a")('1') + + shouldThrow { + df.convertTo() + } + + // Char -> String -> Target + df.convertTo { parser { A(it.toInt()) } } + .single() + .a.value shouldBe 1 + + // shortcut for: + df.convertTo { convert().with { A(it.toInt()) } } + .single() + .a.value shouldBe 1 + + // Char -> Target + df.convertTo { + parser { error("should not be triggered if convert() is present") } + convert().with<_, A> { error("should not be triggered if convert() is present") } + + convert().with { A(it.digitToInt()) } + }.single().a.value shouldBe 1 } @Test @@ -335,4 +367,31 @@ class ConvertToTests { DataFrame.emptyOf(), ) } + + enum class SimpleEnum { A, B } + + @DataSchema + interface SchemaWithNullableEnum { + val a: SimpleEnum? 
+ } + + @Test + fun `convert Char to Enum`() { + val df = dataFrameOf("a")('A', 'B', null) + + val converted = df.convertTo() + converted["a"].type() shouldBe typeOf() + converted shouldBe dataFrameOf("a")(SimpleEnum.A, SimpleEnum.B, null) + } + + @Test + fun `convert Char to Enum custom charParser`() { + val df = dataFrameOf("a")('a', 'b', null) + + val converted = df.convertTo { + parser { SimpleEnum.valueOf(it.uppercase()) } + } + converted["a"].type() shouldBe typeOf() + converted shouldBe dataFrameOf("a")(SimpleEnum.A, SimpleEnum.B, null) + } } From 2c4f742675ce7e19a919b34d84ae3047ba94d2f8 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Mon, 1 Sep 2025 17:48:14 +0200 Subject: [PATCH 6/8] updates convert and convertTo docs with new char parser behavior and generally a better explanation of how convertTo works --- .../kotlinx/dataframe/samples/api/Modify.kt | 15 ++++++-- docs/StardustDocs/topics/convert.md | 3 +- docs/StardustDocs/topics/convertTo.md | 37 ++++++++++++++----- 3 files changed, 41 insertions(+), 14 deletions(-) diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Modify.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Modify.kt index 1fb498d172..ba93b276a9 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Modify.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Modify.kt @@ -3,6 +3,7 @@ package org.jetbrains.kotlinx.dataframe.samples.api import io.kotest.matchers.shouldBe +import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.alsoDebug @@ -1102,11 +1103,17 @@ class Modify : TestBase() { @TransformDataFrameExpressions fun customConverters() { // SampleStart - val df = dataFrameOf("a", "b")(1, "2") + val df: AnyFrame = dataFrameOf( + "a" to columnOf(1, 2, 3), + "b" to columnOf("1", "2", "3"), + ) df.convertTo { - 
convert().with { MyType(it) } // converts `a` from Int to MyType - parser { MyType(it.toInt()) } // converts `b` from String to MyType - fill { c }.with { a.value + b.value } // computes missing column `c` + // providing the converter: Int -> MyType, so column `a` can be converted + convert().with { MyType(it) } + // providing the parser: String -> MyType, so column `b` can be converted + parser { MyType(it.toInt()) } + // providing the filler for `c`, as it's missing in `df` + fill { c }.with { a.value + b.value } } // SampleEnd } diff --git a/docs/StardustDocs/topics/convert.md b/docs/StardustDocs/topics/convert.md index c69057e41e..95ffff1aee 100644 --- a/docs/StardustDocs/topics/convert.md +++ b/docs/StardustDocs/topics/convert.md @@ -56,7 +56,7 @@ df.convert { name }.asColumn { col -> -`convert` supports automatic type conversions between the following types: +`convert {}.to<>()` supports automatic type conversions between the following types: * `String`, `Char` (uses [`parse`](parse.md) to convert from `String` to other types) * `Boolean` * `Byte` @@ -71,6 +71,7 @@ df.convert { name }.asColumn { col -> * `LocalDate` (kotlinx.datetime and java.time) * `LocalTime` (kotlinx.datetime and java.time) * `Instant` (kotlinx.datetime, kotlin.time, and java.time) +* `enum` classes (by name) Note that converting between `Char` and `Int` is done by ASCII character code. This means the `Char` `'1'` becomes the `Int` `49`. diff --git a/docs/StardustDocs/topics/convertTo.md b/docs/StardustDocs/topics/convertTo.md index afd9d92cbc..98ba77989d 100644 --- a/docs/StardustDocs/topics/convertTo.md +++ b/docs/StardustDocs/topics/convertTo.md @@ -1,18 +1,31 @@ [//]: # (title: convertTo) -[Converts](convert.md) columns in [`DataFrame`](DataFrame.md) to match a given schema [`Schema`](schema.md). +[Converts](convert.md) all columns in the [`DataFrame`](DataFrame.md) to match a given schema [`Schema`](schema.md). 
```kotlin convertTo(excessiveColumns = ExcessiveColumns.Keep) ``` -**Related operations**: [](adjustSchema.md) +**Related operations**: [](adjustSchema.md), [](convert.md) + +Conversion to match the target schema is done mostly automatically; +DataFrame knows how to convert between many types (see [](convert.md) for details and the supported types). + +However, if you have a custom type in your target schema, or the automatic conversion fails, +you can provide a custom converter, parser, or filler for it. +These have priority over the automatic ones. Customization DSL: -* `convert`—how specific column types should be converted -* `parser`—how to parse strings into custom types -* `fill`—how to fill missing columns +* `convert.with { it.toB() }` + * Provides `convertTo<>()` with the knowledge of how to convert `A` to `B` +* `parser { YourType.fromString(it) }` + * Provides `convertTo<>()` with the knowledge of how to parse strings/chars into `YourType` + * Shortcut for `convert().with { YourType.fromString(it) }` + * Chars are treated as strings unless you explicitly specify `convert().with { YourType.fromChar(it) }` +* `fill { some cols }.with { rowExpression }` + * Makes `convertTo<>()` fill missing (or existing) columns from the target schema + with values computed by the given row expression @@ -27,11 +40,17 @@ class MySchema(val a: MyType, val b: MyType, val c: Int) ```kotlin -val df = dataFrameOf("a", "b")(1, "2") +val df: AnyFrame = dataFrameOf( + "a" to columnOf(1, 2, 3), + "b" to columnOf("1", "2", "3"), +) df.convertTo { - convert().with { MyType(it) } // converts `a` from Int to MyType - parser { MyType(it.toInt()) } // converts `b` from String to MyType - fill { c }.with { a.value + b.value } // computes missing column `c` + // providing the converter: Int -> MyType, so column `a` can be converted + convert().with { MyType(it) } + // providing the parser: String -> MyType, so column `b` can be converted + parser { MyType(it.toInt()) } + // providing the 
filler for `c`, as it's missing in `df` + fill { c }.with { a.value + b.value } } ``` From a29f5c6f87d7ecdacfa7deb64c3b46568ab5ec38 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Mon, 1 Sep 2025 21:21:53 +0200 Subject: [PATCH 7/8] tiny `createConverter()` refactor so the logic path is a bit clearer --- .../kotlinx/dataframe/impl/api/convert.kt | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt index 40c7a6a523..1419635e91 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt @@ -230,6 +230,8 @@ internal fun getConverter(from: KType, to: KType, options: ParserOptions? = null internal typealias TypeConverter = (Any) -> Any? +private val TypeConverterIdentity: TypeConverter = { it } + internal fun Any.convertTo(type: KType): Any? { val clazz = javaClass.kotlin if (clazz.isSubclassOf(type.jvmErasure)) return this @@ -242,6 +244,7 @@ internal inline fun convert(crossinline converter: (T) -> Any?): TypeConvert private enum class DummyEnum +@Suppress("UNCHECKED_CAST") internal fun createConverter(from: KType, to: KType, options: ParserOptions? = null): TypeConverter? { if (from.arguments.isNotEmpty() || to.arguments.isNotEmpty()) return null if (from.isMarkedNullable) { @@ -250,25 +253,24 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? 
= n } val fromClass = from.jvmErasure val toClass = to.jvmErasure + return when { + fromClass == toClass -> TypeConverterIdentity - if (fromClass == toClass) return { it } - - if (toClass.isValue) { - val constructor = - toClass.primaryConstructor ?: error("Value type $toClass doesn't have primary constructor") - val underlyingType = constructor.parameters.single().type - val converter = getConverter(from, underlyingType) - ?: throw TypeConverterNotFoundException(from, underlyingType, null) - return convert { - val converted = converter(it) - if (converted == null && !underlyingType.isMarkedNullable) { - throw TypeConversionException(it, from, underlyingType, null) + toClass.isValue -> { + val constructor = + toClass.primaryConstructor ?: error("Value type $toClass doesn't have primary constructor") + val underlyingType = constructor.parameters.single().type + val converter = getConverter(from, underlyingType) + ?: throw TypeConverterNotFoundException(from, underlyingType, null) + return convert { + val converted = converter(it) + if (converted == null && !underlyingType.isMarkedNullable) { + throw TypeConversionException(it, from, underlyingType, null) + } + constructor.call(converted) } - constructor.call(converted) } - } - return when { fromClass == String::class -> { val parser = Parsers[to.withNullability(false)] when { From d93c956cf924521a642665dff10293213929f9e0 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 1 Oct 2025 10:57:13 +0200 Subject: [PATCH 8/8] expanded warning for Int<->Char conversion --- .../org/jetbrains/kotlinx/dataframe/api/convert.kt | 12 ++++++++---- docs/StardustDocs/topics/convert.md | 7 +++++-- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt index 4bc61e18c9..24e56f1c37 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt +++ 
b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt @@ -132,6 +132,10 @@ internal interface ConvertDocs { * * [LocalDateTime], [LocalDate], [LocalTime], * `Instant` ([kotlinx.datetime][DeprecatedInstant], [kotlin.time][StdlibInstant], and [java.time]), * * [URL], [IMG], [IFRAME]. + * + * __NOTE__: Conversion between [Int] and [Char] is done by UTF-16 [Char.code]. + * To convert [Char]->[Int] the way it is written (`'1'` to `1`), use [parse()][parse] instead, or, + * in either case, use [String] as an intermediate type. */ interface SupportedTypes @@ -362,7 +366,7 @@ public class Convert( * preserving their original names and positions within the [DataFrame]. * * The target type is provided as a reified type argument. - * For the full list of supported types, see [ConvertDocs.SupportedTypes]. + * For the full list of supported types, see [SupportedTypes][ConvertDocs.SupportedTypes]. * * For more information: {@include [DocumentationUrls.Convert]} * @@ -390,7 +394,7 @@ public class Convert( * preserving their original names and positions within the [DataFrame]. * * The target type is provided as a [KType]. - * For the full list of supported types, see [ConvertDocs.SupportedTypes]. + * For the full list of supported types, see [SupportedTypes][ConvertDocs.SupportedTypes]. * * For more information: {@include [DocumentationUrls.Convert]} * @@ -554,7 +558,7 @@ public inline fun Convert.perRowCol( * * The target type is provided as a reified type argument. * - * For the full list of supported types, see [ConvertDocs.SupportedTypes]. + * For the full list of supported types, see [SupportedTypes][ConvertDocs.SupportedTypes]. * * @param [C] The target type to convert values to. * @return A new [DataColumn] with the values converted to type [C]. @@ -564,7 +568,7 @@ public inline fun AnyCol.convertTo(): DataColumn = convertTo(type /** * Converts values in this column to the specified [type]. * - * For the full list of supported types, see [ConvertDocs.SupportedTypes].
+ * For the full list of supported types, see [SupportedTypes][ConvertDocs.SupportedTypes]. * * @param type The target type, provided as a [KType], to convert values to. * @return A new [DataColumn] with the values converted to [type]. diff --git a/docs/StardustDocs/topics/convert.md b/docs/StardustDocs/topics/convert.md index 07aa0ebdd6..dd34af90e8 100644 --- a/docs/StardustDocs/topics/convert.md +++ b/docs/StardustDocs/topics/convert.md @@ -73,8 +73,11 @@ df.convert { name }.asColumn { col -> * `Instant` (kotlinx.datetime, kotlin.time, and java.time) * `enum` classes (by name) -Note that converting between `Char` and `Int` is done by ASCII character code. -This means the `Char` `'1'` becomes the `Int` `49`. +> Note that converting between `Char` and `Int` is done by UTF-16 character code. +> This means the `Char` `'1'` becomes the `Int` `49`. +> To convert `Char -> Int` the way it is written (`'1'` to `1`), use `parse()` instead, or, +> in either case, use `String` as an intermediate type. +> {style="warning"} If you want to convert `Char` `'1'` to the `Int` `1`, use [parse()](parse.md) instead, or use `String` as intermediate type.