diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/max.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/max.kt index a265a20f9e..8d7d6b3b47 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/max.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/max.kt @@ -133,9 +133,12 @@ public fun > DataFrame.maxByOrNull(column: KProperty // endregion // region GroupBy - +@Refine +@Interpretable("GroupByMax1") public fun Grouped.max(): DataFrame = maxFor(interComparableColumns()) +@Refine +@Interpretable("GroupByMax0") public fun > Grouped.maxFor(columns: ColumnsForAggregateSelector): DataFrame = Aggregators.max.aggregateFor(this, columns) @@ -149,6 +152,8 @@ public fun > Grouped.maxFor(vararg columns: ColumnRefere public fun > Grouped.maxFor(vararg columns: KProperty): DataFrame = maxFor { columns.toColumnSet() } +@Refine +@Interpretable("GroupByMax0") public fun > Grouped.max(name: String? = null, columns: ColumnsSelector): DataFrame = Aggregators.max.aggregateAll(this, name, columns) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/mean.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/mean.kt index 994cbf27db..97dcc70087 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/mean.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/mean.kt @@ -8,6 +8,8 @@ import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.RowExpression import org.jetbrains.kotlinx.dataframe.aggregation.ColumnsForAggregateSelector import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload +import org.jetbrains.kotlinx.dataframe.annotations.Interpretable +import org.jetbrains.kotlinx.dataframe.annotations.Refine import org.jetbrains.kotlinx.dataframe.columns.ColumnReference import org.jetbrains.kotlinx.dataframe.columns.toColumnSet import org.jetbrains.kotlinx.dataframe.columns.toColumnsSetOf @@ -98,9 +100,12 @@ public inline fun 
DataFrame.meanOf( // endregion // region GroupBy - +@Refine +@Interpretable("GroupByMean1") public fun Grouped.mean(skipNA: Boolean = skipNA_default): DataFrame = meanFor(skipNA, numberColumns()) +@Refine +@Interpretable("GroupByMean0") public fun Grouped.meanFor( skipNA: Boolean = skipNA_default, columns: ColumnsForAggregateSelector, @@ -121,6 +126,8 @@ public fun Grouped.meanFor( skipNA: Boolean = skipNA_default, ): DataFrame = meanFor(skipNA) { columns.toColumnSet() } +@Refine +@Interpretable("GroupByMean0") public fun Grouped.mean( name: String? = null, skipNA: Boolean = skipNA_default, @@ -147,6 +154,8 @@ public fun Grouped.mean( skipNA: Boolean = skipNA_default, ): DataFrame = mean(name, skipNA) { columns.toColumnSet() } +@Refine +@Interpretable("GroupByMeanOf") public inline fun Grouped.meanOf( name: String? = null, skipNA: Boolean = skipNA_default, diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/median.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/median.kt index f2cdbb390e..98c4f1f206 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/median.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/median.kt @@ -8,6 +8,8 @@ import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.RowExpression import org.jetbrains.kotlinx.dataframe.aggregation.ColumnsForAggregateSelector import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload +import org.jetbrains.kotlinx.dataframe.annotations.Interpretable +import org.jetbrains.kotlinx.dataframe.annotations.Refine import org.jetbrains.kotlinx.dataframe.columns.ColumnReference import org.jetbrains.kotlinx.dataframe.columns.toColumnSet import org.jetbrains.kotlinx.dataframe.impl.aggregation.aggregators.Aggregators @@ -16,6 +18,7 @@ import org.jetbrains.kotlinx.dataframe.impl.aggregation.interComparableColumns import org.jetbrains.kotlinx.dataframe.impl.aggregation.modes.aggregateAll import 
org.jetbrains.kotlinx.dataframe.impl.aggregation.modes.aggregateFor import org.jetbrains.kotlinx.dataframe.impl.aggregation.modes.aggregateOf +import org.jetbrains.kotlinx.dataframe.impl.aggregation.modes.aggregateOfDelegated import org.jetbrains.kotlinx.dataframe.impl.aggregation.modes.of import org.jetbrains.kotlinx.dataframe.impl.columns.toComparableColumns import org.jetbrains.kotlinx.dataframe.impl.suggestIfNull @@ -103,9 +106,12 @@ public inline fun > DataFrame.medianOf( // endregion // region GroupBy - +@Refine +@Interpretable("GroupByMedian1") public fun Grouped.median(): DataFrame = medianFor(interComparableColumns()) +@Refine +@Interpretable("GroupByMedian0") public fun > Grouped.medianFor(columns: ColumnsForAggregateSelector): DataFrame = Aggregators.median.aggregateFor(this, columns) @@ -119,6 +125,8 @@ public fun > Grouped.medianFor(vararg columns: ColumnRef public fun > Grouped.medianFor(vararg columns: KProperty): DataFrame = medianFor { columns.toColumnSet() } +@Refine +@Interpretable("GroupByMedian0") public fun > Grouped.median( name: String? = null, columns: ColumnsSelector, @@ -137,10 +145,12 @@ public fun > Grouped.median( public fun > Grouped.median(vararg columns: KProperty, name: String? = null): DataFrame = median(name) { columns.toColumnSet() } +@Refine +@Interpretable("GroupByMedianOf") public inline fun > Grouped.medianOf( name: String? 
= null, crossinline expression: RowExpression, -): DataFrame = Aggregators.median.aggregateOf(this, name, expression) +): DataFrame = Aggregators.median.cast().aggregateOf(this, name, expression) // endregion @@ -227,6 +237,6 @@ public fun > PivotGroupBy.median(vararg columns: KProper public inline fun > PivotGroupBy.medianOf( crossinline expression: RowExpression, -): DataFrame = Aggregators.median.aggregateOf(this, expression) +): DataFrame = Aggregators.median.cast().aggregateOf(this, expression) // endregion diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/min.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/min.kt index d1cae852aa..0a9c79b5a1 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/min.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/min.kt @@ -133,9 +133,12 @@ public fun > DataFrame.minByOrNull(column: KProperty // endregion // region GroupBy - +@Refine +@Interpretable("GroupByMin1") public fun Grouped.min(): DataFrame = minFor(interComparableColumns()) +@Refine +@Interpretable("GroupByMin0") public fun > Grouped.minFor(columns: ColumnsForAggregateSelector): DataFrame = Aggregators.min.aggregateFor(this, columns) @@ -149,6 +152,8 @@ public fun > Grouped.minFor(vararg columns: ColumnRefere public fun > Grouped.minFor(vararg columns: KProperty): DataFrame = minFor { columns.toColumnSet() } +@Refine +@Interpretable("GroupByMin0") public fun > Grouped.min(name: String? 
= null, columns: ColumnsSelector): DataFrame = Aggregators.min.aggregateAll(this, name, columns) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/percentile.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/percentile.kt index 34c482612d..9f0f3637b6 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/percentile.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/percentile.kt @@ -177,7 +177,7 @@ public inline fun > Grouped.percentileOf( percentile: Double, name: String? = null, crossinline expression: RowExpression, -): DataFrame = Aggregators.percentile(percentile).aggregateOf(this, name, expression) +): DataFrame = Aggregators.percentile(percentile).cast().aggregateOf(this, name, expression) // endregion @@ -289,6 +289,6 @@ public fun > PivotGroupBy.percentile( public inline fun > PivotGroupBy.percentileOf( percentile: Double, crossinline expression: RowExpression, -): DataFrame = Aggregators.percentile(percentile).aggregateOf(this, expression) +): DataFrame = Aggregators.percentile(percentile).cast().aggregateOf(this, expression) // endregion diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/std.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/std.kt index 334bc398e0..163cabf4c7 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/std.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/std.kt @@ -8,6 +8,8 @@ import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.RowExpression import org.jetbrains.kotlinx.dataframe.aggregation.ColumnsForAggregateSelector import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload +import org.jetbrains.kotlinx.dataframe.annotations.Interpretable +import org.jetbrains.kotlinx.dataframe.annotations.Refine import org.jetbrains.kotlinx.dataframe.columns.ColumnReference import org.jetbrains.kotlinx.dataframe.columns.toColumnSet import 
org.jetbrains.kotlinx.dataframe.columns.toColumnsSetOf @@ -102,10 +104,13 @@ public inline fun DataFrame.stdOf( // endregion // region GroupBy - +@Refine +@Interpretable("GroupByStd1") public fun Grouped.std(skipNA: Boolean = skipNA_default, ddof: Int = ddof_default): DataFrame = stdFor(skipNA, ddof, numberColumns()) +@Refine +@Interpretable("GroupByStd0") public fun Grouped.stdFor( skipNA: Boolean = skipNA_default, ddof: Int = ddof_default, @@ -118,6 +123,7 @@ public fun Grouped.stdFor( ddof: Int = ddof_default, ): DataFrame = stdFor(skipNA, ddof) { columns.toColumnsSetOf() } +@AccessApiOverload public fun Grouped.stdFor( vararg columns: ColumnReference, skipNA: Boolean = skipNA_default, @@ -131,6 +137,8 @@ public fun Grouped.stdFor( ddof: Int = ddof_default, ): DataFrame = stdFor(skipNA, ddof) { columns.toColumnSet() } +@Refine +@Interpretable("GroupByStd0") public fun Grouped.std( name: String? = null, skipNA: Boolean = skipNA_default, @@ -138,6 +146,7 @@ public fun Grouped.std( columns: ColumnsSelector, ): DataFrame = Aggregators.std(skipNA, ddof).aggregateAll(this, name, columns) +@AccessApiOverload public fun Grouped.std( vararg columns: ColumnReference, name: String? = null, @@ -160,6 +169,8 @@ public fun Grouped.std( ddof: Int = ddof_default, ): DataFrame = std(name, skipNA, ddof) { columns.toColumnSet() } +@Refine +@Interpretable("GroupByStdOf") public inline fun Grouped.stdOf( name: String? 
= null, skipNA: Boolean = skipNA_default, diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/sum.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/sum.kt index af9bea3657..3574c0e5fa 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/sum.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/sum.kt @@ -8,6 +8,8 @@ import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.RowExpression import org.jetbrains.kotlinx.dataframe.aggregation.ColumnsForAggregateSelector import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload +import org.jetbrains.kotlinx.dataframe.annotations.Interpretable +import org.jetbrains.kotlinx.dataframe.annotations.Refine import org.jetbrains.kotlinx.dataframe.columns.ColumnReference import org.jetbrains.kotlinx.dataframe.columns.toColumnSet import org.jetbrains.kotlinx.dataframe.columns.toColumnsSetOf @@ -89,9 +91,12 @@ public inline fun DataFrame.sumOf(crossinline expres // endregion // region GroupBy - +@Refine +@Interpretable("GroupBySum1") public fun Grouped.sum(): DataFrame = sumFor(numberColumns()) +@Refine +@Interpretable("GroupBySum0") public fun Grouped.sumFor(columns: ColumnsForAggregateSelector): DataFrame = Aggregators.sum.aggregateFor(this, columns) @@ -105,6 +110,8 @@ public fun Grouped.sumFor(vararg columns: ColumnReference public fun Grouped.sumFor(vararg columns: KProperty): DataFrame = sumFor { columns.toColumnSet() } +@Refine +@Interpretable("GroupBySum0") public fun Grouped.sum(name: String? = null, columns: ColumnsSelector): DataFrame = Aggregators.sum.aggregateAll(this, name, columns) @@ -119,6 +126,8 @@ public fun Grouped.sum(vararg columns: ColumnReference, n public fun Grouped.sum(vararg columns: KProperty, name: String? = null): DataFrame = sum(name) { columns.toColumnSet() } +@Refine +@Interpretable("GroupBySumOf") public inline fun Grouped.sumOf( resultName: String? 
= null, crossinline expression: RowExpression, diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/statistics.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/statistics.kt new file mode 100644 index 0000000000..006b8048b2 --- /dev/null +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/statistics.kt @@ -0,0 +1,445 @@ +package org.jetbrains.kotlinx.dataframe.api + +import io.kotest.matchers.shouldBe +import org.junit.Test + +@Suppress("ktlint:standard:argument-list-wrapping") +class StatisticsTests { + private val personsDf = dataFrameOf("name", "age", "city", "weight", "height", "yearsToRetirement")( + "Alice", 15, "London", 99.5, "1.85", 50, + "Bob", 20, "Paris", 140.0, "1.35", 45, + "Charlie", 100, "Dubai", 75.0, "1.95", 0, + "Rose", 1, "Moscow", 45.33, "0.79", 64, + "Dylan", 35, "London", 23.4, "1.83", 30, + "Eve", 40, "Paris", 56.72, "1.85", 25, + "Frank", 55, "Dubai", 78.9, "1.35", 10, + "Grace", 29, "Moscow", 67.8, "1.65", 36, + "Hank", 60, "Paris", 80.22, "1.75", 5, + "Isla", 22, "London", 75.1, "1.85", 43, + ) + + @Test + fun `sum on GroupBy`() { + // scenario #0: all numerical columns + val res0 = personsDf.groupBy("city").sum() + res0.columnNames() shouldBe listOf("city", "age", "weight", "yearsToRetirement") + + val sum01 = res0["age"][0] as Int + sum01 shouldBe 72 + val sum02 = res0["weight"][0] as Double + sum02 shouldBe 198.0 + + // scenario #1: particular column + val res1 = personsDf.groupBy("city").sumFor("age") + res1.columnNames() shouldBe listOf("city", "age") + + val sum11 = res1["age"][0] as Int + sum11 shouldBe 72 + + // scenario #1.1: particular column via sum + val res11 = personsDf.groupBy("city").sum("age") + res11.columnNames() shouldBe listOf("city", "age") + + val sum111 = res11["age"][0] as Int + sum111 shouldBe 72 + + // scenario #2: particular column with new name - schema changes + val res2 = personsDf.groupBy("city").sum("age", name = "newAge") + res2.columnNames() shouldBe 
listOf("city", "newAge") + + val sum21 = res2["newAge"][0] as Int + sum21 shouldBe 72 + + // scenario #2.1: particular column with new name - schema changes but via columnSelector + val res21 = personsDf.groupBy("city").sum(name = "newAge") { "age"() } + res21.columnNames() shouldBe listOf("city", "newAge") + + val sum211 = res21["newAge"][0] as Int + sum211 shouldBe 72 + + // scenario #2.2: two columns with new name - schema changes but via columnSelector + val res22 = personsDf.groupBy("city").sum(name = "newAge") { "age"() and "yearsToRetirement"() } + res22.columnNames() shouldBe listOf("city", "newAge") + + val sum221 = res22["newAge"][0] as Int + sum221 shouldBe 195 + + // scenario #3: create new column via expression + val res3 = personsDf.groupBy("city").sumOf(resultName = "newAge") { "age"() * 10 } + res3.columnNames() shouldBe listOf("city", "newAge") + + val sum31 = res3["newAge"][0] as Int + sum31 shouldBe 720 + + // scenario #3.1: create new column via expression with Double type + val res31 = personsDf.groupBy("city").sumOf(resultName = "newAge") { "weight"() * 10 } + res31.columnNames() shouldBe listOf("city", "newAge") + + val sum311 = res31["newAge"][0] as Double + sum311 shouldBe 1980.0 + } + + @Test + fun `mean on GroupBy`() { + // scenario #0: all numerical columns + val res0 = personsDf.groupBy("city").mean() + res0.columnNames() shouldBe listOf("city", "age", "weight", "yearsToRetirement") + + val mean01 = res0["age"][0] as Double + mean01 shouldBe 24.0 + val mean02 = res0["weight"][0] as Double + mean02 shouldBe 66.0 + + // scenario #1: particular column + val res1 = personsDf.groupBy("city").meanFor("age") + res1.columnNames() shouldBe listOf("city", "age") + + val mean11 = res1["age"][0] as Double + mean11 shouldBe 24.0 + + // scenario #1.1: particular column via mean + val res11 = personsDf.groupBy("city").mean("age") + res11.columnNames() shouldBe listOf("city", "age") + + val mean111 = res11["age"][0] as Double + mean111 shouldBe 24.0 + 
+ // scenario #2: particular column with new name - schema changes + val res2 = personsDf.groupBy("city").mean("age", name = "newAge") + res2.columnNames() shouldBe listOf("city", "newAge") + + val mean21 = res2["newAge"][0] as Double + mean21 shouldBe 24.0 + + // scenario #2.1: particular column with new name - schema changes but via columnSelector + val res21 = personsDf.groupBy("city").mean(name = "newAge") { "age"() } + res21.columnNames() shouldBe listOf("city", "newAge") + + val mean211 = res21["newAge"][0] as Double + mean211 shouldBe 24.0 + + // scenario #2.2: two columns with new name - schema changes but via columnSelector + val res22 = personsDf.groupBy("city").mean(name = "newAge") { "age"() and "yearsToRetirement"() } + res22.columnNames() shouldBe listOf("city", "newAge") + + val mean221 = res22["newAge"][0] as Double + mean221 shouldBe 32.5 + + // scenario #3: create new column via expression + val res3 = personsDf.groupBy("city").meanOf(name = "newAge") { "age"() * 10 } + res3.columnNames() shouldBe listOf("city", "newAge") + + val mean31 = res3["newAge"][0] as Double + mean31 shouldBe 240.0 + + // scenario #3.1: create new column via expression with Double + val res31 = personsDf.groupBy("city").meanOf(name = "newAge") { "weight"() * 10 } + res31.columnNames() shouldBe listOf("city", "newAge") + + val mean311 = res31["newAge"][0] as Double + mean311 shouldBe 660.0 + } + + @Test + fun `median on GroupBy`() { + // scenario #0: all inter-comparable columns (median() delegates to interComparableColumns()) + val res0 = personsDf.groupBy("city").median() + res0.columnNames() shouldBe listOf( + "city", + "name", + "age", + "height", + "yearsToRetirement" + ) // TODO: why double values from weight are not in the list? are they not Comparable? 
+ + val median01 = res0["age"][0] as Int + median01 shouldBe 22 + //val median02 = res0["weight"][0] as Double + //median02 shouldBe 66.0 + + // scenario #1: particular column + val res1 = personsDf.groupBy("city").medianFor("age") + res1.columnNames() shouldBe listOf("city", "age") + + val median11 = res1["age"][0] as Int + median11 shouldBe 22 + + // scenario #1.1: particular column via median + val res11 = personsDf.groupBy("city").median("age") + res11.columnNames() shouldBe listOf("city", "age") + + val median111 = res11["age"][0] as Int + median111 shouldBe 22 + + // scenario #2: particular column with new name - schema changes + val res2 = personsDf.groupBy("city").median("age", name = "newAge") + res2.columnNames() shouldBe listOf("city", "newAge") + + val median21 = res2["newAge"][0] as Int + median21 shouldBe 22 + + // scenario #2.1: particular column with new name - schema changes but via columnSelector + val res21 = personsDf.groupBy("city").median(name = "newAge") { "age"() } + res21.columnNames() shouldBe listOf("city", "newAge") + + val median211 = res21["newAge"][0] as Int + median211 shouldBe 22 + + // scenario #2.2: two columns with new name - schema changes but via columnSelector + val res22 = personsDf.groupBy("city").median(name = "newAge") { "age"() and "yearsToRetirement"() } + res22.columnNames() shouldBe listOf("city", "newAge") + + val median221 = res22["newAge"][0] as Int + median221 shouldBe 32 + + // scenario #3: create new column via expression + val res3 = personsDf.groupBy("city").medianOf(name = "newAge") { "age"() * 10 } + res3.columnNames() shouldBe listOf("city", "newAge") + + val median31 = res3["newAge"][0] as Int + median31 shouldBe 220 + + // scenario #3.1: create new column via expression with Double + val res31 = personsDf.groupBy("city").medianOf(name = "newAge") { "weight"() * 10 } + res31.columnNames() shouldBe listOf("city", "newAge") + + val median311 = res31["newAge"][0] as Double + median311 shouldBe 751.0 + } + + 
@Test + fun `std on GroupBy`() { + // scenario #0: all numerical columns + val res0 = personsDf.groupBy("city").std() + res0.columnNames() shouldBe listOf("city", "age", "weight", "yearsToRetirement") + + val std01 = res0["age"][0] as Double + std01 shouldBe 10.14889156509222 + val std02 = res0["weight"][0] as Double + std02 shouldBe 38.85756039691633 + + // scenario #1: particular column + val res1 = personsDf.groupBy("city").stdFor("age") + res1.columnNames() shouldBe listOf("city", "age") + + val std11 = res1["age"][0] as Double + std11 shouldBe 10.14889156509222 + + // scenario #1.1: particular column via std + val res11 = personsDf.groupBy("city").std("age") + res11.columnNames() shouldBe listOf("city", "age") + + val std111 = res11["age"][0] as Double + std111 shouldBe 10.14889156509222 + + // scenario #2: particular column with new name - schema changes + val res2 = personsDf.groupBy("city").std("age", name = "newAge") + res2.columnNames() shouldBe listOf("city", "newAge") + + val std21 = res2["newAge"][0] as Double + std21 shouldBe 10.14889156509222 + + // scenario #2.1: particular column with new name - schema changes but via columnSelector + val res21 = personsDf.groupBy("city").std(name = "newAge") { "age"() } + res21.columnNames() shouldBe listOf("city", "newAge") + + val std211 = res21["newAge"][0] as Double + std211 shouldBe 10.14889156509222 + + // scenario #2.2: two columns with new name - schema changes but via columnSelector + val res22 = personsDf.groupBy("city").std(name = "newAge") { "age"() and "yearsToRetirement"() } + res22.columnNames() shouldBe listOf("city", "newAge") + + val std221 = res22["newAge"][0] as Double + std221 shouldBe 13.003845585056753 + + // scenario #3: create new column via expression + val res3 = personsDf.groupBy("city").stdOf(name = "newAge") { "age"() * 10 } + res3.columnNames() shouldBe listOf("city", "newAge") + + val std31 = res3["newAge"][0] as Double + std31 shouldBe 101.4889156509222 + + // scenario #3.1: create 
new column via expression with Double + val res31 = personsDf.groupBy("city").stdOf(name = "newAge") { "weight"() * 10 } + res31.columnNames() shouldBe listOf("city", "newAge") + + val std311 = res31["newAge"][0] as Double + std311 shouldBe 388.57560396916324 + } + + @Test + fun `min on GroupBy`() { + // scenario #0: all inter-comparable columns (min() delegates to interComparableColumns()) + val res0 = personsDf.groupBy("city").min() + res0.columnNames() shouldBe listOf( + "city", + "name", + "age", + "height", + "yearsToRetirement" + ) // TODO: why it's working for height and doesn't work for Double column weight + + val min01 = res0["age"][0] as Int + min01 shouldBe 15 + //val min02 = res0["weight"][0] as Double + //min02 shouldBe 38.85756039691633 + + // scenario #1: particular column + val res1 = personsDf.groupBy("city").minFor("age") + res1.columnNames() shouldBe listOf("city", "age") + + val min11 = res1["age"][0] as Int + min11 shouldBe 15 + + // scenario #1.1: particular column via min + val res11 = personsDf.groupBy("city").min("age") + res11.columnNames() shouldBe listOf("city", "age") + + val min111 = res11["age"][0] as Int + min111 shouldBe 15 + + // scenario #2: particular column with new name - schema changes + val res2 = personsDf.groupBy("city").min("age", name = "newAge") + res2.columnNames() shouldBe listOf("city", "newAge") + + val min21 = res2["newAge"][0] as Int + min21 shouldBe 15 + + // scenario #2.1: particular column with new name - schema changes but via columnSelector + val res21 = personsDf.groupBy("city").min(name = "newAge") { "age"() } + res21.columnNames() shouldBe listOf("city", "newAge") + + val min211 = res21["newAge"][0] as Int + min211 shouldBe 15 + + // scenario #2.2: two columns with new name - schema changes but via columnSelector + val res22 = personsDf.groupBy("city").min(name = "newAge") { "age"() and "yearsToRetirement"() } + res22.columnNames() shouldBe listOf("city", "newAge") + + val min221 = res22["newAge"][0] as Int + min221 shouldBe 15 + + // scenario #3: create new 
column via expression + val res3 = personsDf.groupBy("city").minOf(name = "newAge") { "age"() * 10 } + res3.columnNames() shouldBe listOf("city", "newAge") + + val min31 = res3["newAge"][0] as Int + min31 shouldBe 150 + + // scenario #3.1: create new column via expression with Double + val res31 = personsDf.groupBy("city").minOf(name = "newAge") { "weight"() * 10 } + res31.columnNames() shouldBe listOf("city", "newAge") + + val min311 = res31["newAge"][0] as Double + min311 shouldBe 234.0 + + // scenario #4: particular column via minBy + val res4 = personsDf.groupBy("city").minBy("age").values() + res4.columnNames() shouldBe listOf( + "city", + "name", + "age", + "weight", + "height", + "yearsToRetirement" + ) // TODO: why is weight present here? looks like an inconsistency + + val min41 = res4["age"][0] as Int + min41 shouldBe 15 + val min42 = res4["weight"][0] as Double + min42 shouldBe 99.5 + + // scenario #5: particular column via minBy and rowExpression + val res5 = personsDf.groupBy("city").minBy { "age"() * 10 }.values() + res5.columnNames() shouldBe listOf("city", "name", "age", "weight", "height", "yearsToRetirement") + + val min51 = res5["age"][0] as Int + min51 shouldBe 15 + } + + @Test + fun `max on GroupBy`() { + // scenario #0: all inter-comparable columns (max() delegates to interComparableColumns()) + val res0 = personsDf.groupBy("city").max() + res0.columnNames() shouldBe listOf("city", "name", "age", "height", "yearsToRetirement") // TODO: DOUBLE weight? 
+ + val max01 = res0["age"][0] as Int + max01 shouldBe 35 + //val max02 = res0["weight"][0] as Double + //max02 shouldBe 140.0 + + // scenario #1: particular column + val res1 = personsDf.groupBy("city").maxFor("age") + res1.columnNames() shouldBe listOf("city", "age") + + val max11 = res1["age"][0] as Int + max11 shouldBe 35 + + // scenario #1.1: particular column via max + val res11 = personsDf.groupBy("city").max("age") + res11.columnNames() shouldBe listOf("city", "age") + + val max111 = res11["age"][0] as Int + max111 shouldBe 35 + + // scenario #2: particular column with new name - schema changes + val res2 = personsDf.groupBy("city").max("age", name = "newAge") + res2.columnNames() shouldBe listOf("city", "newAge") + + val max21 = res2["newAge"][0] as Int + max21 shouldBe 35 + + // scenario #2.1: particular column with new name - schema changes but via columnSelector + val res21 = personsDf.groupBy("city").max(name = "newAge") { "age"() } + res21.columnNames() shouldBe listOf("city", "newAge") + + val max211 = res21["newAge"][0] as Int + max211 shouldBe 35 + + // scenario #2.2: two columns with new name - schema changes but via columnSelector + val res22 = personsDf.groupBy("city").max(name = "newAge") { "age"() and "yearsToRetirement"() } + res22.columnNames() shouldBe listOf("city", "newAge") + + val max221 = res22["newAge"][0] as Int + max221 shouldBe 50 + + // scenario #3: create new column via expression + val res3 = personsDf.groupBy("city").maxOf(name = "newAge") { "age"() * 10 } + res3.columnNames() shouldBe listOf("city", "newAge") + + val max31 = res3["newAge"][0] as Int + max31 shouldBe 350 + + // scenario #3.1: create new column via expression with Double + val res31 = personsDf.groupBy("city").maxOf(name = "newAge") { "weight"() * 10 } + res31.columnNames() shouldBe listOf("city", "newAge") + + val max311 = res31["newAge"][0] as Double + max311 shouldBe 995.0 + + // scenario #4: particular column via maxBy + val res4 = 
personsDf.groupBy("city").maxBy("age").values() + res4.columnNames() shouldBe listOf( + "city", + "name", + "age", + "weight", + "height", + "yearsToRetirement" + ) // TODO: weight is here? + + val max41 = res4["age"][0] as Int + max41 shouldBe 35 + val max42 = res4["weight"][0] as Double + max42 shouldBe 23.4 + + // scenario #5: particular column via maxBy and rowExpression + val res5 = personsDf.groupBy("city").maxBy { "age"() * 10 }.values() + res5.columnNames() shouldBe listOf("city", "name", "age", "weight", "height", "yearsToRetirement") + + val max51 = res5["age"][0] as Int + max51 shouldBe 35 + } +} + diff --git a/plugins/kotlin-dataframe/src/org/jetbrains/kotlinx/dataframe/plugin/impl/api/groupBy.kt b/plugins/kotlin-dataframe/src/org/jetbrains/kotlinx/dataframe/plugin/impl/api/groupBy.kt index ca91d79745..c1251439a9 100644 --- a/plugins/kotlin-dataframe/src/org/jetbrains/kotlinx/dataframe/plugin/impl/api/groupBy.kt +++ b/plugins/kotlin-dataframe/src/org/jetbrains/kotlinx/dataframe/plugin/impl/api/groupBy.kt @@ -1,11 +1,19 @@ package org.jetbrains.kotlinx.dataframe.plugin.impl.api +import org.jetbrains.kotlin.fir.FirSession import org.jetbrains.kotlin.fir.expressions.FirAnonymousFunctionExpression import org.jetbrains.kotlin.fir.expressions.FirExpression import org.jetbrains.kotlin.fir.expressions.FirFunctionCall import org.jetbrains.kotlin.fir.expressions.FirReturnExpression import org.jetbrains.kotlin.fir.types.ConeKotlinType +import org.jetbrains.kotlin.fir.types.ConeNullability +import org.jetbrains.kotlin.fir.types.constructClassLikeType +import org.jetbrains.kotlin.fir.types.isNullable +import org.jetbrains.kotlin.fir.types.isSubtypeOf import org.jetbrains.kotlin.fir.types.resolvedType +import org.jetbrains.kotlin.fir.types.typeContext +import org.jetbrains.kotlin.fir.types.withNullability +import org.jetbrains.kotlin.name.StandardClassIds import org.jetbrains.kotlinx.dataframe.plugin.InterpretationErrorReporter import 
org.jetbrains.kotlinx.dataframe.plugin.extensions.KotlinTypeFacade import org.jetbrains.kotlinx.dataframe.plugin.impl.AbstractInterpreter @@ -16,6 +24,7 @@ import org.jetbrains.kotlinx.dataframe.plugin.impl.PluginDataFrameSchema import org.jetbrains.kotlinx.dataframe.plugin.impl.Present import org.jetbrains.kotlinx.dataframe.plugin.impl.SimpleCol import org.jetbrains.kotlinx.dataframe.plugin.impl.SimpleColumnGroup +import org.jetbrains.kotlinx.dataframe.plugin.impl.SimpleDataColumn import org.jetbrains.kotlinx.dataframe.plugin.impl.SimpleFrameColumn import org.jetbrains.kotlinx.dataframe.plugin.impl.add import org.jetbrains.kotlinx.dataframe.plugin.impl.data.ColumnWithPathApproximation @@ -105,7 +114,10 @@ fun KotlinTypeFacade.aggregate( } } -fun KotlinTypeFacade.createPluginDataFrameSchema(keys: List, moveToTop: Boolean): PluginDataFrameSchema { +fun KotlinTypeFacade.createPluginDataFrameSchema( + keys: List, + moveToTop: Boolean +): PluginDataFrameSchema { fun addToHierarchy( path: List, column: SimpleCol, @@ -187,7 +199,8 @@ class GroupByAdd : AbstractInterpreter() { } } -abstract class GroupByAggregator(val defaultName: String) : AbstractSchemaModificationInterpreter() { +/** Produces type of aggregated column based on the expression type. */ +abstract class GroupByAggregatorOf(val defaultName: String) : AbstractSchemaModificationInterpreter() { val Arguments.receiver by groupBy() val Arguments.name: String? by arg(defaultValue = Present(null)) val Arguments.expression by type() @@ -198,5 +211,203 @@ abstract class GroupByAggregator(val defaultName: String) : AbstractSchemaModifi } } -class GroupByMaxOf : GroupByAggregator(defaultName = "max") -class GroupByMinOf : GroupByAggregator(defaultName = "min") +/** Implementation for `maxOf`. */ +class GroupByMaxOf : GroupByAggregatorOf(defaultName = "max") + +/** Implementation for `minOf`. */ +class GroupByMinOf : GroupByAggregatorOf(defaultName = "min") + +/** Implementation for `medianOf`. 
*/ +class GroupByMedianOf : GroupByAggregatorOf(defaultName = "median") + +/** Returns Double type as the type of the aggregated column. */ +abstract class GroupByAggregatorMeanOf(val defaultName: String) : AbstractSchemaModificationInterpreter() { + val Arguments.receiver by groupBy() + val Arguments.name: String? by arg(defaultValue = Present(null)) + val Arguments.expression by type() + + override fun Arguments.interpret(): PluginDataFrameSchema { + val aggregated = makeNullable(simpleColumnOf(name ?: defaultName, session.builtinTypes.doubleType.type)) + return PluginDataFrameSchema(receiver.keys.columns() + aggregated) + } +} + +/** Implementation for `meanOf`. */ +class GroupByMeanOf : GroupByAggregatorMeanOf(defaultName = "mean") + +/** Implementation for `stdOf`. */ +class GroupByStdOf : GroupByAggregatorMeanOf(defaultName = "std") + +/** + * Provides a base implementation for a custom schema modification interpreter + * that groups data by specified criteria and produces aggregated results. + * + * The class uses a `defaultName` to define a fallback name for the result column + * if no specific name is provided. It leverages `Arguments` properties to define + * and resolve the group-by receiver, result name, and expression type. + * + * Key Components: + * - [receiver] Represents the input data that will be grouped. + * - [resultName] Optional name for the resulting aggregated column. Defaults to `defaultName`. + * - [expression] Defines the type of the expression for aggregation. + */ +abstract class GroupByAggregatorSumOf(val defaultName: String) : AbstractSchemaModificationInterpreter() { + val Arguments.receiver by groupBy() + val Arguments.resultName: String? 
by arg(defaultValue = Present(null)) + val Arguments.expression by type() + + override fun Arguments.interpret(): PluginDataFrameSchema { + val aggregated = makeNullable(simpleColumnOf(resultName ?: defaultName, expression.type)) + return PluginDataFrameSchema(receiver.keys.columns() + aggregated) + } +} + +/** Implementation for `sumOf`. */ +class GroupBySumOf : GroupByAggregatorSumOf(defaultName = "sum") + +/** + * Provides a base implementation for a custom schema modification interpreter + * that derives the schema of aggregated results for a group-by receiver. + * + * The class accepts a `defaultName` as a fallback name for the result column. + * It leverages `Arguments` properties to define + * and resolve the group-by receiver, result name, and columns to aggregate. + * + * Key Components: + * - [receiver] Represents the grouped input data to aggregate. + * - [name] Optional name for the resulting aggregated column. When `null`, the selected columns keep their own names. + * - [columns] ColumnsResolver to define which columns to aggregate. + */ +abstract class GroupByAggregator0(val defaultName: String) : AbstractSchemaModificationInterpreter() { + val Arguments.receiver by groupBy() + val Arguments.name: String? by arg(defaultValue = Present(null)) + val Arguments.columns: ColumnsResolver?
by arg() + + override fun Arguments.interpret(): PluginDataFrameSchema { + if (name == null) { + val resolvedColumns = columns?.resolve(receiver.keys)?.map { it.column }!!.toList() + return PluginDataFrameSchema(receiver.keys.columns() + resolvedColumns) + } else { + val resolvedColumns = columns?.resolve(receiver.keys)?.map { it.column }!!.toList() + // TODO: handle multiple columns https://github.com/Kotlin/dataframe/issues/1090 + val aggregated = + makeNullable(simpleColumnOf(name ?: defaultName, (resolvedColumns[0] as SimpleDataColumn).type.type)) + return PluginDataFrameSchema(receiver.keys.columns() + aggregated) + } + } +} + +/** Implementation for `sum`. */ +class GroupBySum0 : GroupByAggregator0(defaultName = "sum") + +/** Implementation for `median`. */ +class GroupByMedian0 : GroupByAggregator0(defaultName = "median") + +/** Implementation for `min`. */ +class GroupByMin0 : GroupByAggregator0(defaultName = "min") + +/** Implementation for `max`. */ +class GroupByMax0 : GroupByAggregator0(defaultName = "max") + +/** Returns Double type as the type of the aggregated column. */ +abstract class GroupByAggregatorMean0(val defaultName: String) : AbstractSchemaModificationInterpreter() { + val Arguments.receiver by groupBy() + val Arguments.name: String? by arg(defaultValue = Present(null)) + val Arguments.columns: ColumnsResolver? by arg() + + override fun Arguments.interpret(): PluginDataFrameSchema { + if (name == null) { + val resolvedColumns = columns + ?.resolve(receiver.keys) + ?.map { col -> + simpleColumnOf(col.column.name, session.builtinTypes.doubleType.type) + } + ?.toList() + ?: emptyList() + + return PluginDataFrameSchema(receiver.keys.columns() + resolvedColumns) + } else { + val aggregated = makeNullable( + simpleColumnOf(name ?: defaultName, session.builtinTypes.doubleType.type) + ) + + return PluginDataFrameSchema(receiver.keys.columns() + aggregated) + } + } +} + +/** Implementation for `mean`.
*/ +class GroupByMean0 : GroupByAggregatorMean0(defaultName = "mean") + +/** Implementation for `std`. */ +class GroupByStd0 : GroupByAggregatorMean0(defaultName = "std") + +/** Adds to the schema only numerical columns. */ +abstract class GroupByAggregator1 : AbstractSchemaModificationInterpreter() { + val Arguments.receiver by groupBy() + + override fun Arguments.interpret(): PluginDataFrameSchema { + val resolvedColumns = receiver.groups.columns() + .filterIsInstance() + .filter { it.type.type.isSubtypeOf(session.builtinTypes.numberType.type, session) } + + return PluginDataFrameSchema(receiver.keys.columns() + resolvedColumns) + } +} + +/** Implementation for `sum`. */ +class GroupBySum1 : GroupByAggregator1() + +/** Returns a Double aggregated column for all numerical columns. */ +abstract class GroupByAggregatorMean1 : AbstractSchemaModificationInterpreter() { + val Arguments.receiver by groupBy() + + override fun Arguments.interpret(): PluginDataFrameSchema { + val resolvedColumns = receiver.groups.columns() + .filterIsInstance() + .filter { it.type.type.isSubtypeOf(session.builtinTypes.numberType.type, session) } + .map { simpleColumnOf(it.name, session.builtinTypes.doubleType.type) } + + return PluginDataFrameSchema(receiver.keys.columns() + resolvedColumns) + } +} + +/** Implementation for `mean`. */ +class GroupByMean1 : GroupByAggregatorMean1() + +/** Implementation for `std`. */ +class GroupByStd1 : GroupByAggregatorMean1() + +/** Keeps in schema only columns with intraComparable values. */ +abstract class GroupByAggregatorComparable : AbstractSchemaModificationInterpreter() { + val Arguments.receiver by groupBy() + + override fun Arguments.interpret(): PluginDataFrameSchema { + val comparableColumns = receiver.groups.columns() + .filterIsInstance() + .filter { isIntraComparable(it, session) } + + return PluginDataFrameSchema(receiver.keys.columns() + comparableColumns) + } +} + +/** Implementation for `max`. 
*/ +class GroupByMax1 : GroupByAggregatorComparable() + +/** Implementation for `min`. */ +class GroupByMin1 : GroupByAggregatorComparable() + +/** Implementation for `median`. */ +class GroupByMedian1 : GroupByAggregatorComparable() + +private fun isIntraComparable(col: SimpleDataColumn, session: FirSession): Boolean { + val comparable = StandardClassIds.Comparable.constructClassLikeType( + typeArguments = arrayOf(col.type.type.withNullability(ConeNullability.NOT_NULL, session.typeContext)), + isNullable = col.type.type.isNullable, + ) + return col.type.type.isSubtypeOf(comparable, session) +} + + + + diff --git a/plugins/kotlin-dataframe/src/org/jetbrains/kotlinx/dataframe/plugin/loadInterpreter.kt b/plugins/kotlin-dataframe/src/org/jetbrains/kotlinx/dataframe/plugin/loadInterpreter.kt index e37d65886a..c1592a1f3f 100644 --- a/plugins/kotlin-dataframe/src/org/jetbrains/kotlinx/dataframe/plugin/loadInterpreter.kt +++ b/plugins/kotlin-dataframe/src/org/jetbrains/kotlinx/dataframe/plugin/loadInterpreter.kt @@ -121,7 +121,17 @@ import org.jetbrains.kotlinx.dataframe.plugin.impl.api.FrameCols2 import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByAdd import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByCount0 import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByInto +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByMax0 +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByMax1 import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByMaxOf +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByMean0 +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByMean1 +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByMeanOf +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByMedian0 +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByMedian1 +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByMedianOf +import 
org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByMin0 +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByMin1 import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByMinOf import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByReduceExpression import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByReduceInto @@ -130,6 +140,12 @@ import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByXs import org.jetbrains.kotlinx.dataframe.plugin.impl.api.Last0 import org.jetbrains.kotlinx.dataframe.plugin.impl.api.Last1 import org.jetbrains.kotlinx.dataframe.plugin.impl.api.Last2 +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByStd0 +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByStd1 +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupByStdOf +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupBySum0 +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupBySum1 +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.GroupBySumOf import org.jetbrains.kotlinx.dataframe.plugin.impl.api.MapToFrame import org.jetbrains.kotlinx.dataframe.plugin.impl.api.Merge0 import org.jetbrains.kotlinx.dataframe.plugin.impl.api.MergeId @@ -416,11 +432,27 @@ internal inline fun String.load(): T { "Reorder" -> Reorder() "ByName" -> ByName() "GroupByCount0" -> GroupByCount0() + "GroupByMean0" -> GroupByMean0() + "GroupByMean1" -> GroupByMean1() + "GroupByMeanOf" -> GroupByMeanOf() + "GroupByMedian0" -> GroupByMedian0() + "GroupByMedian1" -> GroupByMedian1() + "GroupByMedianOf" -> GroupByMedianOf() + "GroupBySumOf" -> GroupBySumOf() + "GroupBySum0" -> GroupBySum0() + "GroupBySum1" -> GroupBySum1() "GroupByReducePredicate" -> GroupByReducePredicate() "GroupByReduceExpression" -> GroupByReduceExpression() "GroupByReduceInto" -> GroupByReduceInto() + "GroupByMax0" -> GroupByMax0() + "GroupByMax1" -> GroupByMax1() "GroupByMaxOf" -> GroupByMaxOf() + "GroupByMin0" -> GroupByMin0() + "GroupByMin1" -> 
GroupByMin1() "GroupByMinOf" -> GroupByMinOf() + "GroupByStd0" -> GroupByStd0() + "GroupByStd1" -> GroupByStd1() + "GroupByStdOf" -> GroupByStdOf() "DataFrameXs" -> DataFrameXs() "GroupByXs" -> GroupByXs() else -> error("$this") diff --git a/plugins/kotlin-dataframe/testData/box/groupBy_max.kt b/plugins/kotlin-dataframe/testData/box/groupBy_max.kt new file mode 100644 index 0000000000..1e030be551 --- /dev/null +++ b/plugins/kotlin-dataframe/testData/box/groupBy_max.kt @@ -0,0 +1,61 @@ +import org.jetbrains.kotlinx.dataframe.* +import org.jetbrains.kotlinx.dataframe.annotations.* +import org.jetbrains.kotlinx.dataframe.api.* +import org.jetbrains.kotlinx.dataframe.io.* + +fun box(): String { + // multiple columns + val personsDf = dataFrameOf("name", "age", "city", "weight", "height", "yearsToRetirement")( + "Alice", 15, "London", 99.5, "1.85", 50, + "Bob", 20, "Paris", 140.0, "1.35", 45, + "Charlie", 100, "Dubai", 75.0, "1.95", 0, + "Rose", 1, "Moscow", 45.33, "0.79", 64, + "Dylan", 35, "London", 23.4, "1.83", 30, + "Eve", 40, "Paris", 56.72, "1.85", 25, + "Frank", 55, "Dubai", 78.9, "1.35", 10, + "Grace", 29, "Moscow", 67.8, "1.65", 36, + "Hank", 60, "Paris", 80.22, "1.75", 5, + "Isla", 22, "London", 75.1, "1.85", 43, + ) + + // scenario #0: all numerical columns + val res0 = personsDf.groupBy { city }.max() + val max01: Int? = res0.age[0] + val max02: Double? = res0.weight[0] + res0.compareSchemas() + + + // scenario #1: particular column + val res1 = personsDf.groupBy { city }.maxFor { age } + val max11: Int? = res1.age[0] + res1.compareSchemas() + + // scenario #1.1: particular column via max + val res11 = personsDf.groupBy { city }.max { age } + val max111: Int? = res11.age[0] + res11.compareSchemas() + + // scenario #2: particular column with new name - schema changes + // TODO: not supported scenario + // val res2 = personsDf.groupBy { city }.max("age", name = "newAge") + // val max21: Int? 
= res2.newAge[0] + + // scenario #2.1: particular column with new name - schema changes but via columnSelector + val res21 = personsDf.groupBy { city }.max("newAge") { age } + val max211: Int? = res21.newAge[0] + res21.compareSchemas() + + // scenario #2.2: two columns with new name - schema changes but via columnSelector + // TODO: handle multiple columns https://github.com/Kotlin/dataframe/issues/1090 + val res22 = personsDf.groupBy { city }.max("newAge") { age and yearsToRetirement } + val max221: Int? = res22.newAge[0] + res22.compareSchemas() + + // scenario #3: create new column via expression + val res3 = personsDf.groupBy { city }.maxOf("newAge") { age / 10 } + val max3: Int? = res3.newAge[0] + res3.compareSchemas() + + return "OK" +} + diff --git a/plugins/kotlin-dataframe/testData/box/groupBy_mean.kt b/plugins/kotlin-dataframe/testData/box/groupBy_mean.kt new file mode 100644 index 0000000000..4f1c9fba74 --- /dev/null +++ b/plugins/kotlin-dataframe/testData/box/groupBy_mean.kt @@ -0,0 +1,59 @@ +import org.jetbrains.kotlinx.dataframe.* +import org.jetbrains.kotlinx.dataframe.annotations.* +import org.jetbrains.kotlinx.dataframe.api.* +import org.jetbrains.kotlinx.dataframe.io.* + +fun box(): String { + // multiple columns + val personsDf = dataFrameOf("name", "age", "city", "weight", "height", "yearsToRetirement")( + "Alice", 15, "London", 99.5, "1.85", 50, + "Bob", 20, "Paris", 140.0, "1.35", 45, + "Charlie", 100, "Dubai", 75.0, "1.95", 0, + "Rose", 1, "Moscow", 45.33, "0.79", 64, + "Dylan", 35, "London", 23.4, "1.83", 30, + "Eve", 40, "Paris", 56.72, "1.85", 25, + "Frank", 55, "Dubai", 78.9, "1.35", 10, + "Grace", 29, "Moscow", 67.8, "1.65", 36, + "Hank", 60, "Paris", 80.22, "1.75", 5, + "Isla", 22, "London", 75.1, "1.85", 43, + ) + + // scenario #0: all numerical columns + val res0 = personsDf.groupBy { city }.mean() + val mean01: Double? = res0.age[0] + val mean02: Double? 
= res0.weight[0] + res0.compareSchemas() + + // scenario #1: particular column + val res1 = personsDf.groupBy { city }.meanFor { age } + val mean11: Double? = res1.age[0] + res1.compareSchemas() + + // scenario #1.1: particular column via mean + val res11 = personsDf.groupBy { city }.mean { age } + val mean111: Double? = res11.age[0] + res11.compareSchemas() + + // scenario #2: particular column with new name - schema changes + // TODO: not supported scenario + // val res2 = personsDf.groupBy { city }.mean("age", name = "newAge") + // val mean21: Double? = res2.newAge[0] + + // scenario #2.1: particular column with new name - schema changes but via columnSelector + val res21 = personsDf.groupBy { city }.mean("newAge") { age } + val mean211: Double? = res21.newAge[0] + res21.compareSchemas() + + // scenario #2.2: two columns with new name - schema changes but via columnSelector + // TODO: handle multiple columns https://github.com/Kotlin/dataframe/issues/1090 + val res22 = personsDf.groupBy { city }.mean("newAge") { age and yearsToRetirement } + val mean221: Double? = res22.newAge[0] + res22.compareSchemas() + + // scenario #3: create new column via expression + val res3 = personsDf.groupBy { city }.meanOf("newAge") { age * 10 } + val mean3: Double? 
= res3.newAge[0] + res3.compareSchemas() + + return "OK" +} diff --git a/plugins/kotlin-dataframe/testData/box/groupBy_median.kt b/plugins/kotlin-dataframe/testData/box/groupBy_median.kt new file mode 100644 index 0000000000..ea1794fc08 --- /dev/null +++ b/plugins/kotlin-dataframe/testData/box/groupBy_median.kt @@ -0,0 +1,60 @@ +import org.jetbrains.kotlinx.dataframe.* +import org.jetbrains.kotlinx.dataframe.annotations.* +import org.jetbrains.kotlinx.dataframe.api.* +import org.jetbrains.kotlinx.dataframe.io.* + +fun box(): String { + // multiple columns + val personsDf = dataFrameOf("name", "age", "city", "weight", "height", "yearsToRetirement")( + "Alice", 15, "London", 99.5, "1.85", 50, + "Bob", 20, "Paris", 140.0, "1.35", 45, + "Charlie", 100, "Dubai", 75.0, "1.95", 0, + "Rose", 1, "Moscow", 45.33, "0.79", 64, + "Dylan", 35, "London", 23.4, "1.83", 30, + "Eve", 40, "Paris", 56.72, "1.85", 25, + "Frank", 55, "Dubai", 78.9, "1.35", 10, + "Grace", 29, "Moscow", 67.8, "1.65", 36, + "Hank", 60, "Paris", 80.22, "1.75", 5, + "Isla", 22, "London", 75.1, "1.85", 43, + ) + + // scenario #0: all numerical columns + val res0 = personsDf.groupBy { city }.median() + val median01: Int? = res0.age[0] + val median02: Double? = res0.weight[0] + res0.compareSchemas() + + // scenario #1: particular column + val res1 = personsDf.groupBy { city }.medianFor { age } + val median11: Int? = res1.age[0] + res1.compareSchemas() + + // scenario #1.1: particular column via median + val res11 = personsDf.groupBy { city }.median { age } + val median111: Int? = res11.age[0] + res11.compareSchemas() + + // scenario #2: particular column with new name - schema changes + // TODO: not supported scenario + // val res2 = personsDf.groupBy { city }.median("age", name = "newAge") + // val median21: Int? 
= res2.newAge[0] + + // scenario #2.1: particular column with new name - schema changes but via columnSelector + val res21 = personsDf.groupBy { city }.median("newAge") { age } + val median211: Int?= res21.newAge[0] + res21.compareSchemas() + + // scenario #2.2: two columns with new name - schema changes but via columnSelector + // TODO: handle multiple columns https://github.com/Kotlin/dataframe/issues/1090 + val res22 = personsDf.groupBy { city }.median("newAge") { age and yearsToRetirement } + val median221: Int? = res22.newAge[0] + res22.compareSchemas() + + // scenario #3: create new column via expression + val res3 = personsDf.groupBy { city }.medianOf("newAge") { age * 10 } + val median3: Int? = res3.newAge[0] + res3.compareSchemas() + + return "OK" +} + diff --git a/plugins/kotlin-dataframe/testData/box/groupBy_min.kt b/plugins/kotlin-dataframe/testData/box/groupBy_min.kt new file mode 100644 index 0000000000..622d18b33a --- /dev/null +++ b/plugins/kotlin-dataframe/testData/box/groupBy_min.kt @@ -0,0 +1,60 @@ +import org.jetbrains.kotlinx.dataframe.* +import org.jetbrains.kotlinx.dataframe.annotations.* +import org.jetbrains.kotlinx.dataframe.api.* +import org.jetbrains.kotlinx.dataframe.io.* + +fun box(): String { + // multiple columns + val personsDf = dataFrameOf("name", "age", "city", "weight", "height", "yearsToRetirement")( + "Alice", 15, "London", 99.5, "1.85", 50, + "Bob", 20, "Paris", 140.0, "1.35", 45, + "Charlie", 100, "Dubai", 75.0, "1.95", 0, + "Rose", 1, "Moscow", 45.33, "0.79", 64, + "Dylan", 35, "London", 23.4, "1.83", 30, + "Eve", 40, "Paris", 56.72, "1.85", 25, + "Frank", 55, "Dubai", 78.9, "1.35", 10, + "Grace", 29, "Moscow", 67.8, "1.65", 36, + "Hank", 60, "Paris", 80.22, "1.75", 5, + "Isla", 22, "London", 75.1, "1.85", 43, + ) + + // scenario #0: all numerical columns + val res0 = personsDf.groupBy { city }.min() + val min01: Int? = res0.age[0] + val min02: Double? 
= res0.weight[0] + res0.compareSchemas() + + // scenario #1: particular column + val res1 = personsDf.groupBy { city }.minFor { age } + val min11: Int? = res1.age[0] + res1.compareSchemas() + + // scenario #1.1: particular column via min + val res11 = personsDf.groupBy { city }.min { age } + val min111: Int? = res11.age[0] + res11.compareSchemas() + + // scenario #2: particular column with new name - schema changes + // TODO: not supported scenario + // val res2 = personsDf.groupBy { city }.min("age", name = "newAge") + // val min21: Int? = res2.newAge[0] + + // scenario #2.1: particular column with new name - schema changes but via columnSelector + val res21 = personsDf.groupBy { city }.min("newAge") { age } + val min211: Int? = res21.newAge[0] + res21.compareSchemas() + + // scenario #2.2: two columns with new name - schema changes but via columnSelector + // TODO: handle multiple columns https://github.com/Kotlin/dataframe/issues/1090 + val res22 = personsDf.groupBy { city }.min("newAge") { age and yearsToRetirement } + val min221: Int? = res22.newAge[0] + res22.compareSchemas() + + // scenario #3: create new column via expression + val res3 = personsDf.groupBy { city }.minOf("newAge") { age / 2 } + val min3: Int? 
= res3.newAge[0] + res3.compareSchemas() + + return "OK" +} + diff --git a/plugins/kotlin-dataframe/testData/box/groupBy_std.kt b/plugins/kotlin-dataframe/testData/box/groupBy_std.kt new file mode 100644 index 0000000000..0a1e471fed --- /dev/null +++ b/plugins/kotlin-dataframe/testData/box/groupBy_std.kt @@ -0,0 +1,60 @@ +import org.jetbrains.kotlinx.dataframe.* +import org.jetbrains.kotlinx.dataframe.annotations.* +import org.jetbrains.kotlinx.dataframe.api.* +import org.jetbrains.kotlinx.dataframe.io.* + +fun box(): String { + // multiple columns + val personsDf = dataFrameOf("name", "age", "city", "weight", "height", "yearsToRetirement")( + "Alice", 15, "London", 99.5, "1.85", 50, + "Bob", 20, "Paris", 140.0, "1.35", 45, + "Charlie", 100, "Dubai", 75.0, "1.95", 0, + "Rose", 1, "Moscow", 45.33, "0.79", 64, + "Dylan", 35, "London", 23.4, "1.83", 30, + "Eve", 40, "Paris", 56.72, "1.85", 25, + "Frank", 55, "Dubai", 78.9, "1.35", 10, + "Grace", 29, "Moscow", 67.8, "1.65", 36, + "Hank", 60, "Paris", 80.22, "1.75", 5, + "Isla", 22, "London", 75.1, "1.85", 43, + ) + + // scenario #0: all numerical columns + val res0 = personsDf.groupBy { city }.std() + val std01: Double? = res0.age[0] + val std02: Double? = res0.weight[0] + res0.compareSchemas() + + // scenario #1: particular column + val res1 = personsDf.groupBy { city }.stdFor { age } + val std11: Double? = res1.age[0] + res1.compareSchemas() + + // scenario #1.1: particular column via std + val res11 = personsDf.groupBy { city }.std { age } + val std111: Double? = res11.age[0] + res11.compareSchemas() + + // scenario #2: particular column with new name - schema changes + // TODO: not supported scenario + // val res2 = personsDf.groupBy { city }.std("age", name = "newAge") + // val std21: Double? = res2.newAge[0] + + // scenario #2.1: particular column with new name - schema changes but via columnSelector + val res21 = personsDf.groupBy { city }.std("newAge") { age } + val std211: Double? 
= res21.newAge[0] + res21.compareSchemas() + + // scenario #2.2: two columns with new name - schema changes but via columnSelector + // TODO: handle multiple columns https://github.com/Kotlin/dataframe/issues/1090 + val res22 = personsDf.groupBy { city }.std("newAge") { age and yearsToRetirement } + val std221: Double? = res22.newAge[0] + res22.compareSchemas() + + // scenario #3: create new column via expression + val res3 = personsDf.groupBy { city }.stdOf("newAge") { age * 10 } + val std3: Double? = res3.newAge[0] + res3.compareSchemas() + + return "OK" +} + diff --git a/plugins/kotlin-dataframe/testData/box/groupBy_sum.kt b/plugins/kotlin-dataframe/testData/box/groupBy_sum.kt new file mode 100644 index 0000000000..e4c250a47c --- /dev/null +++ b/plugins/kotlin-dataframe/testData/box/groupBy_sum.kt @@ -0,0 +1,69 @@ +import org.jetbrains.kotlinx.dataframe.* +import org.jetbrains.kotlinx.dataframe.annotations.* +import org.jetbrains.kotlinx.dataframe.api.* +import org.jetbrains.kotlinx.dataframe.io.* + +fun box(): String { + // multiple columns + val personsDf = dataFrameOf("name", "age", "city", "weight", "height", "yearsToRetirement")( + "Alice", 15, "London", 99.5, "1.85", 50, + "Bob", 20, "Paris", 140.0, "1.35", 45, + "Charlie", 100, "Dubai", 75.0, "1.95", 0, + "Rose", 1, "Moscow", 45.33, "0.79", 64, + "Dylan", 35, "London", 23.4, "1.83", 30, + "Eve", 40, "Paris", 56.72, "1.85", 25, + "Frank", 55, "Dubai", 78.9, "1.35", 10, + "Grace", 29, "Moscow", 67.8, "1.65", 36, + "Hank", 60, "Paris", 80.22, "1.75", 5, + "Isla", 22, "London", 75.1, "1.85", 43, + ) + + // scenario #0: all numerical columns + val res0 = personsDf.groupBy { city }.sum() + val sum01: Int? = res0.age[0] + val sum02: Double? = res0.weight[0] + res0.compareSchemas() + + // scenario #1: particular column + val res1 = personsDf.groupBy { city }.sumFor { age } + val sum11: Int? 
= res1.age[0] + res1.compareSchemas() + + // scenario #1.1: particular column via sum + val res11 = personsDf.groupBy { city }.sum { age } + val sum111: Int? = res11.age[0] + res11.compareSchemas() + + // scenario #2: particular column with new name - schema changes + // TODO: not supported scenario for String API + // val res2 = personsDf.groupBy { city }.sum("age", name = "newAge") + // val sum21: Int? = res2.newAge[0] + + // scenario #2.1: particular column with new name - schema changes but via columnSelector + val res21 = personsDf.groupBy { city }.sum("newAge") { age } + val sum211: Int? = res21.newAge[0] + res21.compareSchemas() + + // scenario #2.2: two columns with new name - schema changes but via columnSelector + // TODO: handle multiple columns https://github.com/Kotlin/dataframe/issues/1090 + val res22 = personsDf.groupBy { city }.sum("newAge") { age and yearsToRetirement } + val sum221: Int? = res22.newAge[0] + res22.compareSchemas() + + // scenario #3: create new column via expression + val res3 = personsDf.groupBy { city }.sumOf("newAge") { age * 10 } + val sum3: Int? = res3.newAge[0] + + // scenario #3.1: create new column via expression on Double column + /*Runtime: + city: String + newAge: Number + Compile: + city: String + newAge: Double? + val res31 = personsDf.groupBy { city }.sumOf("newAge") { weight * 10 } + val sum31: Double? 
= res31.newAge[0] + res31.compareSchemas()*/ + + return "OK" +} diff --git a/plugins/kotlin-dataframe/tests-gen/org/jetbrains/kotlin/fir/dataframe/DataFrameBlackBoxCodegenTestGenerated.java b/plugins/kotlin-dataframe/tests-gen/org/jetbrains/kotlin/fir/dataframe/DataFrameBlackBoxCodegenTestGenerated.java index 5d2eded931..804bc6c18d 100644 --- a/plugins/kotlin-dataframe/tests-gen/org/jetbrains/kotlin/fir/dataframe/DataFrameBlackBoxCodegenTestGenerated.java +++ b/plugins/kotlin-dataframe/tests-gen/org/jetbrains/kotlin/fir/dataframe/DataFrameBlackBoxCodegenTestGenerated.java @@ -6,6 +6,7 @@ import org.jetbrains.kotlin.test.util.KtTestUtil; import org.jetbrains.kotlin.test.TargetBackend; import org.jetbrains.kotlin.test.TestMetadata; +import org.junit.Ignore; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -232,6 +233,44 @@ public void testGroupBy_count() { runTest("testData/box/groupBy_count.kt"); } + @Test + @TestMetadata("groupBy_sum.kt") + public void testGroupBy_sum() { + runTest("testData/box/groupBy_sum.kt"); + } + + @Ignore + @Test + @TestMetadata("groupBy_mean.kt") + public void testGroupBy_mean() { + runTest("testData/box/groupBy_mean.kt"); + } + + @Test + @TestMetadata("groupBy_median.kt") + public void testGroupBy_median() { + runTest("testData/box/groupBy_median.kt"); + } + + @Test + @TestMetadata("groupBy_min.kt") + public void testGroupBy_min() { + runTest("testData/box/groupBy_min.kt"); + } + + @Test + @TestMetadata("groupBy_max.kt") + public void testGroupBy_max() { + runTest("testData/box/groupBy_max.kt"); + } + + @Ignore + @Test + @TestMetadata("groupBy_std.kt") + public void testGroupBy_std() { + runTest("testData/box/groupBy_std.kt"); + } + @Test @TestMetadata("groupBy_extractSchema.kt") public void testGroupBy_extractSchema() {