Skip to content

Commit 36d2253

Browse files
authored
feat: implements selectTyped functions
fixes #85
1 parent 1cfe42b commit 36d2253

File tree

4 files changed

+171
-3
lines changed
  • kotlin-spark-api
    • 2.4/src
      • main/kotlin/org/jetbrains/kotlinx/spark/api
      • test/kotlin/org/jetbrains/kotlinx/spark/api
    • 3.0/src
      • main/kotlin/org/jetbrains/kotlinx/spark/api
      • test/kotlin/org/jetbrains/kotlinx/spark/api

4 files changed

+171
-3
lines changed

kotlin-spark-api/2.4/src/main/kotlin/org/jetbrains/kotlinx/spark/api/ApiV1.kt

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -724,6 +724,49 @@ inline operator fun <reified T, reified U> Dataset<T>.invoke(column: KProperty1<
724724
*/
725725
fun <T> Dataset<T>.showDS(numRows: Int = 20, truncate: Boolean = true) = apply { show(numRows, truncate) }
726726

727+
/**
 * Returns a new Dataset by computing the given typed column expressions for each element.
 *
 * The two selected values are wrapped in a Kotlin [Pair], so the result stays fully typed
 * instead of falling back to an untyped row.
 *
 * @param c1 typed column whose value becomes the first element of the pair
 * @param c2 typed column whose value becomes the second element of the pair
 * @return a Dataset with one [Pair] of the computed values per row
 */
inline fun <reified T, reified U1, reified U2> Dataset<T>.selectTyped(
    c1: TypedColumn<T, U1>,
    c2: TypedColumn<T, U2>,
): Dataset<Pair<U1, U2>> =
    select(c1, c2).map { Pair(it._1(), it._2()) }
735+
736+
/**
 * Returns a new Dataset by computing the given typed column expressions for each element.
 *
 * The three selected values are wrapped in a Kotlin [Triple], so the result stays fully typed.
 *
 * @param c1 typed column whose value becomes the first element of the triple
 * @param c2 typed column whose value becomes the second element of the triple
 * @param c3 typed column whose value becomes the third element of the triple
 * @return a Dataset with one [Triple] of the computed values per row
 */
inline fun <reified T, reified U1, reified U2, reified U3> Dataset<T>.selectTyped(
    c1: TypedColumn<T, U1>,
    c2: TypedColumn<T, U2>,
    c3: TypedColumn<T, U3>,
): Dataset<Triple<U1, U2, U3>> =
    select(c1, c2, c3).map { Triple(it._1(), it._2(), it._3()) }
745+
746+
/**
 * Returns a new Dataset by computing the given typed column expressions for each element.
 *
 * The four selected values are wrapped in an [Arity4], so the result stays fully typed.
 *
 * @param c1 typed column whose value becomes `_1` of the result
 * @param c2 typed column whose value becomes `_2` of the result
 * @param c3 typed column whose value becomes `_3` of the result
 * @param c4 typed column whose value becomes `_4` of the result
 * @return a Dataset with one [Arity4] of the computed values per row
 */
inline fun <reified T, reified U1, reified U2, reified U3, reified U4> Dataset<T>.selectTyped(
    c1: TypedColumn<T, U1>,
    c2: TypedColumn<T, U2>,
    c3: TypedColumn<T, U3>,
    c4: TypedColumn<T, U4>,
): Dataset<Arity4<U1, U2, U3, U4>> =
    select(c1, c2, c3, c4).map { Arity4(it._1(), it._2(), it._3(), it._4()) }
756+
757+
/**
 * Returns a new Dataset by computing the given typed column expressions for each element.
 *
 * The five selected values are wrapped in an [Arity5], so the result stays fully typed.
 *
 * @param c1 typed column whose value becomes `_1` of the result
 * @param c2 typed column whose value becomes `_2` of the result
 * @param c3 typed column whose value becomes `_3` of the result
 * @param c4 typed column whose value becomes `_4` of the result
 * @param c5 typed column whose value becomes `_5` of the result
 * @return a Dataset with one [Arity5] of the computed values per row
 */
inline fun <reified T, reified U1, reified U2, reified U3, reified U4, reified U5> Dataset<T>.selectTyped(
    c1: TypedColumn<T, U1>,
    c2: TypedColumn<T, U2>,
    c3: TypedColumn<T, U3>,
    c4: TypedColumn<T, U4>,
    c5: TypedColumn<T, U5>,
): Dataset<Arity5<U1, U2, U3, U4, U5>> =
    select(c1, c2, c3, c4, c5).map { Arity5(it._1(), it._2(), it._3(), it._4(), it._5()) }
768+
769+
727770
@OptIn(ExperimentalStdlibApi::class)
728771
inline fun <reified T> schema(map: Map<String, KType> = mapOf()) = schema(typeOf<T>(), map)
729772

kotlin-spark-api/2.4/src/test/kotlin/org/jetbrains/kotlinx/spark/api/ApiTest.kt

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import org.apache.spark.sql.streaming.GroupState
2626
import org.apache.spark.sql.streaming.GroupStateTimeout
2727
import scala.collection.Seq
2828
import org.apache.spark.sql.Dataset
29+
import org.apache.spark.sql.TypedColumn
2930
import org.apache.spark.sql.functions.*
3031
import scala.Product
3132
import scala.Tuple1
@@ -35,6 +36,7 @@ import java.io.Serializable
3536
import java.sql.Date
3637
import java.sql.Timestamp
3738
import java.time.LocalDate
39+
import kotlin.reflect.KProperty1
3840
import scala.collection.Iterator as ScalaIterator
3941
import scala.collection.Map as ScalaMap
4042
import scala.collection.mutable.Map as ScalaMutableMap
@@ -326,6 +328,46 @@ class ApiTest : ShouldSpec({
326328
val asList = dataset.takeAsList(2)
327329
asList.first().tuple shouldBe Tuple3(5L, "test", Tuple1(""))
328330
}
331+
@Suppress("UNCHECKED_CAST")
332+
should("support dataset select") {
333+
val dataset = dsOf(
334+
SomeClass(intArrayOf(1, 2, 3), 3),
335+
SomeClass(intArrayOf(1, 2, 4), 5),
336+
)
337+
338+
val typedColumnA: TypedColumn<Any, IntArray> = dataset.col("a").`as`(encoder())
339+
340+
val newDS2 = dataset.selectTyped(
341+
// col(SomeClass::a), // NOTE: this doesn't work on 2.4, returning a data class with an array in it
342+
col(SomeClass::b),
343+
col(SomeClass::b),
344+
)
345+
newDS2.show()
346+
347+
val newDS3 = dataset.selectTyped(
348+
col(SomeClass::b),
349+
col(SomeClass::b),
350+
col(SomeClass::b),
351+
)
352+
newDS3.show()
353+
354+
val newDS4 = dataset.selectTyped(
355+
col(SomeClass::b),
356+
col(SomeClass::b),
357+
col(SomeClass::b),
358+
col(SomeClass::b),
359+
)
360+
newDS4.show()
361+
362+
val newDS5 = dataset.selectTyped(
363+
col(SomeClass::b),
364+
col(SomeClass::b),
365+
col(SomeClass::b),
366+
col(SomeClass::b),
367+
col(SomeClass::b),
368+
)
369+
newDS5.show()
370+
}
329371
should("Access columns using invoke on datasets") {
330372
val dataset = dsOf(
331373
SomeClass(intArrayOf(1, 2, 3), 4),
@@ -399,6 +441,7 @@ class ApiTest : ShouldSpec({
399441
}
400442
})
401443

444+
402445
data class DataClassWithTuple<T : Product>(val tuple: T)
403446

404447

kotlin-spark-api/3.0/src/main/kotlin/org/jetbrains/kotlinx/spark/api/ApiV1.kt

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -720,6 +720,49 @@ inline operator fun <reified T, reified U> Dataset<T>.invoke(column: KProperty1<
720720
*/
721721
fun <T> Dataset<T>.showDS(numRows: Int = 20, truncate: Boolean = true) = apply { show(numRows, truncate) }
722722

723+
/**
 * Returns a new Dataset by computing the given typed column expressions for each element.
 *
 * The two selected values are wrapped in a Kotlin [Pair], so the result stays fully typed
 * instead of falling back to an untyped row.
 *
 * @param c1 typed column whose value becomes the first element of the pair
 * @param c2 typed column whose value becomes the second element of the pair
 * @return a Dataset with one [Pair] of the computed values per row
 */
inline fun <reified T, reified U1, reified U2> Dataset<T>.selectTyped(
    c1: TypedColumn<T, U1>,
    c2: TypedColumn<T, U2>,
): Dataset<Pair<U1, U2>> =
    select(c1, c2).map { Pair(it._1(), it._2()) }
731+
732+
/**
 * Returns a new Dataset by computing the given typed column expressions for each element.
 *
 * The three selected values are wrapped in a Kotlin [Triple], so the result stays fully typed.
 *
 * @param c1 typed column whose value becomes the first element of the triple
 * @param c2 typed column whose value becomes the second element of the triple
 * @param c3 typed column whose value becomes the third element of the triple
 * @return a Dataset with one [Triple] of the computed values per row
 */
inline fun <reified T, reified U1, reified U2, reified U3> Dataset<T>.selectTyped(
    c1: TypedColumn<T, U1>,
    c2: TypedColumn<T, U2>,
    c3: TypedColumn<T, U3>,
): Dataset<Triple<U1, U2, U3>> =
    select(c1, c2, c3).map { Triple(it._1(), it._2(), it._3()) }
741+
742+
/**
 * Returns a new Dataset by computing the given typed column expressions for each element.
 *
 * The four selected values are wrapped in an [Arity4], so the result stays fully typed.
 *
 * @param c1 typed column whose value becomes `_1` of the result
 * @param c2 typed column whose value becomes `_2` of the result
 * @param c3 typed column whose value becomes `_3` of the result
 * @param c4 typed column whose value becomes `_4` of the result
 * @return a Dataset with one [Arity4] of the computed values per row
 */
inline fun <reified T, reified U1, reified U2, reified U3, reified U4> Dataset<T>.selectTyped(
    c1: TypedColumn<T, U1>,
    c2: TypedColumn<T, U2>,
    c3: TypedColumn<T, U3>,
    c4: TypedColumn<T, U4>,
): Dataset<Arity4<U1, U2, U3, U4>> =
    select(c1, c2, c3, c4).map { Arity4(it._1(), it._2(), it._3(), it._4()) }
752+
753+
/**
 * Returns a new Dataset by computing the given typed column expressions for each element.
 *
 * The five selected values are wrapped in an [Arity5], so the result stays fully typed.
 *
 * @param c1 typed column whose value becomes `_1` of the result
 * @param c2 typed column whose value becomes `_2` of the result
 * @param c3 typed column whose value becomes `_3` of the result
 * @param c4 typed column whose value becomes `_4` of the result
 * @param c5 typed column whose value becomes `_5` of the result
 * @return a Dataset with one [Arity5] of the computed values per row
 */
inline fun <reified T, reified U1, reified U2, reified U3, reified U4, reified U5> Dataset<T>.selectTyped(
    c1: TypedColumn<T, U1>,
    c2: TypedColumn<T, U2>,
    c3: TypedColumn<T, U3>,
    c4: TypedColumn<T, U4>,
    c5: TypedColumn<T, U5>,
): Dataset<Arity5<U1, U2, U3, U4, U5>> =
    select(c1, c2, c3, c4, c5).map { Arity5(it._1(), it._2(), it._3(), it._4(), it._5()) }
764+
765+
723766
@OptIn(ExperimentalStdlibApi::class)
724767
fun schema(type: KType, map: Map<String, KType> = mapOf()): DataType {
725768
val primitiveSchema = knownDataTypes[type.classifier]

kotlin-spark-api/3.0/src/test/kotlin/org/jetbrains/kotlinx/spark/api/ApiTest.kt

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,16 @@ import ch.tutteli.atrium.domain.builders.migration.asExpect
2222
import ch.tutteli.atrium.verbs.expect
2323
import io.kotest.core.spec.style.ShouldSpec
2424
import io.kotest.matchers.shouldBe
25+
import org.apache.spark.sql.streaming.GroupState
26+
import org.apache.spark.sql.streaming.GroupStateTimeout
27+
import scala.Product
2528
import scala.Tuple1
2629
import scala.Tuple2
2730
import scala.Tuple3
28-
import org.apache.spark.sql.streaming.GroupState
29-
import org.apache.spark.sql.streaming.GroupStateTimeout
3031
import scala.collection.Seq
3132
import org.apache.spark.sql.Dataset
3233
import org.apache.spark.sql.TypedColumn
3334
import org.apache.spark.sql.functions.*
34-
import scala.Product
3535
import java.io.Serializable
3636
import java.sql.Date
3737
import java.sql.Timestamp
@@ -350,6 +350,45 @@ class ApiTest : ShouldSpec({
350350
val asList = dataset.takeAsList(2)
351351
asList.first().tuple shouldBe Tuple3(5L, "test", Tuple1(""))
352352
}
353+
@Suppress("UNCHECKED_CAST")
354+
should("support dataset select") {
355+
val dataset = dsOf(
356+
SomeClass(intArrayOf(1, 2, 3), 3),
357+
SomeClass(intArrayOf(1, 2, 4), 5),
358+
)
359+
360+
val typedColumnA: TypedColumn<Any, IntArray> = dataset.col("a").`as`(encoder())
361+
362+
val newDS2 = dataset.selectTyped(
363+
col(SomeClass::a), // NOTE: this only works on 3.0, returning a data class with an array in it
364+
col(SomeClass::b),
365+
)
366+
newDS2.show()
367+
368+
val newDS3 = dataset.selectTyped(
369+
col(SomeClass::a),
370+
col(SomeClass::b),
371+
col(SomeClass::b),
372+
)
373+
newDS3.show()
374+
375+
val newDS4 = dataset.selectTyped(
376+
col(SomeClass::a),
377+
col(SomeClass::b),
378+
col(SomeClass::b),
379+
col(SomeClass::b),
380+
)
381+
newDS4.show()
382+
383+
val newDS5 = dataset.selectTyped(
384+
col(SomeClass::a),
385+
col(SomeClass::b),
386+
col(SomeClass::b),
387+
col(SomeClass::b),
388+
col(SomeClass::b),
389+
)
390+
newDS5.show()
391+
}
353392
should("Access columns using invoke on datasets") {
354393
val dataset = dsOf(
355394
SomeClass(intArrayOf(1, 2, 3), 4),

0 commit comments

Comments
 (0)