Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@ import kotlin.reflect.KProperty

// region DataFrame

/**
 * Flattens this [DataFrame] by delegating to the selector-based `flatten` overload
 * with an `all()` selector.
 *
 * @return the [DataFrame] produced by the selector-based overload.
 */
public fun <T> DataFrame<T>.flatten(): DataFrame<T> {
    return flatten { all() }
}

/**
 * Flattens this [DataFrame] using an `all()` selector, forwarding
 * [keepParentNameForColumns] to the selector-based `flatten` overload.
 *
 * @param keepParentNameForColumns passed through unchanged; defaults to `false`.
 * @return the [DataFrame] produced by the selector-based overload.
 */
public fun <T> DataFrame<T>.flatten(
    keepParentNameForColumns: Boolean = false
): DataFrame<T> {
    return flatten(keepParentNameForColumns) { all() }
}

/**
 * Flattens the columns picked by [columns] by delegating to `flattenImpl`.
 *
 * @param columns selector of the columns to flatten.
 * @return the [DataFrame] returned by `flattenImpl`.
 */
public fun <T, C> DataFrame<T>.flatten(columns: ColumnsSelector<T, C>): DataFrame<T> {
    return flattenImpl(columns)
}

/**
 * Flattens the columns picked by [columns] by delegating to `flattenImpl`,
 * forwarding [keepParentNameForColumns].
 *
 * The flag is declared before the selector, so the selector can still be supplied
 * as a trailing lambda.
 *
 * @param keepParentNameForColumns passed through to `flattenImpl`; defaults to `false`.
 * @param columns selector of the columns to flatten.
 * @return the [DataFrame] returned by `flattenImpl`.
 */
public fun <T, C> DataFrame<T>.flatten(
    keepParentNameForColumns: Boolean = false,
    columns: ColumnsSelector<T, C>
): DataFrame<T> {
    return flattenImpl(columns, keepParentNameForColumns)
}

/**
 * Flattens the columns identified by the given names.
 *
 * The names are converted with `toColumnSet()` and handed to `flattenImpl`.
 *
 * @param columns names of the columns to flatten.
 * @return the [DataFrame] returned by `flattenImpl`.
 */
public fun <T> DataFrame<T>.flatten(vararg columns: String): DataFrame<T> {
    return flattenImpl { columns.toColumnSet() }
}

/**
 * Flattens the columns identified by the given names, forwarding
 * [keepParentNameForColumns] to the selector-based `flatten` overload.
 *
 * @param columns names of the columns to flatten.
 * @param keepParentNameForColumns passed through unchanged; defaults to `false`.
 *   It follows a `vararg`, so callers must pass it as a named argument.
 * @return the [DataFrame] produced by the selector-based overload.
 */
public fun <T> DataFrame<T>.flatten(
    vararg columns: String,
    keepParentNameForColumns: Boolean = false
): DataFrame<T> {
    return flatten(keepParentNameForColumns) { columns.toColumnSet() }
}

/**
 * Flattens the columns identified by the given column references.
 *
 * The references are converted with `toColumnSet()` and handed to `flattenImpl`.
 *
 * @param columns references to the columns to flatten.
 * @return the [DataFrame] returned by `flattenImpl`.
 */
public fun <T, C> DataFrame<T>.flatten(vararg columns: ColumnReference<C>): DataFrame<T> {
    return flattenImpl { columns.toColumnSet() }
}

/**
 * Flattens the columns identified by the given column references, forwarding
 * [keepParentNameForColumns] to the selector-based `flatten` overload.
 *
 * @param columns references to the columns to flatten.
 * @param keepParentNameForColumns passed through unchanged; defaults to `false`.
 *   It follows a `vararg`, so callers must pass it as a named argument.
 * @return the [DataFrame] produced by the selector-based overload.
 */
public fun <T, C> DataFrame<T>.flatten(
    vararg columns: ColumnReference<C>,
    keepParentNameForColumns: Boolean = false
): DataFrame<T> {
    return flatten(keepParentNameForColumns) { columns.toColumnSet() }
}

/**
 * Flattens the columns identified by the given Kotlin properties.
 *
 * The properties are converted with `toColumnSet()` and handed to `flattenImpl`.
 *
 * @param columns properties naming the columns to flatten.
 * @return the [DataFrame] returned by `flattenImpl`.
 */
public fun <T, C> DataFrame<T>.flatten(vararg columns: KProperty<C>): DataFrame<T> {
    return flattenImpl { columns.toColumnSet() }
}

/**
 * Flattens the columns identified by the given Kotlin properties, forwarding
 * [keepParentNameForColumns] to the selector-based `flatten` overload.
 *
 * @param columns properties naming the columns to flatten.
 * @param keepParentNameForColumns passed through unchanged; defaults to `false`.
 *   It follows a `vararg`, so callers must pass it as a named argument.
 * @return the [DataFrame] produced by the selector-based overload.
 */
public fun <T, C> DataFrame<T>.flatten(
    vararg columns: KProperty<C>,
    keepParentNameForColumns: Boolean = false
): DataFrame<T> {
    return flatten(keepParentNameForColumns) { columns.toColumnSet() }
}

// endregion
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator
import org.jetbrains.kotlinx.dataframe.impl.columns.toColumnSet

internal fun <T, C> DataFrame<T>.flattenImpl(
columns: ColumnsSelector<T, C>
columns: ColumnsSelector<T, C>,
keepParentNameForColumns: Boolean = false
): DataFrame<T> {
val rootColumns = getColumnsWithPaths { columns.toColumnSet().filter { it.isColumnGroup() }.top() }
val rootPrefixes = rootColumns.map { it.path }.toSet()
Expand All @@ -29,7 +30,8 @@ internal fun <T, C> DataFrame<T>.flattenImpl(
.into {
val targetPath = getRootPrefix(it.path).dropLast(1)
val nameGen = nameGenerators[targetPath]!!
val name = nameGen.addUnique(it.name())
val preferredName = if (keepParentNameForColumns) "${it.name()}.${it.parentName}" else it.name()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like it is the wrong way around; the generated name should be `parentName.name` (parent name first) instead.

val name = nameGen.addUnique(preferredName)
targetPath + name
}
return result
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,37 @@ class FlattenTests {
flattened.getColumnGroup("f").columnNames() shouldBe listOf("a", "b", "c")
flattened.ungroup("f") shouldBe df
}

@Test
fun `flatten the aggregation and check column names`() {
    // Fixture: six named columns, seven rows, typed as Person.
    val header = dataFrameOf("firstName", "lastName", "age", "city", "weight", "isHappy")
    val people = header(
        "Alice", "Cooper", 15, "London", 54, true,
        "Bob", "Dylan", 45, "Dubai", 87, true,
        "Charlie", "Daniels", 20, "Moscow", 35, false,
        "Charlie", "Chaplin", 40, "Milan", 41, true,
        "Bob", "Marley", 30, "Tokyo", 68, true,
        "Alice", "Wolf", 20, "Milan", 55, false,
        "Charlie", "Byrd", 30, "Moscow", 90, true
    ).cast<Person>()

    // Aggregate per city, storing the statistics under "mean" and "std".
    val statsByCity = people.groupBy("city").aggregate {
        mean() into "mean"
        std() into "std"
    }

    // With the flag enabled, the flattened names are expected to be dot-joined.
    val flattenedNames = statsByCity
        .flatten(keepParentNameForColumns = true)
        .columnNames()

    flattenedNames shouldBe listOf("city", "age.mean", "weight.mean", "age.std", "weight.std")
}

/**
 * Schema used to type the fixture data frame in
 * `flatten the aggregation and check column names`, which is cast to [Person].
 *
 * Property names match the column names passed to `dataFrameOf` in that test.
 */
@DataSchema
interface Person {
val age: Int
// Declared nullable, although every fixture row supplies a city.
val city: String?
val firstName: String
val lastName: String
// Declared nullable, although every fixture row supplies a weight.
val weight: Int?
val isHappy: Boolean
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class MoveTests {
val grouped = df.move { cols { it.name.contains(".") } }.into { it.name.split(".").toPath() }

@Test
fun batchGrouping() {
fun `batch grouping`() {
grouped.columnNames() shouldBe listOf("q", "a", "b", "w", "e", "r")
grouped["a"].asColumnGroup().columnNames() shouldBe listOf("b", "c")
grouped["a"]["c"].asColumnGroup().columnNames() shouldBe listOf("d")
Expand All @@ -33,7 +33,7 @@ class MoveTests {
}

@Test
fun batchUngrouping() {
fun `batch ungrouping`() {
val ungrouped = grouped.move { dfs { it.depth() > 0 && !it.isColumnGroup() } }.into { pathOf(it.path.joinToString(".")) }
ungrouped.columnNames() shouldBe listOf("q", "a.b", "a.c.d", "b.c", "b.d", "w", "e.f", "r")
}
Expand Down Expand Up @@ -64,7 +64,7 @@ class MoveTests {
}

@Test
fun `selectDfs`() {
fun `select Dfs`() {
val selected = grouped.select { it["a"].dfs { !it.isColumnGroup() } }
selected.columnNames() shouldBe listOf("b", "d")
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@ import kotlin.reflect.KProperty

// region DataFrame

/**
 * Flattens this [DataFrame] by delegating to the selector-based `flatten` overload
 * with an `all()` selector.
 *
 * @return the [DataFrame] produced by the selector-based overload.
 */
public fun <T> DataFrame<T>.flatten(): DataFrame<T> {
    return flatten { all() }
}

/**
 * Flattens this [DataFrame] using an `all()` selector, forwarding
 * [keepParentNameForColumns] to the selector-based `flatten` overload.
 *
 * @param keepParentNameForColumns passed through unchanged; defaults to `false`.
 * @return the [DataFrame] produced by the selector-based overload.
 */
public fun <T> DataFrame<T>.flatten(
    keepParentNameForColumns: Boolean = false
): DataFrame<T> {
    return flatten(keepParentNameForColumns) { all() }
}

/**
 * Flattens the columns picked by [columns] by delegating to `flattenImpl`.
 *
 * @param columns selector of the columns to flatten.
 * @return the [DataFrame] returned by `flattenImpl`.
 */
public fun <T, C> DataFrame<T>.flatten(columns: ColumnsSelector<T, C>): DataFrame<T> {
    return flattenImpl(columns)
}

/**
 * Flattens the columns picked by [columns] by delegating to `flattenImpl`,
 * forwarding [keepParentNameForColumns].
 *
 * The flag is declared before the selector, so the selector can still be supplied
 * as a trailing lambda.
 *
 * @param keepParentNameForColumns passed through to `flattenImpl`; defaults to `false`.
 * @param columns selector of the columns to flatten.
 * @return the [DataFrame] returned by `flattenImpl`.
 */
public fun <T, C> DataFrame<T>.flatten(
    keepParentNameForColumns: Boolean = false,
    columns: ColumnsSelector<T, C>
): DataFrame<T> {
    return flattenImpl(columns, keepParentNameForColumns)
}

/**
 * Flattens the columns identified by the given names.
 *
 * The names are converted with `toColumnSet()` and handed to `flattenImpl`.
 *
 * @param columns names of the columns to flatten.
 * @return the [DataFrame] returned by `flattenImpl`.
 */
public fun <T> DataFrame<T>.flatten(vararg columns: String): DataFrame<T> {
    return flattenImpl { columns.toColumnSet() }
}

/**
 * Flattens the columns identified by the given names, forwarding
 * [keepParentNameForColumns] to the selector-based `flatten` overload.
 *
 * @param columns names of the columns to flatten.
 * @param keepParentNameForColumns passed through unchanged; defaults to `false`.
 *   It follows a `vararg`, so callers must pass it as a named argument.
 * @return the [DataFrame] produced by the selector-based overload.
 */
public fun <T> DataFrame<T>.flatten(
    vararg columns: String,
    keepParentNameForColumns: Boolean = false
): DataFrame<T> {
    return flatten(keepParentNameForColumns) { columns.toColumnSet() }
}

/**
 * Flattens the columns identified by the given column references.
 *
 * The references are converted with `toColumnSet()` and handed to `flattenImpl`.
 *
 * @param columns references to the columns to flatten.
 * @return the [DataFrame] returned by `flattenImpl`.
 */
public fun <T, C> DataFrame<T>.flatten(vararg columns: ColumnReference<C>): DataFrame<T> {
    return flattenImpl { columns.toColumnSet() }
}

/**
 * Flattens the columns identified by the given column references, forwarding
 * [keepParentNameForColumns] to the selector-based `flatten` overload.
 *
 * @param columns references to the columns to flatten.
 * @param keepParentNameForColumns passed through unchanged; defaults to `false`.
 *   It follows a `vararg`, so callers must pass it as a named argument.
 * @return the [DataFrame] produced by the selector-based overload.
 */
public fun <T, C> DataFrame<T>.flatten(
    vararg columns: ColumnReference<C>,
    keepParentNameForColumns: Boolean = false
): DataFrame<T> {
    return flatten(keepParentNameForColumns) { columns.toColumnSet() }
}

/**
 * Flattens the columns identified by the given Kotlin properties.
 *
 * The properties are converted with `toColumnSet()` and handed to `flattenImpl`.
 *
 * @param columns properties naming the columns to flatten.
 * @return the [DataFrame] returned by `flattenImpl`.
 */
public fun <T, C> DataFrame<T>.flatten(vararg columns: KProperty<C>): DataFrame<T> {
    return flattenImpl { columns.toColumnSet() }
}

/**
 * Flattens the columns identified by the given Kotlin properties, forwarding
 * [keepParentNameForColumns] to the selector-based `flatten` overload.
 *
 * @param columns properties naming the columns to flatten.
 * @param keepParentNameForColumns passed through unchanged; defaults to `false`.
 *   It follows a `vararg`, so callers must pass it as a named argument.
 * @return the [DataFrame] produced by the selector-based overload.
 */
public fun <T, C> DataFrame<T>.flatten(
    vararg columns: KProperty<C>,
    keepParentNameForColumns: Boolean = false
): DataFrame<T> {
    return flatten(keepParentNameForColumns) { columns.toColumnSet() }
}

// endregion
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator
import org.jetbrains.kotlinx.dataframe.impl.columns.toColumnSet

internal fun <T, C> DataFrame<T>.flattenImpl(
columns: ColumnsSelector<T, C>
columns: ColumnsSelector<T, C>,
keepParentNameForColumns: Boolean = false
): DataFrame<T> {
val rootColumns = getColumnsWithPaths { columns.toColumnSet().filter { it.isColumnGroup() }.top() }
val rootPrefixes = rootColumns.map { it.path }.toSet()
Expand All @@ -29,7 +30,8 @@ internal fun <T, C> DataFrame<T>.flattenImpl(
.into {
val targetPath = getRootPrefix(it.path).dropLast(1)
val nameGen = nameGenerators[targetPath]!!
val name = nameGen.addUnique(it.name())
val preferredName = if (keepParentNameForColumns) "${it.name()}.${it.parentName}" else it.name()
val name = nameGen.addUnique(preferredName)
targetPath + name
}
return result
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,37 @@ class FlattenTests {
flattened.getColumnGroup("f").columnNames() shouldBe listOf("a", "b", "c")
flattened.ungroup("f") shouldBe df
}

@Test
fun `flatten the aggregation and check column names`() {
    // Fixture: six named columns, seven rows, typed as Person.
    val header = dataFrameOf("firstName", "lastName", "age", "city", "weight", "isHappy")
    val people = header(
        "Alice", "Cooper", 15, "London", 54, true,
        "Bob", "Dylan", 45, "Dubai", 87, true,
        "Charlie", "Daniels", 20, "Moscow", 35, false,
        "Charlie", "Chaplin", 40, "Milan", 41, true,
        "Bob", "Marley", 30, "Tokyo", 68, true,
        "Alice", "Wolf", 20, "Milan", 55, false,
        "Charlie", "Byrd", 30, "Moscow", 90, true
    ).cast<Person>()

    // Aggregate per city, storing the statistics under "mean" and "std".
    val statsByCity = people.groupBy("city").aggregate {
        mean() into "mean"
        std() into "std"
    }

    // With the flag enabled, the flattened names are expected to be dot-joined.
    val flattenedNames = statsByCity
        .flatten(keepParentNameForColumns = true)
        .columnNames()

    flattenedNames shouldBe listOf("city", "age.mean", "weight.mean", "age.std", "weight.std")
}

/**
 * Schema used to type the fixture data frame in
 * `flatten the aggregation and check column names`, which is cast to [Person].
 *
 * Property names match the column names passed to `dataFrameOf` in that test.
 */
@DataSchema
interface Person {
val age: Int
// Declared nullable, although every fixture row supplies a city.
val city: String?
val firstName: String
val lastName: String
// Declared nullable, although every fixture row supplies a weight.
val weight: Int?
val isHappy: Boolean
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class MoveTests {
val grouped = df.move { cols { it.name.contains(".") } }.into { it.name.split(".").toPath() }

@Test
fun batchGrouping() {
fun `batch grouping`() {
grouped.columnNames() shouldBe listOf("q", "a", "b", "w", "e", "r")
grouped["a"].asColumnGroup().columnNames() shouldBe listOf("b", "c")
grouped["a"]["c"].asColumnGroup().columnNames() shouldBe listOf("d")
Expand All @@ -33,7 +33,7 @@ class MoveTests {
}

@Test
fun batchUngrouping() {
fun `batch ungrouping`() {
val ungrouped = grouped.move { dfs { it.depth() > 0 && !it.isColumnGroup() } }.into { pathOf(it.path.joinToString(".")) }
ungrouped.columnNames() shouldBe listOf("q", "a.b", "a.c.d", "b.c", "b.d", "w", "e.f", "r")
}
Expand Down Expand Up @@ -64,7 +64,7 @@ class MoveTests {
}

@Test
fun `selectDfs`() {
fun `select Dfs`() {
val selected = grouped.select { it["a"].dfs { !it.isColumnGroup() } }
selected.columnNames() shouldBe listOf("b", "d")
}
Expand Down