From cb11dee2914d10d28f13b3d015732e6ece48c98d Mon Sep 17 00:00:00 2001
From: Nikita Klimenko
Date: Wed, 21 May 2025 02:07:42 +0300
Subject: [PATCH] Add a few more pages about the compiler plugin

---
 docs/StardustDocs/d.tree                     |  2 +
 .../topics/compilerPluginExamples.md         | 87 ++++++++++++++++++
 .../topics/staticInterpretation.md           | 90 +++++++++++++++++++
 3 files changed, 179 insertions(+)
 create mode 100644 docs/StardustDocs/topics/compilerPluginExamples.md
 create mode 100644 docs/StardustDocs/topics/staticInterpretation.md

diff --git a/docs/StardustDocs/d.tree b/docs/StardustDocs/d.tree
index 7460f007f2..ec131ba7f5 100644
--- a/docs/StardustDocs/d.tree
+++ b/docs/StardustDocs/d.tree
@@ -44,7 +44,9 @@
+            <toc-element topic="staticInterpretation.md"/>
+            <toc-element topic="compilerPluginExamples.md"/>

diff --git a/docs/StardustDocs/topics/compilerPluginExamples.md b/docs/StardustDocs/topics/compilerPluginExamples.md
new file mode 100644
index 0000000000..8ee59df853
--- /dev/null
+++ b/docs/StardustDocs/topics/compilerPluginExamples.md
@@ -0,0 +1,87 @@
[//]: # (title: Compiler plugin examples)

This page provides a few examples that you can copy directly into your project.
[Schema info](staticInterpretation.md#schema-info) is a convenient way to observe the result of the different operations.

### Example 1

```kotlin
import org.jetbrains.kotlinx.dataframe.api.*

fun main() {
    val df = dataFrameOf("location", "income")(
        "mall", "2.49",
        "university", "2.99",
        "university", "1.49",
        "school", "0.99",
        "hospital", "2.99",
        "university", "0.49",
        "hospital", "1.49",
        "mall", "0.99",
        "hospital", "0.49",
    )

    df
        // parse the String `income` column into Double
        .convert { income }.with { it.toDouble() }
        .groupBy { location }.aggregate {
            income.toList() into "allTransactions"
            sumOf { income } into "totalIncome"
        }.forEach {
            println(location)
            println("totalIncome = $totalIncome")
        }
}
```

### Example 2

```kotlin
import org.jetbrains.kotlinx.dataframe.api.*
import org.jetbrains.kotlinx.dataframe.io.*

enum class State {
    Idle, Productive, Maintenance
}

class Event(val toolId: String, val state: State, val timestamp: Long)

fun main() {
    val tool1 = "tool_1"
    val tool2 = "tool_2"
    val tool3 = "tool_3"

    val events = listOf(
        Event(tool1, State.Idle, 0),
        Event(tool1, State.Productive, 5),
        Event(tool2, State.Idle, 0),
        Event(tool2, State.Maintenance, 10),
        Event(tool2, State.Idle, 20),
        Event(tool3, State.Idle, 0),
        Event(tool3, State.Productive, 25),
    ).toDataFrame()

    val lastTimestamp = events.maxOf { timestamp }

    val groupBy = events
        .groupBy { toolId }
        .sortBy { timestamp }
        // duration of a state = time until the next event (or until the end of the log)
        .add("stateDuration") {
            (next()?.timestamp ?: lastTimestamp) - timestamp
        }

    groupBy.updateGroups {
        // a single-column DataFrame with all possible states,
        // so that states missing from a group are reported too
        val allStates = State.entries.toDataFrame {
            "state" from { it }
        }

        val df = allStates.leftJoin(it) { state }
            .fillNulls { stateDuration }
            .with { -1 }

        df.groupBy { state }.sumFor { stateDuration }
    }
        .toDataFrame()
        .toStandaloneHtml()
        .openInBrowser()
}
```
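
### Example 3

A minimal sketch of a typical chain: `add` introduces a new column, and the plugin immediately exposes it as a typed property in the next call. The `name`/`age` data here is purely illustrative.

```kotlin
import org.jetbrains.kotlinx.dataframe.api.*

fun main() {
    val people = dataFrameOf(
        "name" to columnOf("Alice", "Bob", "Charlie"),
        "age" to columnOf(15, 20, 37),
    )

    people
        .add("isAdult") { age >= 18 } // the schema now also contains `isAdult: Boolean`
        .filter { isAdult }           // so it is available as a typed property here
        .sortByDesc { age }
        .forEach { println("$name: $age") }
}
```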
diff --git a/docs/StardustDocs/topics/staticInterpretation.md b/docs/StardustDocs/topics/staticInterpretation.md
new file mode 100644
index 0000000000..dcc2bc321d
--- /dev/null
+++ b/docs/StardustDocs/topics/staticInterpretation.md
@@ -0,0 +1,90 @@
# Static interpretation of DataFrame API

The compiler plugin evaluates DataFrame operations at compile time, given statically known arguments such as constant strings, resolved types, and property access calls.
It refines the return type of each function call to provide properties that match the resulting column names and types.
The goal is to reflect the result of the operations you apply to a DataFrame in its static type, and to give you a convenient, typed API:

```kotlin
val weatherData = dataFrameOf(
    "time" to columnOf(0, 1, 2, 4, 5, 7, 8, 9),
    "temperature" to columnOf(12.0, 14.2, 15.1, 15.9, 17.9, 15.6, 14.2, 24.3),
    "humidity" to columnOf(0.5, 0.32, 0.11, 0.89, 0.68, 0.57, 0.56, 0.5)
)

weatherData.filter { temperature > 15.0 }.print()
```

## Schema info

The schema of a DataFrame, as the compiler plugin sees it, is displayed when you hover over an expression or variable:

![image.png](schema_info.png)

This is an easy way to tell which properties are available.
For expressions consisting of several operations, you can see how the DataFrame changes at each step.

## Visibility of the generated code

The generated code is similar in nature to `@DataSchema` declarations.
Take this expression as an example:

```kotlin
fun main() {
    // the exact type of `df` is an anonymous, compiler-generated DataFrame type
    val df = dataFrameOf("col" to columnOf(42))
}
```

It produces two additional local classes:

```kotlin
// Represents the data schema
class DataFrameOf_39 {
    val col: Int
}

// Injected into the implicit receiver scope of the `main` function
class Scope {
    val DataRow<DataFrameOf_39>.col: Int
    val ColumnsScope<DataFrameOf_39>.col: DataColumn<Int>
}
```

You can read about the code transformation pipeline in [more detail](https://youtrack.jetbrains.com/issue/KT-65859).

Because the generated classes are anonymous local types, their visibility is limited to the private scope of the file.
This means you can do the following:

```kotlin
import org.jetbrains.kotlinx.dataframe.api.*

private fun create(i: Int) = dataFrameOf("number" to columnOf(i))
    .first()

fun main() {
    val row = create(42)
    println(row.number)
}
```

However, you cannot refer to these classes from your own code, make them appear in an explicit variable type, or use them as a function parameter type.

## Scope of compiler plugin

The compiler plugin aims to cover all functions where the result of an operation depends only on the input schema and on arguments that can be resolved at compile time.
In the library, such functions are annotated with `@Refine` or `@Interpretable`.

Some functions are not supported:
- `pivot`, `parse`, `read`, `ColumnSelectionDsl.filter`, etc. — operations whose resulting schema depends on the data itself, so they are out of scope;
- `gather`, `split`, `implode`, and some Columns Selection DSL functions — these will be supported in a future release.

In Gradle projects, this means you sometimes need to provide a [data schema](dataSchema.md) manually or fall back to the String API.

In Kotlin Notebook, the compiler plugin complements the built-in code generator that updates the types of variables after cell execution.

```kotlin
val df = DataFrame.read("...")
```

In the next cell, you can add, convert, remove, or aggregate columns and expect the schema to be updated accordingly,
without having to split your pipeline into multiple steps to trigger the notebook's code generation.
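
For example, a follow-up cell might look like the sketch below; the `firstName`, `lastName`, `age`, and `city` columns are assumed to be present in the file read above.

```kotlin
// Hypothetical follow-up cell; assumes the DataFrame read above has columns
// firstName: String, lastName: String, age: Int, city: String
df
    .add("fullName") { firstName + " " + lastName } // schema gains `fullName: String`
    .filter { age >= 18 }                           // rows change, schema does not
    .groupBy { city }.count()                       // result: `city` and `count` columns
```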