diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/corr.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/corr.kt index d4df7e426e..8c90cd7187 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/corr.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/corr.kt @@ -4,26 +4,180 @@ import org.jetbrains.kotlinx.dataframe.AnyCol import org.jetbrains.kotlinx.dataframe.ColumnsSelector import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload +import org.jetbrains.kotlinx.dataframe.api.CorrDocs.Grammar +import org.jetbrains.kotlinx.dataframe.api.CorrDocs.SelectingOptions +import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup import org.jetbrains.kotlinx.dataframe.columns.ColumnReference import org.jetbrains.kotlinx.dataframe.columns.toColumnSet +import org.jetbrains.kotlinx.dataframe.documentation.DocumentationUrls +import org.jetbrains.kotlinx.dataframe.documentation.DslGrammarLink +import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources +import org.jetbrains.kotlinx.dataframe.documentation.Indent +import org.jetbrains.kotlinx.dataframe.documentation.LineBreak +import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns import org.jetbrains.kotlinx.dataframe.impl.api.corrImpl import org.jetbrains.kotlinx.dataframe.util.DEPRECATED_ACCESS_API import kotlin.reflect.KProperty import kotlin.reflect.typeOf +/** + * Calculates the Pearson pairwise correlation between values in the specified [columns\]. + * + * This function does not compute the correlation immediately. + * Instead, it defines the primary set of columns + * and returns a [Corr] instance that allows configuring how the correlation should be computed. + * + * The function is available for numeric- and [Boolean] columns. + * [Boolean] values are converted into 1 for true and 0 for false. + * All other columns are ignored. 
+ * If a [ColumnGroup] instance is passed as the target column for correlation, + * it will be unpacked into suitable nested columns. + * + * The [Corr] object provides two methods to perform correlation calculations: + * - [with][Corr.with] — computes correlations between the initially selected columns and a second set of columns. + * - [withItself][Corr.withItself] — computes pairwise correlations within the initially selected columns. + * + * Each method returns a square or rectangular correlation matrix represented by a [DataFrame], + * where rows and columns correspond to the selected column sets, + * and each cell contains the Pearson correlation coefficient between the corresponding pair of columns. + * + * To compute correlations between all suitable columns in the [DataFrame], use [DataFrame.corr()][DataFrame.corr]. + * + * Check out [Grammar]. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * See also: [Selecting Columns][SelectingOptions]. + * + * For more information, see: {@include [DocumentationUrls.Corr]} + */ +internal interface CorrDocs { + + /** + * {@comment Version of [SelectingColumns] with correctly filled in examples} + * @include [SelectingColumns] {@include [SetCorrOperationArg]} + */ + interface SelectingOptions + + /** + * ## Corr Operation Grammar + * {@include [LineBreak]} + * {@include [DslGrammarLink]} + * {@include [LineBreak]} + * + * **[`corr`][corr]**` { columnsSelector: `[`ColumnsSelector`][ColumnsSelector]` }` + * + * {@include [Indent]} + * __`.`__[**`with`**][Corr.with]` { columnsSelector: `[`ColumnsSelector`][ColumnsSelector]` }` + * + * {@include [Indent]} + *`| `__`.`__[**`withItself`**][Corr.withItself]`()` + */ + interface Grammar +} + +/** {@set [SelectingColumns.OPERATION] [corr][corr]} */ +@ExcludeFromSources +private interface SetCorrOperationArg + +/** + * {@include [CorrDocs]} + * ### This Corr Overload + */ +@ExcludeFromSources +private interface CommonCorrDocs + internal fun 
AnyCol.isSuitableForCorr() = isSubtypeOf() || type() == typeOf() // region DataFrame +/** + * An intermediate class used in the [corr] operation. + * + * This class does not perform any computation by itself — it serves as a transitional step + * before specifying how the correlation should be calculated. + * It must be followed by one of the computation methods to produce a correlation [DataFrame]. + * + * The resulting [DataFrame] is a correlation matrix where rows correspond to one set of columns, + * columns to the other set, and each cell contains the Pearson correlation coefficient + * between the respective pair of columns. + * + * Use the following methods to perform the computation: + * - [with] — selects a second set of columns and computes correlations between + * the initially selected columns and this second set. + * - [withItself] — computes pairwise correlations within the initially selected columns. + * + * See [Grammar][CorrDocs.Grammar] for more details. + */ public data class Corr(internal val df: DataFrame, internal val columns: ColumnsSelector) +/** + * Computes the Pearson correlation between all suitable columns in this [DataFrame], + * including nested columns at any depth. + * + * The result is a square correlation matrix represented by a [DataFrame], + * where both rows and columns correspond to the original columns, + * and each cell contains the Pearson correlation coefficient between the respective pair of columns. + * + * The function is available for numeric- and [Boolean] columns. + * [Boolean] values are converted into 1 for true and 0 for false. + * All other columns are ignored. + * + * For more information, see: {@include [DocumentationUrls.Corr]} + * + * @return A square correlation matrix as a [DataFrame], where both rows and columns correspond to the original columns. 
+ */ public fun DataFrame.corr(): DataFrame = corr { colsAtAnyDepth().filter { it.isSuitableForCorr() } }.withItself() +/** + * {@include [CommonCorrDocs]} + * @include [SelectingColumns.Dsl] {@include [SetCorrOperationArg]} + * + * The function is available for numeric- and [Boolean] columns. + * [Boolean] values are converted into 1 for true and 0 for false. + * All other columns are ignored. + * If a [ColumnGroup] instance is passed as the target column for correlation, + * it will be unpacked into suitable nested columns. + * + * ### Examples + * ```kotlin + * // Compute correlations between the "age" column and the "weight" and "height" columns + * df.corr { age }.with { weight and height } + * + * // Compute pairwise correlations between all columns of type `Number` + * df.corr { colsOf<Number>() }.withItself() + * ``` + * @param [columns\] The [Columns Selector][ColumnsSelector] used to select the columns + * of this [DataFrame] to compute a correlation. + * @return A [Corr] intermediate object with the selected columns. + */ public fun DataFrame.corr(columns: ColumnsSelector): Corr = Corr(this, columns) +/** + * {@include [CommonCorrDocs]} + * @include [SelectingColumns.ColumnNames] {@include [SetCorrOperationArg]} + * + * The function is available for numeric- and [Boolean] columns. + * [Boolean] values are converted into 1 for true and 0 for false. + * All other columns are ignored. + * If a [ColumnGroup] instance is passed as the target column for correlation, + * it will be unpacked into suitable nested columns. + * + * ### Examples + * ```kotlin + * // Compute correlations between the "age" column and the "weight" and "height" columns + * df.corr("age").with("weight", "height") + * + * // Compute pairwise correlations between the "age", "weight" and "height" columns + * df.corr("age", "weight", "height").withItself() + * ``` + * @param [columns\] The [Column Names][String] used to select the columns + * of this [DataFrame] to compute a correlation. 
+ * @return A [Corr] intermediate object with the selected columns. + */ public fun DataFrame.corr(vararg columns: String): Corr = corr { columns.toColumnSet() } @Deprecated(DEPRECATED_ACCESS_API) @@ -34,8 +188,67 @@ public fun DataFrame.corr(vararg columns: KProperty): Corr = @AccessApiOverload public fun DataFrame.corr(vararg columns: ColumnReference): Corr = corr { columns.toColumnSet() } +/** + * Calculates the correlation of specified [columns][otherColumns] + * with values in the columns previously selected with [corr]. + * + * Returns a correlation matrix represented by a [DataFrame], + * where rows and columns correspond to the selected column sets, + * and each cell contains the Pearson correlation coefficient between the corresponding pair of columns. + * + * Check out [Grammar]. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * See also: [Selecting Columns][SelectingOptions]. + * + * For more information, see: {@include [DocumentationUrls.Corr]} + */ +internal interface CorrWithDocs + +/** + * {@include [CorrWithDocs]} + * ### This Corr With Overload + */ +@ExcludeFromSources +private interface CommonCorrWithDocs + +/** + * {@include [CommonCorrWithDocs]} + * @include [SelectingColumns.Dsl] {@include [SetCorrOperationArg]} + * + * ### Examples + * ```kotlin + * // Compute correlations between the "age" column and the "weight" and "height" columns + * df.corr { age }.with { weight and height } + * + * // Compute correlations between the "speed" column and all columns of type `Double` (excluding itself) + * df.corr { speed }.with { colsOf<Double>() except speed } + * ``` + * + * @param otherColumns The [ColumnsSelector] used to select the second set of columns + * from this [DataFrame] to compute correlations against the initially selected columns. + * @return A [DataFrame] containing the resulting correlation matrix. 
+ */ public fun Corr.with(otherColumns: ColumnsSelector): DataFrame = corrImpl(otherColumns) +/** + * {@include [CommonCorrWithDocs]} + * @include [SelectingColumns.ColumnNames] {@include [SetCorrOperationArg]} + * + * ### Examples + * ```kotlin + * // Compute correlations between the "age" column and the "weight" and "height" columns + * df.corr("age").with("weight", "height") + * + * // Compute correlations between the "speed" column and all columns of type `Number` + * df.corr { colsOf<Number>() }.with("speed") + * ``` + * + * @param otherColumns The [Column Names][String] used to select the second set of columns + * from this [DataFrame] to compute correlations against the initially selected columns. + * @return A [DataFrame] containing the resulting correlation matrix. + */ public fun Corr.with(vararg otherColumns: String): DataFrame = with { otherColumns.toColumnSet() } @Deprecated(DEPRECATED_ACCESS_API) @@ -48,6 +261,20 @@ public fun Corr.with(vararg otherColumns: KProperty): DataFra public fun Corr.with(vararg otherColumns: ColumnReference): DataFrame = with { otherColumns.toColumnSet() } +/** + * Calculates Pearson pairwise correlations between the columns + * previously selected with [corr]. + * + * Returns a square correlation matrix represented by a [DataFrame], + * where both rows and columns correspond to the selected columns, + * and each cell contains the Pearson correlation coefficient between the respective pair of columns. + * + * Check out [Grammar]. + * + * For more information, see: {@include [DocumentationUrls.Corr]} + * + * @return A [DataFrame] containing the pairwise correlation matrix. 
+ */ public fun Corr.withItself(): DataFrame = with(columns) // endregion diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DocumentationUrls.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DocumentationUrls.kt index 221be75bd1..a1823956fe 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DocumentationUrls.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DocumentationUrls.kt @@ -102,6 +102,9 @@ internal interface DocumentationUrls { /** [See `convert` on the documentation website.]({@include [Url]}/convert.html) */ interface Convert + /** [See `corr` on the documentation website.]({@include [Url]}/corr.html) */ + interface Corr + /** [See `add` on the documentation website.]({@include [Url]}/add.html) */ interface Add