From c843ffdc520472063c557f9d6fbae6961a40027f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 20 Aug 2025 12:35:39 -0400 Subject: [PATCH 1/7] Improve documentation for Signature, Volatility, and TypeSignature --- datafusion/expr-common/src/signature.rs | 115 +++++++++++++++++------- datafusion/expr/src/udf.rs | 20 +++-- 2 files changed, 97 insertions(+), 38 deletions(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 5e1705d8ff61..657348841ea3 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Signature module contains foundational types that are used to represent signatures, types, -//! and return types of functions in DataFusion. +//! Function signatures: [`Volatility`], [`Signature`] and [`TypeSignature`] use std::fmt::Display; use std::hash::Hash; @@ -44,42 +43,79 @@ pub const TIMEZONE_WILDCARD: &str = "+TZ"; /// valid length. It exists to avoid the need to enumerate all possible fixed size list lengths. pub const FIXED_SIZE_LIST_WILDCARD: i32 = i32::MIN; -/// A function's volatility, which defines the functions eligibility for certain optimizations +/// When a function's output changes when the input does not +/// +/// The volatility of a function determine eligibility for certain +/// optimizations. You should always defined your function to have the strictest +/// volatility that applies to it to maximize performance and avoid unexpected +/// results. +/// #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)] pub enum Volatility { - /// An immutable function will always return the same output when given the same - /// input. DataFusion will attempt to inline immutable functions during planning. + /// Always returns the same output when given the same input. + /// + /// DataFusion will inline immutable functions during planning. 
Immutable, - /// A stable function may return different values given the same input across different - /// queries but must return the same value for a given input within a query. An example of - /// this is the `Now` function. DataFusion will attempt to inline `Stable` functions - /// during planning, when possible. - /// For query `select col1, now() from t1`, it might take a while to execute but - /// `now()` column will be the same for each output row, which is evaluated - /// during planning. + /// May return different values given the same input across different + /// queries but must return the same value for a given input within a query. + /// + /// An example of a stable function is the `now()` function. For example, + /// the query `select col1, now() from t1`, will return different results + /// each time it is run, but within the same query, the output of the + /// `now()` function has the same value for each output row. + /// + /// DataFusion will inline `Stable` functions during planning, when + /// possible. Stable, - /// A volatile function may change the return value from evaluation to evaluation. - /// Multiple invocations of a volatile function may return different results when used in the - /// same query. An example of this is the random() function. DataFusion - /// can not evaluate such functions during planning. - /// In the query `select col1, random() from t1`, `random()` function will be evaluated - /// for each output row, resulting in a unique random value for each row. + /// May change the return value from evaluation to evaluation. + /// + /// Multiple invocations of a volatile function may return different results + /// when used in the same query on different rows. An example of this is the + /// `random()` function. + /// + /// DataFusion can not evaluate such functions during planning or push these + /// predicates into scans. 
In the query `select col1, random() from t1`, + /// the `random()` function will be evaluated for each output row, resulting in + /// a unique random value for each row. Volatile, } -/// A function's type signature defines the types of arguments the function supports. +/// The types of arguments for which a function has implementations. +/// +/// Functions typically provide implementations for a small number of different +/// argument [`DataType`]s, rather than all possible combinations. If a user +/// calls a function with arguments that do not match any of the declared types, +/// DataFusion will attempt to automatically coerces (add casts to) function +/// arguments so they match the [`TypeSignature`]. See the [`type_coercion`] module +/// for more details +/// +/// For example, a function like `cos` may only provide an implementation for +/// [`DataType::Float64`]. When users call `cos` with a different argument type, +/// such as `cos(int_column)`, and type coercion automatically adds a cast such +/// as `cos(CAST int_column AS DOUBLE)` during planning. /// -/// Functions typically support only a few different types of arguments compared to the -/// different datatypes in Arrow. To make functions easy to use, when possible DataFusion -/// automatically coerces (add casts to) function arguments so they match the type signature. +/// [`type_coercion`]: crate::type_coercion /// -/// For example, a function like `cos` may only be implemented for `Float64` arguments. To support a query -/// that calls `cos` with a different argument type, such as `cos(int_column)`, type coercion automatically -/// adds a cast such as `cos(CAST int_column AS DOUBLE)` during planning. +/// # Example: Strings /// -/// # Data Types +/// There are several different string types in Arrow, such as +/// [`DataType::Utf8`], [`DataType::LargeUtf8`], and [`DataType::Utf8View`]. 
/// -/// ## Timestamps +/// Some functions may have specialized implementations for these types, while others +/// may be able to handle only one of them. For example, a function that +/// only works with [`DataType::Utf8View`] would have the following signature: +/// +/// ``` +/// # use arrow::datatypes::DataType; +/// # use datafusion_expr_common::signature::{TypeSignature}; +/// // Declares the function must be invoked with a single argument of type `Utf8View`. +/// // If a user calls the function with `Utf8` or `LargeUtf8`, DataFusion will +/// // automatically add a cast to `Utf8View` during planning. +/// let type_signature = TypeSignature::Exact(vec![DataType::Utf8View]); +/// +/// ``` +/// +/// # Example: Timestamps /// /// Types to match are represented using Arrow's [`DataType`]. [`DataType::Timestamp`] has an optional variable /// timezone specification. To specify a function can handle a timestamp with *ANY* timezone, use @@ -130,8 +166,9 @@ pub enum TypeSignature { Exact(Vec), /// One or more arguments belonging to the [`TypeSignatureClass`], in order. /// - /// [`Coercion`] contains not only the desired type but also the allowed casts. - /// For example, if you expect a function has string type, but you also allow it to be casted from binary type. + /// [`Coercion`] contains not only the desired type but also the allowed + /// casts. For example, a function may expect a string type but also allow + /// its argument to be cast from a binary type. /// /// For functions that take no arguments (e.g. `random()`) see [`TypeSignature::Nullary`]. Coercible(Vec), @@ -206,7 +243,7 @@ impl TypeSignature { /// just listing specific DataTypes. For example, TypeSignatureClass::Timestamp matches any timestamp /// type regardless of timezone or precision. 
/// -/// Used primarily with TypeSignature::Coercible to define function signatures that can accept +/// Used primarily with [`TypeSignature::Coercible`] to define function signatures that can accept /// arguments that can be coerced to a particular class of types. #[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Hash)] pub enum TypeSignatureClass { @@ -736,10 +773,22 @@ impl Hash for ImplicitCoercion { } } -/// Defines the supported argument types ([`TypeSignature`]) and [`Volatility`] for a function. +/// Defines supported argument types and volatility for a function. +/// +/// A [`Signature`] provides DataFusion information necessary for calling a +/// function. +/// +/// The [`TypeSignature`] defines the types that a function has implementations +/// for. It **DOES NOT** define the types that a user query could call the +/// function with. DataFusion will automatically coerce (cast) argument types to +/// one of the supported function signatures, if possible. +/// +/// For example, if the `Signature` of a function is `Exact(vec![DataType::Float64])`, +/// the function only has an implementation for `Float64` arguments. However, a user +/// can call the function with `Float32` or `Int64` arguments, and DataFusion will +/// automatically coerce (cast) the arguments to `Float64` before calling the function. /// -/// DataFusion will automatically coerce (cast) argument types to one of the supported -/// function signatures, if possible. +/// The [`Volatility`] defines how the output of the function changes with the input. #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] pub struct Signature { /// The data types that the function accepts. See [TypeSignature] for more information. 
diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index dcb942e65b91..91cd7b35d4b0 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -469,18 +469,28 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { )) } - /// Returns the function's [`Signature`] for information about what input - /// types are accepted and the function's Volatility. + /// Returns a [`Signature`] describing the argument types for which this + /// function has an implementation, and the function's [`Volatility`]. + /// + /// See [`Signature`] for more details on argument type handling + /// and [`Self::return_type`] for computing the return type. + /// + /// [`Volatility`]: datafusion_expr_common::signature::Volatility fn signature(&self) -> &Signature; - /// What [`DataType`] will be returned by this function, given the types of - /// the arguments. + /// [`DataType`] returned by this function, given the types of the + /// arguments. + /// + /// # Arguments + /// + /// `arg_types` Data types of the arguments. These types are guaranteed to + /// match one of [`Self::signature`]s. /// /// # Notes /// /// If you provide an implementation for [`Self::return_field_from_args`], /// DataFusion will not call `return_type` (this function). In such cases - /// is recommended to return [`DataFusionError::Internal`]. + /// is recommended to return [`DataFusionError::Internal`] from this function. 
/// /// [`DataFusionError::Internal`]: datafusion_common::DataFusionError::Internal fn return_type(&self, arg_types: &[DataType]) -> Result; From a590dc2343a4b638d8b521d59bee99e0475b7697 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 21 Aug 2025 17:16:34 -0400 Subject: [PATCH 2/7] Update datafusion/expr-common/src/signature.rs Co-authored-by: Jeffrey Vo --- datafusion/expr-common/src/signature.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 657348841ea3..5e3ef6b636ac 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -43,11 +43,11 @@ pub const TIMEZONE_WILDCARD: &str = "+TZ"; /// valid length. It exists to avoid the need to enumerate all possible fixed size list lengths. pub const FIXED_SIZE_LIST_WILDCARD: i32 = i32::MIN; -/// When a function's output changes when the input does not +/// How a function's output changes with respect to a fixed input /// -/// The volatility of a function determine eligibility for certain -/// optimizations. You should always defined your function to have the strictest -/// volatility that applies to it to maximize performance and avoid unexpected +/// The volatility of a function determines eligibility for certain +/// optimizations. You should always define your function to have the strictest +/// possible volatility to maximize performance and avoid unexpected /// results. 
/// #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)] From 6d690d822c94e79111c695148c1e97baf2d67752 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 21 Aug 2025 17:16:41 -0400 Subject: [PATCH 3/7] Update datafusion/expr-common/src/signature.rs Co-authored-by: Jeffrey Vo --- datafusion/expr-common/src/signature.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 5e3ef6b636ac..54451f7f3a28 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -85,7 +85,7 @@ pub enum Volatility { /// Functions typically provide implementations for a small number of different /// argument [`DataType`]s, rather than all possible combinations. If a user /// calls a function with arguments that do not match any of the declared types, -/// DataFusion will attempt to automatically coerces (add casts to) function +/// DataFusion will attempt to automatically coerce (add casts to) function /// arguments so they match the [`TypeSignature`]. See the [`type_coercion`] module /// for more details /// From 94e58f948dd56c79c22b18cfc3c36ffd4f03ef92 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 21 Aug 2025 17:31:26 -0400 Subject: [PATCH 4/7] clarify immutable --- datafusion/expr-common/src/signature.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 54451f7f3a28..0ed30c29f20e 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -55,14 +55,18 @@ pub enum Volatility { /// Always returns the same output when given the same input. /// /// DataFusion will inline immutable functions during planning. 
+ /// + /// For example, the `abs` function is immutable, so `abs(-1)` will be + /// evaluated and replaced with `1` during planning rather than invoking + /// the function at runtime. Immutable, /// May return different values given the same input across different /// queries but must return the same value for a given input within a query. /// - /// An example of a stable function is the `now()` function. For example, - /// the query `select col1, now() from t1`, will return different results - /// each time it is run, but within the same query, the output of the - /// `now()` function has the same value for each output row. + /// For example, the `now()` function is stable, because the query `select + /// col1, now() from t1` will return different results each time it is run, + /// but within the same query, the output of the `now()` function has the + /// same value for each output row. /// /// DataFusion will inline `Stable` functions during planning, when /// possible. From 76c67d8914b9856e0745a942a242591f4348fb8f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 21 Aug 2025 17:40:13 -0400 Subject: [PATCH 5/7] clarify ScalarUdfImpl --- datafusion/expr/src/udf.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index b80a165182cc..db6d9aa42090 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -500,14 +500,18 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { /// /// # Arguments /// - /// `arg_types` Data types of the arguments. These types are guaranteed to - /// match one of [`Self::signature`]s. + /// `arg_types` Data types of the arguments. The implementation of + /// `return_type` can assume that some other part of the code has coerced + /// the actual argument types to match [`Self::signature`]. 
/// /// # Notes /// /// If you provide an implementation for [`Self::return_field_from_args`], - /// DataFusion will not call `return_type` (this function). In such cases - /// is recommended to return [`DataFusionError::Internal`] from this function. + /// DataFusion will not call `return_type` (this function). While it is + /// valid to put [`unimplemented!()`] or [`unreachable!()`], it is + /// recommended to return [`DataFusionError::Internal`] instead, which + /// reduces the severity of symptoms if bugs occur (an error rather than a + /// panic). /// /// [`DataFusionError::Internal`]: datafusion_common::DataFusionError::Internal fn return_type(&self, arg_types: &[DataType]) -> Result; From a076d73fea873b7e2ba7680b28e59fabb6096457 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 21 Aug 2025 17:43:27 -0400 Subject: [PATCH 6/7] examples of when stable functions will be inlined --- datafusion/expr-common/src/signature.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 0ed30c29f20e..f5d239a0bbe3 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -68,8 +68,9 @@ pub enum Volatility { /// but within the same query, the output of the `now()` function has the /// same value for each output row. /// - /// DataFusion will inline `Stable` functions during planning, when - /// possible. + /// DataFusion will inline `Stable` functions when possible. For example, + /// `Stable` functions are inlined when planning a query for execution, but + /// not in View definitions or prepared statements. Stable, /// May change the return value from evaluation to evaluation. 
/// From edaeca66a71bbd65870ba9ddd1efc7f48dc4f412 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 21 Aug 2025 17:46:51 -0400 Subject: [PATCH 7/7] Reduce duplication between signature and TypeSignature --- datafusion/expr-common/src/signature.rs | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index f5d239a0bbe3..2c4590ab9d6d 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -87,6 +87,11 @@ pub enum Volatility { /// The types of arguments for which a function has implementations. /// +/// [`TypeSignature`] **DOES NOT** define the types that a user query could call the +/// function with. DataFusion will automatically coerce (cast) argument types to +/// one of the supported function signatures, if possible. +/// +/// # Overview /// Functions typically provide implementations for a small number of different /// argument [`DataType`]s, rather than all possible combinations. If a user /// calls a function with arguments that do not match any of the declared types, @@ -94,6 +99,7 @@ pub enum Volatility { /// arguments so they match the [`TypeSignature`]. See the [`type_coercion`] module /// for more details /// +/// # Example: Numeric Functions /// For example, a function like `cos` may only provide an implementation for /// [`DataType::Float64`]. When users call `cos` with a different argument type, /// such as `cos(int_column)`, and type coercion automatically adds a cast such @@ -778,22 +784,12 @@ impl Hash for ImplicitCoercion { } } -/// Defines supported argument types and volatility for a function. -/// -/// A [`Signature`] provides DataFusion information necessary for calling a -/// function. -/// -/// The [`TypeSignature`] defines the types that a function has implementations -/// for. It **DOES NOT** define the types that a user query could call the -/// function with. 
DataFusion will automatically coerce (cast) argument types to -/// one of the supported function signatures, if possible. +/// Provides information necessary for calling a function. /// -/// For example, if the `Signature` of a function is `Exact(vec![DataType::Float64])`, -/// the function only has an implementation for `Float64` arguments. However, a user -/// can call the function with `Float32` or `Int64` arguments, and DataFusion will -/// automatically coerce (cast) the arguments to `Float64` before calling the function. +/// - [`TypeSignature`] defines the argument types that a function has implementations +/// for. /// -/// The [`Volatility`] defines how the output of the function changes with the input. +/// - [`Volatility`] defines how the output of the function changes with the input. #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] pub struct Signature { /// The data types that the function accepts. See [TypeSignature] for more information.