Skip to content

Commit 791da7f

Browse files
committed
Add additional documentation and examples to ArrayAccessor
1 parent e815d06 commit 791da7f

File tree

2 files changed

+93
-12
lines changed

2 files changed

+93
-12
lines changed

arrow-array/src/array/mod.rs

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -437,13 +437,84 @@ impl<'a, T: Array> Array for &'a T {
437437

438438
/// A generic trait for accessing the values of an [`Array`]
439439
///
440+
/// This trait helps write specialized implementations of algorithms for
441+
/// different array types. Specialized implementations allow the compiler
442+
/// to optimize the code for the specific array type, which can lead to
443+
/// significant performance improvements.
444+
///
445+
/// # Example
446+
/// For example, to write three different implementations of a string length function
447+
/// for [`StringArray`], [`LargeStringArray`], and [`StringViewArray`], you can write
448+
///
449+
/// ```
450+
/// # use std::sync::Arc;
451+
/// # use arrow_array::{ArrayAccessor, ArrayRef, ArrowPrimitiveType, OffsetSizeTrait, PrimitiveArray};
452+
/// # use arrow_buffer::ArrowNativeType;
453+
/// # use arrow_array::cast::AsArray;
454+
/// # use arrow_array::iterator::ArrayIter;
455+
/// # use arrow_array::types::{Int32Type, Int64Type};
456+
/// # use arrow_schema::{ArrowError, DataType};
457+
/// /// This function takes a dynamically typed `ArrayRef` and calls
458+
/// /// one of three specialized implementations
459+
/// fn character_length(arg: ArrayRef) -> Result<ArrayRef, ArrowError> {
460+
/// match arg.data_type() {
461+
/// DataType::Utf8 => {
462+
/// // downcast the ArrayRef to a StringArray and call the specialized implementation
463+
/// let string_array = arg.as_string::<i32>();
464+
/// character_length_general::<Int32Type, _>(string_array)
465+
/// }
466+
/// DataType::LargeUtf8 => {
467+
/// character_length_general::<Int64Type, _>(arg.as_string::<i64>())
468+
/// }
469+
/// DataType::Utf8View => {
470+
/// character_length_general::<Int32Type, _>(arg.as_string_view())
471+
/// }
472+
/// _ => Err(ArrowError::InvalidArgumentError("Unsupported data type".to_string())),
473+
/// }
474+
/// }
475+
///
476+
/// /// A generic implementation of the character_length function
477+
/// /// This function uses the `ArrayAccessor` trait to access the values of the array
478+
/// /// so the compiler can generate specialized implementations for different array types
479+
/// ///
480+
/// /// Returns a new array with the length of each string in the input array
481+
/// /// * Int32Array for Utf8 and Utf8View arrays (lengths are 32-bit integers)
482+
/// /// * Int64Array for LargeUtf8 arrays (lengths are 64-bit integers)
483+
/// ///
484+
/// /// This is generic on the type of the primitive array (different string arrays have
485+
/// /// different length types) and the type of the array accessor (different string arrays
486+
/// /// have different ways to access the values)
487+
/// fn character_length_general<'a, T: ArrowPrimitiveType, V: ArrayAccessor<Item = &'a str>>(
488+
/// array: V,
489+
/// ) -> Result<ArrayRef, ArrowError>
490+
/// where
491+
/// T::Native: OffsetSizeTrait,
492+
/// {
493+
/// let iter = ArrayIter::new(array);
494+
/// // Create an Int32Array / Int64Array with the length of each string
495+
/// let result = iter
496+
/// .map(|string| {
497+
/// string.map(|string: &str| {
498+
/// T::Native::from_usize(string.chars().count())
499+
/// .expect("should not fail as string.chars will always return integer")
500+
/// })
501+
/// })
502+
/// .collect::<PrimitiveArray<T>>();
503+
///
504+
/// // Return the result as a new ArrayRef (dynamically typed)
505+
/// Ok(Arc::new(result) as ArrayRef)
506+
/// }
507+
/// ```
508+
///
440509
/// # Validity
441510
///
442-
/// An [`ArrayAccessor`] must always return a well-defined value for an index that is
443-
/// within the bounds `0..Array::len`, including for null indexes where [`Array::is_null`] is true.
511+
/// An [`ArrayAccessor`] must always return a well-defined value for an index
512+
/// that is within the bounds `0..Array::len`, including for null indexes where
513+
/// [`Array::is_null`] is true.
444514
///
445-
/// The value at null indexes is unspecified, and implementations must not rely on a specific
446-
/// value such as [`Default::default`] being returned, however, it must not be undefined
515+
/// The value at null indexes is unspecified, and implementations must not rely
516+
/// on a specific value such as [`Default::default`] being returned, however, it
517+
/// must not be undefined.
447518
pub trait ArrayAccessor: Array {
448519
/// The Arrow type of the element being accessed.
449520
type Item: Send + Sync;

arrow/src/lib.rs

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,10 @@
4040
//! assert_eq!(array.values(), &[1, 0, 3])
4141
//! ```
4242
//!
43-
//! It is also possible to write generic code. For example, the following is generic over
44-
//! all primitively typed arrays
43+
//! It is also possible to write generic code for different concrete types.
44+
//! For example, since the following function is generic over all primitively
45+
//! typed arrays, when invoked the Rust compiler will generate specialized implementations
46+
//! with optimized code for each concrete type.
4547
//!
4648
//! ```rust
4749
//! # use std::iter::Sum;
@@ -60,7 +62,10 @@
6062
//! assert_eq!(sum(&TimestampNanosecondArray::from(vec![1, 2, 3])), 6);
6163
//! ```
6264
//!
63-
//! And the following is generic over all arrays with comparable values
65+
//! And the following uses [`ArrayAccessor`] to implement a generic function
66+
//! over all arrays with comparable values.
67+
//!
68+
//! [`ArrayAccessor`]: array::ArrayAccessor
6469
//!
6570
//! ```rust
6671
//! # use arrow::array::{ArrayAccessor, ArrayIter, Int32Array, StringArray};
@@ -81,10 +86,11 @@
8186
//!
8287
//! # Type Erasure / Trait Objects
8388
//!
84-
//! It is often the case that code wishes to handle any type of array, without necessarily knowing
85-
//! its concrete type. This use-case is catered for by a combination of [`Array`]
86-
//! and [`DataType`](datatypes::DataType), with the former providing a type-erased container for
87-
//! the array, and the latter identifying the concrete type of array.
89+
//! It is common to write code that handles any type of array, without necessarily
90+
//! knowing its concrete type. This is done using the [`Array`] trait and using
91+
//! [`DataType`] to determine the appropriate `downcast_ref`.
92+
//!
93+
//! [`DataType`]: datatypes::DataType
8894
//!
8995
//! ```rust
9096
//! # use arrow::array::{Array, Float32Array};
@@ -96,14 +102,18 @@
96102
//!
97103
//! fn impl_dyn(array: &dyn Array) {
98104
//! match array.data_type() {
105+
//! // downcast `dyn Array` to concrete `StringArray`
99106
//! DataType::Utf8 => impl_string(array.as_any().downcast_ref().unwrap()),
107+
//! // downcast `dyn Array` to concrete `Float32Array`
100108
//! DataType::Float32 => impl_f32(array.as_any().downcast_ref().unwrap()),
101109
//! _ => unimplemented!()
102110
//! }
103111
//! }
104112
//! ```
105113
//!
106-
//! To facilitate downcasting, the [`AsArray`](crate::array::AsArray) extension trait can be used
114+
//! You can use the [`AsArray`] extension trait to facilitate downcasting:
115+
//!
116+
//! [`AsArray`]: crate::array::AsArray
107117
//!
108118
//! ```rust
109119
//! # use arrow::array::{Array, Float32Array, AsArray};

0 commit comments

Comments
 (0)